comparison sanae.site/scrape.py @ 135:0c3cd90e91f7 default tip
*: add sanae.site scraper scripts
| author | Paper <paper@tflc.us> |
|---|---|
| date | Sat, 24 Jan 2026 15:10:05 -0500 |
| parents | |
| children | |
Comparison of 134:c27afe8ead5f with 135:0c3cd90e91f7 (the file is new in this revision):

```python
import requests
import itertools
import time
import random

random.seed()

# Root URL
ROOT = "https://donsebagay.mom/"
# This file allows you to start/stop the script at any time, and it
# will pick up right where it left off.
FILE = "donseba-status.txt"
# String containing all possible characters in a file ID
CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"

def all_possibilities(l):
    # Generate every 3-character ID; maybe we need products of 2 as well...
    yield from itertools.product(*([l] * 3))

s = requests.Session()

# Resume from the last ID recorded in the status file, if it exists.
try:
    with open(FILE, "r", encoding="utf-8") as f:
        skip = f.read()
except FileNotFoundError:
    skip = None

for x in all_possibilities(CHARS):
    x = ''.join(x)
    # Skip everything up to and including the last ID already tried.
    if skip is not None:
        if x == skip:
            skip = None
        continue
    r = s.get(ROOT + x + "?raw=1&download=1")
    print(x, r)
    # For some reason, vhoda has a "Page Not Found" page
    # that doesn't return a 404 status code, so we have to
    # detect it through 'Content-Type' instead
    if r.headers["Content-Type"] != "text/html; charset=UTF-8":
        with open(x, "wb") as f:
            f.write(r.content)
    # Record progress so a restart picks up where this run left off.
    with open(FILE, "w", encoding="utf-8") as f:
        f.write(x)
    time.sleep(random.random() * 5)
```
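The in-code comment ("maybe we need products of 2 as well...") leaves shorter IDs an open question. Below is a minimal sketch of how `all_possibilities` could be extended to cover both lengths; the `lengths` parameter and the variant itself are hypothetical and not part of the committed script, and whether 2-character IDs exist on the site is only speculated, not confirmed.

```python
import itertools

CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"

def all_possibilities(chars, lengths=(2, 3)):
    # Hypothetical variant: yield every ID of each length in `lengths`,
    # shortest first. The 2-character case is an assumption taken from
    # the original comment, not observed behavior of the site.
    for n in lengths:
        yield from itertools.product(*([chars] * n))

# Example: the first few candidates, joined the same way the main loop does.
print([''.join(t) for t in itertools.islice(all_possibilities(CHARS), 5)])
# ['AA', 'AB', 'AC', 'AD', 'AE']
```

If shorter IDs were added this way, the saved status value alone would no longer say which length the script stopped at, so the resume logic would need to track the length as well.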
