diff sanae.site/scrape.py @ 135:0c3cd90e91f7 default tip

*: add sanae.site scraper scripts
author Paper <paper@tflc.us>
date Sat, 24 Jan 2026 15:10:05 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sanae.site/scrape.py	Sat Jan 24 15:10:05 2026 -0500
@@ -0,0 +1,49 @@
+import requests
+import itertools
+import time
+import random
+
+random.seed()
+
+# Root URL
+ROOT = "https://donsebagay.mom/"
+# This file allows you to start/stop the script at any time, and it
+# will pick up right where it left off.
+FILE = "donseba-status.txt"
+# String containing all possible characters in a file ID
+CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"
+
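+# Lazily yields every three-character ID that can be built from the given alphabet.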
+def all_possibilities(l):
+	# maybe we need products of 2 as well...
+	yield from itertools.product(*([l] * 3))
+
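+# A single Session reuses the underlying connection across requests.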
+s = requests.Session()
+
+try:
+	with open(FILE, "r", encoding="utf-8") as f:
+		skip = f.read()
+except FileNotFoundError:
+	skip = None
+
+for x in all_possibilities(CHARS):
+	x = ''.join(x)
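+	# Fast-forward past IDs already tried; resume with the one after the recorded ID.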
+	if skip is not None:
+		if x == skip:
+			skip = None
+		continue
+	r = s.get(ROOT + x + "?raw=1&download=1")
+	print(x, r)
+	# For some reason, vhoda has a "Page Not Found" page
+	# that doesn't return a 404 status code, so we have to
+	# detect it through 'Content-Type' instead
+	if r.headers.get("Content-Type") != "text/html; charset=UTF-8":
+		with open(x, "wb") as f:
+			f.write(r.content)
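+	# Record the last ID attempted so an interrupted run resumes here.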
+	with open(FILE, "w", encoding="utf-8") as f:
+		f.write(x)
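+	# Sleep a random 0-5 seconds between requests to go easy on the host.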
+	time.sleep(random.random() * 5)