Mercurial > codedump
diff sanae.site/scrape.py @ 135:0c3cd90e91f7 default tip
*: add sanae.site scraper scripts
| author | Paper <paper@tflc.us> |
|---|---|
| date | Sat, 24 Jan 2026 15:10:05 -0500 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sanae.site/scrape.py Sat Jan 24 15:10:05 2026 -0500 @@ -0,0 +1,45 @@ +import requests +import itertools +import time +import random + +random.seed() + +# Root URL +ROOT = "https://donsebagay.mom/" +# This file allows you to start/stop the script at any time, and it +# will pick up right where it left off. +FILE = "donseba-status.txt" +# String containing all possible characters in a file ID +CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-" + +def all_possibilities(l): + # maybe we need products of 2 as well... + yield from itertools.product(*([l] * 3)) + +s = requests.Session() + +try: + with open(FILE, "r", encoding="utf-8") as f: + skip = f.read() +except: + skip = None + pass + +for x in all_possibilities(CHARS): + x = ''.join(x) + if skip is not None: + if x == skip: + skip = None + continue + r = s.get(ROOT + x + "?raw=1&download=1") + print(x, r) + # For some reason, vhoda has a "Page Not Found" page + # that doesn't return a 404 status code, so we have to + # detect it through 'Content-Type' instead + if r.headers["Content-Type"] != "text/html; charset=UTF-8": + with open(x, "wb") as f: + f.write(r.content) + with open(FILE, "w") as f: + f.write(x) + time.sleep(random.random() * 5)
