comparison sanae.site/scrape.py @ 135:0c3cd90e91f7 default tip
*: add sanae.site scraper scripts
| author | Paper <paper@tflc.us> |
|---|---|
| date | Sat, 24 Jan 2026 15:10:05 -0500 |
| parents | |
| children | |
Comparison of 134:c27afe8ead5f with 135:0c3cd90e91f7 (the file is new in this revision):

```python
import requests
import itertools
import time
import random

random.seed()

# Root URL
ROOT = "https://donsebagay.mom/"
# This file allows you to start/stop the script at any time, and it
# will pick up right where it left off.
FILE = "donseba-status.txt"
# String containing all possible characters in a file ID
CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"

def all_possibilities(l):
    # Generate every 3-character ID; maybe we need products of 2 as well...
    yield from itertools.product(*([l] * 3))

s = requests.Session()

# Resume from the last ID recorded in the status file, if it exists.
try:
    with open(FILE, "r", encoding="utf-8") as f:
        skip = f.read()
except FileNotFoundError:
    skip = None

for x in all_possibilities(CHARS):
    x = ''.join(x)
    # Skip everything up to and including the last ID already tried.
    if skip is not None:
        if x == skip:
            skip = None
        continue
    r = s.get(ROOT + x + "?raw=1&download=1")
    print(x, r)
    # For some reason, vhoda has a "Page Not Found" page
    # that doesn't return a 404 status code, so we have to
    # detect it through 'Content-Type' instead
    if r.headers["Content-Type"] != "text/html; charset=UTF-8":
        with open(x, "wb") as f:
            f.write(r.content)
    # Record progress so a restart picks up where this run left off.
    with open(FILE, "w", encoding="utf-8") as f:
        f.write(x)
    time.sleep(random.random() * 5)
```
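The in-code comment ("maybe we need products of 2 as well...") leaves shorter IDs an open question. Below is a minimal sketch of how `all_possibilities` could be extended to cover both lengths; the `lengths` parameter and the variant itself are hypothetical and not part of the committed script, and whether 2-character IDs exist on the site is only speculated, not confirmed.

```python
import itertools

CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"

def all_possibilities(chars, lengths=(2, 3)):
    # Hypothetical variant: yield every ID of each length in `lengths`,
    # shortest first. The 2-character case is an assumption taken from
    # the original comment, not observed behavior of the site.
    for n in lengths:
        yield from itertools.product(*([chars] * n))

# Example: the first few candidates, joined the same way the main loop does.
print([''.join(t) for t in itertools.islice(all_possibilities(CHARS), 5)])
# ['AA', 'AB', 'AC', 'AD', 'AE']
```

If shorter IDs were added this way, the saved status value alone would no longer say which length the script stopped at, so the resume logic would need to track the length as well.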
