Mercurial > codedump
view sanae.site/scrape.py @ 135:0c3cd90e91f7 default tip
*: add sanae.site scraper scripts
| author | Paper <paper@tflc.us> |
|---|---|
| date | Sat, 24 Jan 2026 15:10:05 -0500 |
| parents | |
| children |
line wrap: on
line source
"""Brute-force scraper for short file IDs served under ROOT.

Every ID of ID_LENGTH characters drawn from CHARS is requested in turn.
Any response that is not the site's HTML "Page Not Found" page is saved
to a file named after the ID in the current directory.  After each
request the ID is recorded in FILE so that an interrupted run can resume
where it stopped.
"""

import itertools
import random
import sys
import time

# Root URL
ROOT = "https://donsebagay.mom/"

# This file allows you to start/stop the script at any time, and it
# will pick up right where it left off.
FILE = "donseba-status.txt"

# String containing all possible characters in a file ID
CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"

# Number of characters in a file ID
ID_LENGTH = 3

# Content-Type sent with the "Page Not Found" page (see main()).
NOT_FOUND_CONTENT_TYPE = "text/html; charset=UTF-8"

# Seconds to wait for the server before giving up on a request.
# requests applies no timeout by default, so a stalled connection
# would otherwise hang the script forever.
TIMEOUT = 30


def all_possibilities(l, length=ID_LENGTH):
    """Yield every `length`-tuple of items from `l`, in product order.

    `length` defaults to ID_LENGTH (3), which matches the previous
    hard-coded behaviour; pass another value to enumerate shorter or
    longer IDs.
    """
    yield from itertools.product(l, repeat=length)


def is_file_id(s):
    """Return True if `s` is one of the IDs this script enumerates."""
    return len(s) == ID_LENGTH and all(c in CHARS for c in s)


def read_checkpoint(path=FILE):
    """Return the last ID recorded in `path`, or None if it doesn't exist.

    Surrounding whitespace is stripped so that a trailing newline (for
    example from editing the file by hand) does not stop the ID from
    matching.  Errors other than a missing file are not swallowed: they
    propagate instead of silently restarting the scrape from scratch.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read().strip()
    except FileNotFoundError:
        return None


def remaining_ids(ids, last_done):
    """Yield the items of `ids` that come after `last_done`.

    If `last_done` is None, every item is yielded.  Otherwise everything
    up to and including the first item equal to `last_done` is dropped;
    if no item matches, nothing is yielded.
    """
    ids = iter(ids)
    if last_done is not None:
        for x in ids:
            if x == last_done:
                break
    yield from ids


def main():
    """Run (or resume) the scrape."""
    # Third-party dependency.  Imported here rather than at module level
    # so that the helpers above can be imported (e.g. for testing)
    # without it being installed.
    import requests

    random.seed()

    skip = read_checkpoint()
    if skip is not None and not is_file_id(skip):
        # A checkpoint that is not a valid ID would never match, so
        # every candidate would be skipped and the script would exit
        # having done nothing.  Say so instead.
        sys.exit(
            f"{FILE}: {skip!r} is not a valid file ID; "
            "fix or delete the file and run again"
        )

    s = requests.Session()

    ids = ("".join(t) for t in all_possibilities(CHARS))
    for x in remaining_ids(ids, skip):
        r = s.get(ROOT + x + "?raw=1&download=1", timeout=TIMEOUT)
        print(x, r)
        # For some reason, vhoda has a "Page Not Found" page
        # that doesn't return a 404 status code, so we have to
        # detect it through 'Content-Type' instead.  .get() is used
        # so a response with no Content-Type header is saved rather
        # than crashing the run with a KeyError.
        if r.headers.get("Content-Type") != NOT_FOUND_CONTENT_TYPE:
            with open(x, "wb") as f:
                f.write(r.content)
        # Record progress only after the ID has been fully handled.
        # Written with the same encoding read_checkpoint() uses.
        with open(FILE, "w", encoding="utf-8") as f:
            f.write(x)
        # Random delay of up to five seconds between requests.
        time.sleep(random.random() * 5)


if __name__ == "__main__":
    main()
