sanae.site/scrape.py @ 135:0c3cd90e91f7 (default, tip)

*: add sanae.site scraper scripts
author Paper <paper@tflc.us>
date Sat, 24 Jan 2026 15:10:05 -0500

import requests
import itertools
import time
import random

random.seed()

# Root URL
ROOT = "https://donsebagay.mom/"
# This file allows you to start/stop the script at any time, and it
# will pick up right where it left off.
FILE = "donseba-status.txt"
# String containing all possible characters in a file ID
CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"

def all_possibilities(chars):
	# maybe we need products of 2 as well...
	yield from itertools.product(*([chars] * 3))

s = requests.Session()

try:
	with open(FILE, "r", encoding="utf-8") as f:
		skip = f.read()
except FileNotFoundError:
	skip = None

for x in all_possibilities(CHARS):
	x = ''.join(x)
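	# Resume support: skip every ID up to and including the one recorded
	# in the status file, then start requesting again from the next one.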
	if skip is not None:
		if x == skip:
			skip = None
		continue
	r = s.get(ROOT + x + "?raw=1&download=1")
	print(x, r)
	# For some reason, vhoda has a "Page Not Found" page
	# that doesn't return a 404 status code, so we have to
	# detect it through 'Content-Type' instead
	if r.headers["Content-Type"] != "text/html; charset=UTF-8":
		with open(x, "wb") as f:
			f.write(r.content)
	with open(FILE, "w") as f:
		f.write(x)
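	# Pause a random 0-5 seconds between requests.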
	time.sleep(random.random() * 5)