annotate sanae.site/scrape.py @ 135:0c3cd90e91f7 default tip

*: add sanae.site scraper scripts
author Paper <paper@tflc.us>
date Sat, 24 Jan 2026 15:10:05 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
135
0c3cd90e91f7 *: add sanae.site scraper scripts
Paper <paper@tflc.us>
parents:
diff changeset
1 import requests
0c3cd90e91f7 *: add sanae.site scraper scripts
Paper <paper@tflc.us>
parents:
diff changeset
2 import itertools
0c3cd90e91f7 *: add sanae.site scraper scripts
Paper <paper@tflc.us>
parents:
diff changeset
3 import time
0c3cd90e91f7 *: add sanae.site scraper scripts
Paper <paper@tflc.us>
parents:
diff changeset
4 import random
0c3cd90e91f7 *: add sanae.site scraper scripts
Paper <paper@tflc.us>
parents:
diff changeset
5
0c3cd90e91f7 *: add sanae.site scraper scripts
Paper <paper@tflc.us>
parents:
diff changeset
# Seed the RNG that is used for the randomized delay between requests.
random.seed()

# Root URL; each candidate file ID is appended to it to form the request URL.
ROOT = "https://donsebagay.mom/"
# This file allows you to start/stop the script at any time, and it
# will pick up right where it left off. It holds the last file ID that
# was processed.
FILE = "donseba-status.txt"
# String containing all possible characters in a file ID
CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"
0c3cd90e91f7 *: add sanae.site scraper scripts
Paper <paper@tflc.us>
parents:
diff changeset
15
0c3cd90e91f7 *: add sanae.site scraper scripts
Paper <paper@tflc.us>
parents:
diff changeset
def all_possibilities(l, length=3):
    """Yield every candidate ID of `length` symbols drawn from `l`.

    Candidates are produced lazily, in the deterministic order of
    `itertools.product`, so a caller can rely on the ordering between runs.

    Args:
        l: Iterable of the symbols an ID may contain (e.g. a string).
        length: Number of symbols per candidate. Defaults to 3; pass 2 to
            enumerate shorter IDs (maybe we need products of 2 as well...).

    Yields:
        Tuples of `length` symbols each.
    """
    yield from itertools.product(l, repeat=length)
0c3cd90e91f7 *: add sanae.site scraper scripts
Paper <paper@tflc.us>
parents:
diff changeset
19
0c3cd90e91f7 *: add sanae.site scraper scripts
Paper <paper@tflc.us>
parents:
diff changeset
# Main scrape loop: walk every candidate ID in order, request it from ROOT,
# save anything that is not the site's HTML "not found" page into a local
# file named after the ID, and record progress in FILE after each ID.

# One session for the whole run so that it is shared by every request.
s = requests.Session()

# Load the resume point (the last ID processed by a previous run), if any.
try:
    with open(FILE, "r", encoding="utf-8") as f:
        # Strip surrounding whitespace so that a trailing newline cannot stop
        # the resume point from ever matching; an empty file means there is
        # nothing to resume from.
        skip = f.read().strip() or None
except FileNotFoundError:
    # No status file yet: start from the beginning.
    skip = None

for x in all_possibilities(CHARS):
    x = ''.join(x)
    if skip is not None:
        # Fast-forward through everything up to and including the resume
        # point; the ID stored in FILE was already processed.
        if x == skip:
            skip = None
        continue
    # A timeout keeps a stalled connection from hanging the script forever.
    r = s.get(ROOT + x + "?raw=1&download=1", timeout=30)
    print(x, r)
    # For some reason, vhoda has a "Page Not Found" page
    # that doesn't return a 404 status code, so we have to
    # detect it through 'Content-Type' instead
    # (.get() so that a response without the header does not raise KeyError)
    if r.headers.get("Content-Type") != "text/html; charset=UTF-8":
        with open(x, "wb") as f:
            f.write(r.content)
    # Record progress with the same encoding that is used to read it back.
    with open(FILE, "w", encoding="utf-8") as f:
        f.write(x)
    # Random delay of up to five seconds between requests.
    time.sleep(random.random() * 5)

if skip is not None:
    # The stored resume point never matched any candidate, so every ID was
    # skipped. Say so instead of exiting silently.
    print("resume point", repr(skip), "from", FILE, "was never reached")