Mercurial > codedump
comparison sanae.site/guess.py @ 135:0c3cd90e91f7 default tip
*: add sanae.site scraper scripts
| author | Paper <paper@tflc.us> |
|---|---|
| date | Sat, 24 Jan 2026 15:10:05 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 134:c27afe8ead5f | 135:0c3cd90e91f7 |
|---|---|
| 1 import requests | |
| 2 import re | |
| 3 import os | |
| 4 import time | |
| 5 import datetime | |
| 6 import json | |
| 7 import lxml | |
| 8 | |
| 9 from lxml.html.clean import Cleaner | |
| 10 | |
# Base URL of the site being scraped; file IDs are appended to it
ROOT = "https://donsebagay.mom/"

# A single requests session, reused for every request made by this script
SESSION = requests.Session()

# All patterns are raw strings so the backslashes reach the regex engine as-is.

# Names of downloaded files: exactly three characters from A-Z, 0-9, "-" or "_"
FILEPATTERN = re.compile(r"^[A-Z0-9\-_]{3}$")

# Uploader name as shown on the file page
USERPATTERN = re.compile(r"<small class=\"text-white fw-semibold\">(.+?)</small>")

# Original filename from the content-disposition header.
# NOTE: the captured group keeps any surrounding quotes intact; the caller
# must remove them with strip()
DISPOSITIONPATTERN = re.compile(r"^attachment;?(?:\s+)filename=(.+?)$")

# Upload date as shown on the file page
DATEPATTERN = re.compile(r"<i class=\"bi bi-calendar\"></i> Uploaded: <strong>(.+?)</strong>")

# Marker that appears on the page of an unlisted file
UNLISTEDPATTERN = re.compile(r"<i class=\"bi bi-eye-slash me-1\"></i>Unlisted")

# Link to a YouTube video; the group captures the 11-character video ID
YOUTUBEPATTERN = re.compile(r"<a href=\"https://www\.youtube\.com/watch\?v=([A-Za-z0-9_\-]{11})\"")

# HTML cleaner applied to every page before the patterns above are run.
# JS and CSS are not wanted, so both are stripped. The flags are set on the
# instance after construction rather than passed to the constructor.
CLEANER = Cleaner()
CLEANER.javascript = CLEANER.style = True
| 29 | |
# Seconds to wait on each HTTP request before giving up; without a timeout a
# stalled connection would hang the whole run.
REQUEST_TIMEOUT = 30

# Process every file in the current directory whose name matches FILEPATTERN.
# os.listdir() returns a complete list up front, so renaming files inside the
# loop does not disturb the iteration.
for name in os.listdir():
    if not FILEPATTERN.match(name):
        continue

    # I should have done this before but I was lazy and I really
    # just wanted to get everything downloaded in time.
    #
    # What is important now is saving all of the metadata, such as
    # the original filename, upload date, and uploader.
    # Sometimes the uploader is simply "Anonymous" and in this case
    # we should probably just throw it out (not done here; whatever
    # name the page shows is stored).
    # The original filename and date are likely always available.
    # The upload date is scraped from the file page and the original
    # filename is taken from the content-disposition header. In case
    # either of these two are unavailable, we should guess the
    # extension by the contents.

    print(name)

    # JSON to write; the lines below add metadata to this.
    # Public visibility is assumed until the page says otherwise.
    info = {"visibility": "public"}

    # Fetch the file page and strip JS/CSS from it. This is best-effort:
    # on any failure the error is printed and the page-derived fields
    # are simply left out.
    try:
        page = SESSION.get(ROOT + name, timeout=REQUEST_TIMEOUT)
        text = lxml.html.tostring(
            CLEANER.clean_html(lxml.html.document_fromstring(page.text))
        ).decode('utf-8')
    except Exception as e:
        print(e)
        text = ""

    # Uploader name
    usermatch = USERPATTERN.search(text)
    if usermatch:
        info["username"] = usermatch.group(1)

    # Upload date. A malformed date only drops the "date" field; it must
    # not prevent the visibility and YouTube checks below from running.
    datematch = DATEPATTERN.search(text)
    if datematch:
        try:
            uploaded = datetime.datetime.strptime(datematch.group(1), "%d/%m/%Y %H:%M")
        except ValueError as e:
            print(e)
        else:
            info["date"] = uploaded.isoformat()

    # Visibility
    if UNLISTEDPATTERN.search(text):
        info["visibility"] = "unlisted"

    # Linked YouTube video, if any
    yturlmatch = YOUTUBEPATTERN.search(text)
    if yturlmatch:
        info["yturl"] = "https://www.youtube.com/watch?v=" + yturlmatch.group(1)

    # Now request the headers of the raw download; the original filename
    # is carried in content-disposition.
    try:
        head = SESSION.head(ROOT + name + "?raw=1&download=1", timeout=REQUEST_TIMEOUT)
        disposition = head.headers.get('content-disposition')
        if disposition is not None:
            # Filenames are in UTF-8, but HTTP headers are supposed to be latin-1.
            # So encode it and re-decode it as proper UTF-8
            m = DISPOSITIONPATTERN.match(disposition.encode('iso-8859-1').decode('utf-8'))
            if m:
                info["filename"] = m.group(1).strip("\"")
    except Exception as e:
        print(e)

    # Rename the local file to "<id> - <original filename>". If the rename
    # fails the file keeps its current name, and the JSON is written next
    # to whichever name the file ends up with.
    path = name
    if "filename" in info:
        # The filename comes from the server; replace path separators so
        # it cannot point into another directory (which would make the
        # rename fail). The unmodified name is still stored in the JSON.
        safe = info["filename"]
        for sep in (os.sep, os.altsep):
            if sep:
                safe = safe.replace(sep, "_")

        renamed = name + " - " + safe
        try:
            os.rename(name, renamed)
        except (OSError, ValueError) as e:
            # Report the failure instead of silently swallowing it
            print(e)
        else:
            path = renamed

    with open(path + ".json", "w", encoding="utf-8") as f:
        json.dump(info, f)

    # Sleep for 2 seconds to not overload the servers
    # and possibly get bot-killed
    time.sleep(2)
