codedump: sanae.site/guess.py comparison

comparison sanae.site/guess.py @ 135:0c3cd90e91f7 default tip

*: add sanae.site scraper scripts

author	Paper <paper@tflc.us>
date	Sat, 24 Jan 2026 15:10:05 -0500
parents
children

comparison

equal deleted inserted replaced

-:c27afe8ead5f
+:0c3cd90e91f7
# Back-fill metadata for files that were already downloaded from ROOT.
#
# For every file in the current directory whose name looks like a bare
# three-character ID, this script:
#   1. scrapes the file's page for uploader, upload date, visibility and
#      an optional YouTube link,
#   2. issues a HEAD request for the raw download to read the original
#      filename from the content-disposition header,
#   3. renames the local file to "<ID> - <original filename>" when a
#      filename was found, and
#   4. writes the collected metadata next to it as "<name>.json".
import datetime
import json
import os
import re
import time

import lxml
import lxml.html
import requests
from lxml.html.clean import Cleaner

ROOT = "https://donsebagay.mom/"

# Global requests session
SESSION = requests.Session()

# Seconds to wait for the server before giving up on a request; without a
# timeout a single stalled connection would hang the whole run.
TIMEOUT = 30

# Seconds to pause between files, to not overload the servers
# and possibly get bot-killed.
DELAY_SECONDS = 2

# Raw strings for regex
FILEPATTERN = re.compile(r"^[A-Z0-9\-_]{3}$")
USERPATTERN = re.compile(r"<small class=\"text-white fw-semibold\">(.+?)</small>")
# NOTE: This keeps the quotes intact; you must remove them yourself with strip()
DISPOSITIONPATTERN = re.compile(r"^attachment;?(?:\s+)filename=(.+?)$")
DATEPATTERN = re.compile(r"<i class=\"bi bi-calendar\"></i> Uploaded: <strong>(.+?)</strong>")
UNLISTEDPATTERN = re.compile(r"<i class=\"bi bi-eye-slash me-1\"></i>Unlisted")
YOUTUBEPATTERN = re.compile(r"<a href=\"https://www\.youtube\.com/watch\?v=([A-Za-z0-9_\-]{11})\"")

CLEANER = Cleaner()
# Remove JS and CSS; we don't want them
CLEANER.javascript = True
CLEANER.style = True


def _parse_page(text):
    """Extract metadata from the cleaned HTML text of a file's page.

    Returns a dict that always contains "visibility" ("public" unless the
    unlisted marker is present) and, when the corresponding markup is
    found, "username", "date" (ISO 8601 string) and "yturl".
    """
    found = {}

    usermatch = USERPATTERN.search(text)
    if usermatch:
        found["username"] = usermatch.group(1)

    datematch = DATEPATTERN.search(text)
    if datematch:
        try:
            uploaded = datetime.datetime.strptime(datematch.group(1), "%d/%m/%Y %H:%M")
        except ValueError as e:
            # An unexpected date format must not stop us from picking up
            # the remaining fields below.
            print(e)
        else:
            found["date"] = uploaded.isoformat()

    if UNLISTEDPATTERN.search(text):
        found["visibility"] = "unlisted"

    yturlmatch = YOUTUBEPATTERN.search(text)
    if yturlmatch:
        found["yturl"] = "https://www.youtube.com/watch?v=" + yturlmatch.group(1)

    return found


def _fetch_page_metadata(file_id):
    """Download the page for *file_id*, strip JS/CSS and parse its metadata."""
    page = SESSION.get(ROOT + file_id, timeout=TIMEOUT)
    document = lxml.html.document_fromstring(page.text)
    text = lxml.html.tostring(CLEANER.clean_html(document)).decode('utf-8')
    return _parse_page(text)


def _fetch_original_filename(file_id):
    """Return the filename from the download's content-disposition header.

    Returns None when the header is missing or does not match
    DISPOSITIONPATTERN. Surrounding double quotes are stripped.
    """
    head = SESSION.head(ROOT + file_id + "?raw=1&download=1", timeout=TIMEOUT)
    hdrs = head.headers
    if 'content-disposition' not in hdrs:
        return None
    # Filenames are in UTF-8, but HTTP headers are supposed to be latin-1.
    # So encode it and re-decode it as proper UTF-8
    disposition = hdrs['content-disposition'].encode('iso-8859-1').decode('utf-8')
    m = DISPOSITIONPATTERN.match(disposition)
    if not m:
        return None
    return m.group(1).strip("\"")


def _safe_component(filename):
    """Make a server-supplied filename usable as a single path component.

    The name comes straight from an HTTP header, so path separators (and
    NUL) are replaced to keep os.rename() from moving the file elsewhere.
    """
    return re.sub(r"[/\\\x00]", "_", filename)


def main():
    """Process every three-character-ID file in the current directory."""
    # Filter all of the files in the current directory
    for i in [x for x in os.listdir() if FILEPATTERN.match(x)]:
        # I should have done this before but I was lazy and I really
        # just wanted to get everything downloaded in time.
        #
        # What is important now is saving all of the metadata, such as
        # the original filename, upload date, and uploader.
        # Sometimes the uploader is simply "Anonymous"; it is currently
        # recorded as-is (TODO: decide whether to throw it out).
        # The upload date is scraped from the page and the original
        # filename is taken from the content-disposition header.
        # TODO: in case either of these is unavailable, guess the
        # extension by the contents.
        print(i)

        # JSON to write; the below lines will add metadata to this.
        # Assume public visibility until the page says otherwise.
        info = {"visibility": "public"}

        # Best effort: a failure here is reported but must not prevent the
        # header lookup and the JSON from being written.
        try:
            info.update(_fetch_page_metadata(i))
        except Exception as e:
            print(e)

        # Now request headers for this; it contains everything we need
        try:
            filename = _fetch_original_filename(i)
        except Exception as e:
            print(e)
            filename = None
        if filename:
            # Keep the name exactly as the server reported it.
            info["filename"] = filename

        if "filename" in info:
            n = i + " - " + _safe_component(info["filename"])
            try:
                os.rename(i, n)
            except OSError as e:
                # Keep the original name; the JSON is then written as
                # "<ID>.json" instead.
                print(e)
            else:
                i = n

        with open(i + ".json", "w", encoding="utf-8") as f:
            json.dump(info, f)

        # Sleep for 2 seconds to not overload the servers
        # and possibly get bot-killed
        time.sleep(DELAY_SECONDS)


if __name__ == "__main__":
    main()

Mercurial > codedump

comparison sanae.site/guess.py @ 135:0c3cd90e91f7 default tip