Mercurial > codedump

diff sanae.site/guess.py @ 135:0c3cd90e91f7 default tip
*: add sanae.site scraper scripts
author: Paper <paper@tflc.us>
date: Sat, 24 Jan 2026 15:10:05 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sanae.site/guess.py	Sat Jan 24 15:10:05 2026 -0500
@@ -0,0 +1,116 @@
+import requests
+import re
+import os
+import time
+import datetime
+import json
+import lxml
+
+from lxml.html.clean import Cleaner
+
+ROOT = "https://donsebagay.mom/"
+
+# Global requests session, reused for every request this script makes
+SESSION = requests.Session()
+
+# Regexes, as raw strings: FILEPATTERN is for local file names, DISPOSITIONPATTERN for the content-disposition header, the rest for page HTML
+FILEPATTERN = re.compile(r"^[A-Z0-9\-_]{3}$")
+USERPATTERN = re.compile(r"<small class=\"text-white fw-semibold\">(.+?)</small>")
+# NOTE: This keeps the quotes intact; you must remove them yourself with strip()
+DISPOSITIONPATTERN = re.compile(r"^attachment;?(?:\s+)filename=(.+?)$")
+DATEPATTERN = re.compile(r"<i class=\"bi bi-calendar\"></i> Uploaded: <strong>(.+?)</strong>")
+UNLISTEDPATTERN = re.compile(r"<i class=\"bi bi-eye-slash me-1\"></i>Unlisted")
+YOUTUBEPATTERN = re.compile(r"<a href=\"https://www\.youtube\.com/watch\?v=([A-Za-z0-9_\-]{11})\"")
+
+CLEANER = Cleaner()
+# Remove JS and CSS; we don't want them in the HTML the regexes above are run on
+CLEANER.javascript = True
+CLEANER.style = True
+
+# Filter all of the files in the current directory
+for i in filter(lambda x: bool(re.match(FILEPATTERN, x)), os.listdir()):
+    # I should have done this before but I was lazy and I really
+    # just wanted to get everything downloaded in time.
+    #
+    # What is important now is saving all of the metadata, such as
+    # the original filename, upload date, and uploader.
+    # Sometimes the uploader is simply "Anonymous" and in this case
+    # we should probably just throw it out.
+    # The original filename and date are likely always available.
+    # The date is stored in an asctime()-like format in the "date"
+    # header. The original filename is in the content-disposition
+    # header. In case either of these two are available, we should
+    # guess the extension by the contents.
+
+    print(i)
+
+    # JSON to write; the below lines will add metadata to this
+    info = dict()
+
+    # assume public visibility
+    info["visibility"] = "public"
+
+    try:
+        page = SESSION.get(ROOT + i)
+        text = lxml.html.tostring(CLEANER.clean_html(lxml.html.document_fromstring(page.text))).decode('utf-8')
+
+        if True:
+            # basic ass regex
+            usermatch = re.search(USERPATTERN, text)
+
+            if usermatch:
+                info["username"] = usermatch.group(1)
+
+        if True:
+            # basic regex part 2
+            datematch = re.search(DATEPATTERN, text)
+
+            if datematch:
+                d = datetime.datetime.strptime(datematch.group(1), "%d/%m/%Y %H:%M")
+                info["date"] = d.isoformat()
+
+        if True:
+            unlistmatch = re.search(UNLISTEDPATTERN, text)
+            if unlistmatch:
+                info["visibility"] = "unlisted"
+
+        if True:
+            yturlmatch = re.search(YOUTUBEPATTERN, text)
+            if yturlmatch:
+                info["yturl"] = "https://www.youtube.com/watch?v=" + yturlmatch.group(1)
+
+    except Exception as e:
+        print(e)
+        pass
+
+    # Now request headers for this; it contains everything we need
+    try:
+        head = SESSION.head(ROOT + i + "?raw=1&download=1")
+
+        hdrs = head.headers
+
+        if 'content-disposition' in hdrs:
+            # Filenames are in UTF-8, but HTTP headers are supposed to be latin-1.
+            # So encode it and re-decode it as proper UTF-8
+            m = re.match(DISPOSITIONPATTERN, hdrs['content-disposition'].encode('iso-8859-1').decode('utf-8'))
+            if m:
+                info["filename"] = m.group(1).strip("\"")
+    except Exception as e:
+        print(e)
+        pass
+
+    if True:
+        if "filename" in info:
+            n = i + " - " + info["filename"]
+            try:
+                os.rename(i, n)
+                i = n
+            except:
+                pass
+
+        with open(i + ".json", "w", encoding="utf-8") as f:
+            json.dump(info, f)
+
+    # Sleep for 1 second to not overload the servers
+    # and possibly get bot-killed
+    time.sleep(2)
author	Paper <paper@tflc.us>
date	Sat, 24 Jan 2026 15:10:05 -0500
parents
children