changeset 135:0c3cd90e91f7 default tip

*: add sanae.site scraper scripts
author Paper <paper@tflc.us>
date Sat, 24 Jan 2026 15:10:05 -0500
parents c27afe8ead5f
children
files sanae.site/README sanae.site/guess.py sanae.site/scrape.py
diffstat 3 files changed, 192 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sanae.site/README	Sat Jan 24 15:10:05 2026 -0500
@@ -0,0 +1,31 @@
+These two scripts were used to scrape files and metadata off sanae.site, a
+short-lived file-upload service run by vhoda. He completely killed off access
+via the sanae.site domain, but the service was still reachable through
+donsebagay.mom, which is what the ROOT variables point to.
+
+The first script, "scrape.py", was used just to download all of the files;
+those were the most important things to save, after all. After that, I wrote
+"guess.py", which scraped all of the metadata (as the filename implies, it
+was originally going to "guess" the file extension for each file, but the
+servers were still up, so I just scraped the metadata instead).
+
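+Both scripts operate on the current directory, so a typical run (assuming
+Python 3) is simply:
+	python3 scrape.py
+	python3 guess.py
+run from wherever you want the downloaded files to end up.
+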
+The "guess.py" script requires the lxml package, which you will probably
+already have installed. This is only used to strip off <script> and <style>
+tags from the file.
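+The cleaning step is roughly the following (html_doc here stands for an
+already-parsed lxml document; the real script does the same via a shared
+Cleaner instance):
+	from lxml.html.clean import Cleaner
+	cleaned = Cleaner(javascript=True, style=True).clean_html(html_doc)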
+
+The resulting files from these scripts should be of the format:
+	"[id] - [filename w/ extension]"
+	"[id] - [filename w/ extension].json"
+
+The latter is a JSON object that may or may not contain any of the
+following fields:
+	"filename"   -- the original filename, taken from the HTTP
+	                'Content-Disposition' header
+	"date"       -- the date and time of upload, in ISO format
+	"visibility" -- "public" if accessible from a user's page, "unlisted"
+	                if not. Private files cannot be accessed, as this
+	                script has no login details or cookies.
+	"yturl"      -- the original YouTube URL, if this is a YouTube download
+	"username"   -- the username of the uploader, including the "!"
+	                prefix. This will be "Anonymous" if that is what the
+	                website reported.
+	                For some files, e.g. FLAC, this is not available :(
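+
+For example, a file with ID "AB3" could end up on disk as "AB3 - clip.mp4"
+plus "AB3 - clip.mp4.json" containing something like (values made up):
+	{"visibility": "unlisted", "username": "!someuser",
+	 "date": "2026-01-24T15:10:00",
+	 "yturl": "https://www.youtube.com/watch?v=XXXXXXXXXXX",
+	 "filename": "clip.mp4"}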
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sanae.site/guess.py	Sat Jan 24 15:10:05 2026 -0500
@@ -0,0 +1,116 @@
+import requests
+import re
+import os
+import time
+import datetime
+import json
+import lxml.html
+
+from lxml.html.clean import Cleaner
+
+ROOT = "https://donsebagay.mom/"
+
+# Global requests session
+SESSION = requests.Session()
+
+# File IDs are exactly three characters from A-Z, 0-9, "_" and "-"
+FILEPATTERN = re.compile(r"^[A-Z0-9\-_]{3}$")
+USERPATTERN = re.compile(r"<small class=\"text-white fw-semibold\">(.+?)</small>")
+# NOTE: This keeps the quotes intact; you must remove them yourself with strip('"')
+DISPOSITIONPATTERN = re.compile(r"^attachment;?(?:\s+)filename=(.+?)$")
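+# e.g. 'attachment; filename="clip.mp4"' -> group(1) == '"clip.mp4"'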
+DATEPATTERN = re.compile(r"<i class=\"bi bi-calendar\"></i> Uploaded: <strong>(.+?)</strong>")
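+# The captured date looks like "24/01/2026 15:10" (DD/MM/YYYY HH:MM)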
+UNLISTEDPATTERN = re.compile(r"<i class=\"bi bi-eye-slash me-1\"></i>Unlisted")
+YOUTUBEPATTERN = re.compile(r"<a href=\"https://www\.youtube\.com/watch\?v=([A-Za-z0-9_\-]{11})\"")
+
+CLEANER = Cleaner()
+# Remove JS and CSS; we don't want them
+CLEANER.javascript = True
+CLEANER.style = True
+
+# Go through everything in the current directory whose name looks like a file ID
+for i in filter(lambda x: bool(re.match(FILEPATTERN, x)), os.listdir()):
+    # I should have done this before, but I was lazy and really
+    # just wanted to get everything downloaded in time.
+    #
+    # What is important now is saving all of the metadata, such as
+    # the original filename, upload date, and uploader.
+    # Sometimes the uploader is simply "Anonymous", in which case
+    # we should probably just throw it out.
+    # The original filename and upload date are likely always
+    # available: the date is shown on the file's page, and the
+    # original filename is in the Content-Disposition header of the
+    # raw download. Only if neither were available would we have to
+    # guess the extension from the file contents (hence the name).
+
+    print(i)
+
+    # JSON to write; the below lines will add metadata to this
+    info = dict()
+
+    # assume public visibility
+    info["visibility"] = "public"
+
+    try:
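+        # Fetch the file's page and strip <script>/<style> so the regexes
+        # below don't match inside script or style blocks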
+        page = SESSION.get(ROOT + i)
+        text = lxml.html.tostring(CLEANER.clean_html(lxml.html.document_fromstring(page.text))).decode('utf-8')
+
+        # Uploader shown on the page
+        usermatch = re.search(USERPATTERN, text)
+        if usermatch:
+            info["username"] = usermatch.group(1)
+
+        # Upload date, shown on the page as DD/MM/YYYY HH:MM
+        datematch = re.search(DATEPATTERN, text)
+        if datematch:
+            d = datetime.datetime.strptime(datematch.group(1), "%d/%m/%Y %H:%M")
+            info["date"] = d.isoformat()
+
+        # "Unlisted" badge on the page
+        unlistmatch = re.search(UNLISTEDPATTERN, text)
+        if unlistmatch:
+            info["visibility"] = "unlisted"
+
+        # Original YouTube URL, if this was a YouTube download
+        yturlmatch = re.search(YOUTUBEPATTERN, text)
+        if yturlmatch:
+            info["yturl"] = "https://www.youtube.com/watch?v=" + yturlmatch.group(1)
+
+    except Exception as e:
+        print(e)
+
+    # Now request just the headers of the raw download; the
+    # Content-Disposition header carries the original filename
+    try:
+        head = SESSION.head(ROOT + i + "?raw=1&download=1")
+
+        hdrs = head.headers
+
+        if 'content-disposition' in hdrs:
+            # Filenames are in UTF-8, but HTTP headers are supposed to be latin-1.
+            # So encode it and re-decode it as proper UTF-8
+            m = re.match(DISPOSITIONPATTERN, hdrs['content-disposition'].encode('iso-8859-1').decode('utf-8'))
+            if m:
+                info["filename"] = m.group(1).strip("\"")
+    except Exception as e:
+        print(e)
+
+    # Rename the downloaded file to "[id] - [original filename]" if we got one
+    if "filename" in info:
+        n = i + " - " + info["filename"]
+        try:
+            os.rename(i, n)
+            i = n
+        except OSError:
+            pass
+
+    # Write the metadata next to the file
+    with open(i + ".json", "w", encoding="utf-8") as f:
+        json.dump(info, f)
+
+    # Sleep for a couple of seconds so we don't overload the server
+    # and possibly get bot-blocked
+    time.sleep(2)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sanae.site/scrape.py	Sat Jan 24 15:10:05 2026 -0500
@@ -0,0 +1,45 @@
+import requests
+import itertools
+import time
+import random
+
+random.seed()
+
+# Root URL
+ROOT = "https://donsebagay.mom/"
+# This file allows you to start/stop the script at any time, and it
+# will pick up right where it left off.
+FILE = "donseba-status.txt"
+# String containing all possible characters in a file ID
+CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"
+
+def all_possibilities(l):
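+	# Yields every possible 3-character ID (a tuple of characters),
+	# in the order the characters appear in l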
+	# maybe we need products of 2 as well...
+	yield from itertools.product(*([l] * 3))
+
+s = requests.Session()
+
+try:
+	with open(FILE, "r", encoding="utf-8") as f:
+		skip = f.read()
+except OSError:
+	skip = None
+
+for x in all_possibilities(CHARS):
+	x = ''.join(x)
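+	# If resuming, fast-forward past every ID up to and including the
+	# last one recorded in the status file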
+	if skip is not None:
+		if x == skip:
+			skip = None
+		continue
+	r = s.get(ROOT + x + "?raw=1&download=1")
+	print(x, r)
+	# For some reason, vhoda has a "Page Not Found" page
+	# that doesn't return a 404 status code, so we have to
+	# detect it through 'Content-Type' instead
+	if r.headers["Content-Type"] != "text/html; charset=UTF-8":
+		with open(x, "wb") as f:
+			f.write(r.content)
+	with open(FILE, "w") as f:
+		f.write(x)
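+	# Wait up to 5 seconds between requests so we don't hammer the server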
+	time.sleep(random.random() * 5)