changeset 15:615e1ca0212a

*: add support for loading the split db from a zip file; this saves time having to decompress it. Also fixed a couple of bugs here and there (notably with the IA downloading)
author Paper <paper@tflc.us>
date Fri, 27 Feb 2026 17:01:18 -0500
parents 03c8fd4069fb
children 088d9a3a2524
files channeldownloader.py
diffstat 1 files changed, 68 insertions(+), 39 deletions(-) [+]
line wrap: on
line diff
--- a/channeldownloader.py	Sat Aug 30 17:09:56 2025 -0400
+++ b/channeldownloader.py	Fri Feb 27 17:01:18 2026 -0500
@@ -44,6 +44,7 @@
 import urllib.request
 import os
 import ssl
+import io
 from urllib.error import HTTPError
 from pathlib import Path
 
@@ -86,6 +87,15 @@
     print("failed to import the Internet Archive's python library!")
     print("downloading from IA will not work.")
 
+zipfile_works = False
+
+try:
+    import zipfile
+    zipfile_works = True
+except ImportError:
+    print("failed to import zipfile!")
+    print("loading the database from a .zip file will not work.")
+
 ##############################################################################
 ## DOWNLOADERS
 
@@ -210,7 +220,7 @@
 
 # Internet Archive (tubeup)
 def ia_dl(video: dict, basename: str, output: str) -> int:
-    def ia_file_legit(file: internetarchive.File, vidid: str) -> bool:
+    def ia_file_legit(f: str, vidid: str) -> bool:
         # FIXME:
         #
         # There are some items on IA that combine the old tubeup behavior
@@ -231,11 +241,9 @@
         # than a local copy... :)
         if not re.search(r"((?:.+?-)?" + vidid + r"\.(?:mp4|jpg|webp|mkv|w"
                          r"ebm|info\.json|description|annotations.xml))",
-                         f.name):
+                         f):
             return False
 
-        # now, check the metadata
-        print(f)
         return True
 
 
@@ -247,10 +255,11 @@
         for f in internetarchive.get_files("youtube-%s" % video["id"])
         if ia_file_legit(f.name, video["id"])
     ]
+
     while True:
         try:
             internetarchive.download("youtube-%s" % video["id"], files=flist,
-                                     verbose=True, destdir=output,
+                                     verbose=True,
                                      no_directory=True, ignore_existing=True,
                                      retries=9999)
             break
@@ -266,14 +275,29 @@
     #
     # paper/2025-08-30: fixed a bug where video IDs with hyphens
     # would incorrectly truncate
+    #
+    # paper/2026-02-27: an update in the IA python library changed
+    # the way destdir works, so it just gets entirely ignored.
     for fname in flist:
-        # ignore any files whose names are not simply the ID
-        if os.path.splitext(fname)[0] != video["id"]:
+        def whitelist(s: str, vidid: str) -> bool:
+            # special case: .info.json files
+            if s == ("%s.info.json" % vidid):
+                return ".info.json"
+
+            spli = os.path.splitext(fname)
+            if spli is None or len(spli) != 2 or spli[0] != vidid:
+                return None
+
+            return spli[1]
+
+        if not os.path.exists(fname):
             continue
-            
-        if os.path.exists("%s/%s" % (output, fname)):
-            os.replace("%s/%s" % (output, fname),
-                       "%s.%s" % (basename, os.path.splitext(fname))[1])
+
+        ext = whitelist(fname, video["id"])
+        if ext is None:
+            continue
+
+        os.replace(fname, "%s%s" % (basename, ext))
     return 0
 
 
@@ -351,36 +375,41 @@
 
 
 def main():
-    # generator; creates a list of files, and returns the parsed form of
-    # each. note that the parser is not necessarily 
     def load_split_files(path: str):
-        list_files = []
-
-        # build the path list
-        if not os.path.isdir(path):
-            list_files.append(path)
-        else:
-            for fi in os.listdir(path):
-                if re.search(r"vids[0-9\-]+?\.json", fi):
-                    list_files.append(path + "/" + fi)
+        def cruft(isdir: bool, listdir, openf):
+            # build the path list
+            if not isdir:
+                list_files = [path]
+            else:
+                list_files = filter(lambda x: re.search(r"vids[0-9\-]+?\.json", x), listdir())
 
-        # now open each as a json
-        for fi in list_files:
-            print(fi)
-            with open(fi, "r", encoding="utf-8") as infile:
-                if simdjson:
-                    # Using this is a lot faster in SIMDJSON, since instead
-                    # of converting all of the JSON key/value pairs into
-                    # native Python objects, they stay in an internal state.
-                    #
-                    # This means we only get the stuff we absolutely need,
-                    # which is the uploader ID, and copy everything else
-                    # if the ID is one we are looking for.
-                    parser = json.Parser()
-                    yield parser.parse(infile.read())
-                    del parser
-                else:
-                    yield json.load(infile)
+            # now open each as a json
+            for fi in list_files:
+                print(fi)
+                with openf(fi, "r") as infile:
+                    if simdjson:
+                        # Using this is a lot faster in SIMDJSON, since instead
+                        # of converting all of the JSON key/value pairs into
+                        # native Python objects, they stay in an internal state.
+                        #
+                        # This means we only get the stuff we absolutely need,
+                        # which is the uploader ID, and copy everything else
+                        # if the ID is one we are looking for.
+                        parser = json.Parser()
+                        yield parser.parse(infile.read())
+                        del parser
+                    else:
+                        yield json.load(infile)
+
+
+        try:
+            if not zipfile_works or os.path.isdir(path):
+                raise Exception
+
+            with zipfile.ZipFile(path, "r") as myzip:
+                yield from cruft(True, lambda: myzip.namelist(), lambda f, m: io.TextIOWrapper(myzip.open(f, mode=m), encoding="utf-8"))
+        except Exception as e:
+            yield from cruft(os.path.isdir(path), lambda: os.listdir(path), lambda f, m: open(path + "/" + f, m, encoding="utf-8"))
 
 
     def write_metadata(i: dict, basename: str) -> None: