comparison channeldownloader.py @ 15:615e1ca0212a

*: add support for loading the split db from a zip file; saves time having to decompress it. Also fixed a couple of bugs here and there (notably with the IA downloading)
author Paper <paper@tflc.us>
date Fri, 27 Feb 2026 17:01:18 -0500
parents 03c8fd4069fb
children 088d9a3a2524
comparison
equal deleted inserted replaced
14:03c8fd4069fb 15:615e1ca0212a
42 import re 42 import re
43 import time 43 import time
44 import urllib.request 44 import urllib.request
45 import os 45 import os
46 import ssl 46 import ssl
47 import io
47 from urllib.error import HTTPError 48 from urllib.error import HTTPError
48 from pathlib import Path 49 from pathlib import Path
49 50
50 # We can utilize special simdjson features if it is available 51 # We can utilize special simdjson features if it is available
51 simdjson = False 52 simdjson = False
83 from requests.exceptions import ConnectTimeout 84 from requests.exceptions import ConnectTimeout
84 ia_works = True 85 ia_works = True
85 except ImportError: 86 except ImportError:
86 print("failed to import the Internet Archive's python library!") 87 print("failed to import the Internet Archive's python library!")
87 print("downloading from IA will not work.") 88 print("downloading from IA will not work.")
89
90 zipfile_works = False
91
92 try:
93 import zipfile
94 zipfile_works = True
95 except ImportError:
96 print("failed to import zipfile!")
97 print("loading the database from a .zip file will not work.")
88 98
89 ############################################################################## 99 ##############################################################################
90 ## DOWNLOADERS 100 ## DOWNLOADERS
91 101
92 # All downloaders should be a function under this signature: 102 # All downloaders should be a function under this signature:
208 return 1 218 return 1
209 219
210 220
211 # Internet Archive (tubeup) 221 # Internet Archive (tubeup)
212 def ia_dl(video: dict, basename: str, output: str) -> int: 222 def ia_dl(video: dict, basename: str, output: str) -> int:
213 def ia_file_legit(file: internetarchive.File, vidid: str) -> bool: 223 def ia_file_legit(f: str, vidid: str) -> bool:
214 # FIXME: 224 # FIXME:
215 # 225 #
216 # There are some items on IA that combine the old tubeup behavior 226 # There are some items on IA that combine the old tubeup behavior
217 # (i.e., including the sanitized video name before the ID) 227 # (i.e., including the sanitized video name before the ID)
218 # and the new tubeup behavior (filename only contains the video ID) 228 # and the new tubeup behavior (filename only contains the video ID)
229 # 239 #
230 We should also check whether the copy on IA is higher quality 240 We should also check whether the copy on IA is higher quality
231 # than a local copy... :) 241 # than a local copy... :)
232 if not re.search(r"((?:.+?-)?" + vidid + r"\.(?:mp4|jpg|webp|mkv|w" 242 if not re.search(r"((?:.+?-)?" + vidid + r"\.(?:mp4|jpg|webp|mkv|w"
233 r"ebm|info\.json|description|annotations.xml))", 243 r"ebm|info\.json|description|annotations.xml))",
234 f.name): 244 f):
235 return False 245 return False
236 246
237 # now, check the metadata
238 print(f)
239 return True 247 return True
240 248
241 249
242 if not internetarchive.get_item("youtube-%s" % video["id"]).exists: 250 if not internetarchive.get_item("youtube-%s" % video["id"]).exists:
243 return 2 251 return 2
245 flist = [ 253 flist = [
246 f.name 254 f.name
247 for f in internetarchive.get_files("youtube-%s" % video["id"]) 255 for f in internetarchive.get_files("youtube-%s" % video["id"])
248 if ia_file_legit(f.name, video["id"]) 256 if ia_file_legit(f.name, video["id"])
249 ] 257 ]
258
250 while True: 259 while True:
251 try: 260 try:
252 internetarchive.download("youtube-%s" % video["id"], files=flist, 261 internetarchive.download("youtube-%s" % video["id"], files=flist,
253 verbose=True, destdir=output, 262 verbose=True,
254 no_directory=True, ignore_existing=True, 263 no_directory=True, ignore_existing=True,
255 retries=9999) 264 retries=9999)
256 break 265 break
257 except ConnectTimeout: 266 except ConnectTimeout:
258 time.sleep(1) 267 time.sleep(1)
264 # Newer versions of tubeup save only the video ID. 273 # Newer versions of tubeup save only the video ID.
265 # Account for this by replacing it. 274 # Account for this by replacing it.
266 # 275 #
267 # paper/2025-08-30: fixed a bug where video IDs with hyphens 276 # paper/2025-08-30: fixed a bug where video IDs with hyphens
268 # would be incorrectly truncated 277
278 #
279 # paper/2026-02-27: an update in the IA python library changed
280 # the way destdir works, so it just gets entirely ignored.
269 for fname in flist: 281 for fname in flist:
270 # ignore any files whose names are not simply the ID 282 def whitelist(s: str, vidid: str) -> bool:
271 if os.path.splitext(fname)[0] != video["id"]: 283 # special case: .info.json files
284 if s == ("%s.info.json" % vidid):
285 return ".info.json"
286
287 spli = os.path.splitext(fname)
288 if spli is None or len(spli) != 2 or spli[0] != vidid:
289 return None
290
291 return spli[1]
292
293 if not os.path.exists(fname):
272 continue 294 continue
273 295
274 if os.path.exists("%s/%s" % (output, fname)): 296 ext = whitelist(fname, video["id"])
275 os.replace("%s/%s" % (output, fname), 297 if ext is None:
276 "%s.%s" % (basename, os.path.splitext(fname))[1]) 298 continue
299
300 os.replace(fname, "%s%s" % (basename, ext))
277 return 0 301 return 0
278 302
279 303
280 def ytdlp_dl(video: dict, basename: str, output: str) -> int: 304 def ytdlp_dl(video: dict, basename: str, output: str) -> int:
281 # intentionally ignores all messages besides errors 305 # intentionally ignores all messages besides errors
349 373
350 ############################################################################## 374 ##############################################################################
351 375
352 376
353 def main(): 377 def main():
354 # generator; creates a list of files, and returns the parsed form of
355 # each. note that the parser is not necessarily
356 def load_split_files(path: str): 378 def load_split_files(path: str):
357 list_files = [] 379 def cruft(isdir: bool, listdir, openf):
358 380 # build the path list
359 # build the path list 381 if not isdir:
360 if not os.path.isdir(path): 382 list_files = [path]
361 list_files.append(path) 383 else:
362 else: 384 list_files = filter(lambda x: re.search(r"vids[0-9\-]+?\.json", x), listdir())
363 for fi in os.listdir(path): 385
364 if re.search(r"vids[0-9\-]+?\.json", fi): 386 # now open each as a json
365 list_files.append(path + "/" + fi) 387 for fi in list_files:
366 388 print(fi)
367 # now open each as a json 389 with openf(fi, "r") as infile:
368 for fi in list_files: 390 if simdjson:
369 print(fi) 391 # Using this is a lot faster in SIMDJSON, since instead
370 with open(fi, "r", encoding="utf-8") as infile: 392 # of converting all of the JSON key/value pairs into
371 if simdjson: 393 # native Python objects, they stay in an internal state.
372 # Using this is a lot faster in SIMDJSON, since instead 394 #
373 # of converting all of the JSON key/value pairs into 395 # This means we only get the stuff we absolutely need,
374 # native Python objects, they stay in an internal state. 396 # which is the uploader ID, and copy everything else
375 # 397 # if the ID is one we are looking for.
376 # This means we only get the stuff we absolutely need, 398 parser = json.Parser()
377 # which is the uploader ID, and copy everything else 399 yield parser.parse(infile.read())
378 # if the ID is one we are looking for. 400 del parser
379 parser = json.Parser() 401 else:
380 yield parser.parse(infile.read()) 402 yield json.load(infile)
381 del parser 403
382 else: 404
383 yield json.load(infile) 405 try:
406 if not zipfile_works or os.path.isdir(path):
407 raise Exception
408
409 with zipfile.ZipFile(path, "r") as myzip:
410 yield from cruft(True, lambda: myzip.namelist(), lambda f, m: io.TextIOWrapper(myzip.open(f, mode=m), encoding="utf-8"))
411 except Exception as e:
412 yield from cruft(os.path.isdir(path), lambda: os.listdir(path), lambda f, m: open(path + "/" + f, m, encoding="utf-8"))
384 413
385 414
386 def write_metadata(i: dict, basename: str) -> None: 415 def write_metadata(i: dict, basename: str) -> None:
387 # ehhh 416 # ehhh
388 if not os.path.exists(basename + ".info.json"): 417 if not os.path.exists(basename + ".info.json"):