# HG changeset patch # User Paper # Date 1772229678 18000 # Node ID 615e1ca0212a7d0307bb9814ce24444fa5d2f7e2 # Parent 03c8fd4069fbc348924bed3e7d0bb1d219f8e9ac *: add support for loading the split db from a zip file saves time having to decompress it. also fixed a couple bugs here and there (notably with the IA downloading) diff -r 03c8fd4069fb -r 615e1ca0212a channeldownloader.py --- a/channeldownloader.py Sat Aug 30 17:09:56 2025 -0400 +++ b/channeldownloader.py Fri Feb 27 17:01:18 2026 -0500 @@ -44,6 +44,7 @@ import urllib.request import os import ssl +import io from urllib.error import HTTPError from pathlib import Path @@ -86,6 +87,15 @@ print("failed to import the Internet Archive's python library!") print("downloading from IA will not work.") +zipfile_works = False + +try: + import zipfile + zipfile_works = True +except ImportError: + print("failed to import zipfile!") + print("loading the database from a .zip file will not work.") + ############################################################################## ## DOWNLOADERS @@ -210,7 +220,7 @@ # Internet Archive (tubeup) def ia_dl(video: dict, basename: str, output: str) -> int: - def ia_file_legit(file: internetarchive.File, vidid: str) -> bool: + def ia_file_legit(f: str, vidid: str) -> bool: # FIXME: # # There are some items on IA that combine the old tubeup behavior @@ -231,11 +241,9 @@ # than a local copy... :) if not re.search(r"((?:.+?-)?" 
+ vidid + r"\.(?:mp4|jpg|webp|mkv|w" r"ebm|info\.json|description|annotations.xml))", - f.name): + f): return False - # now, check the metadata - print(f) return True @@ -247,10 +255,11 @@ for f in internetarchive.get_files("youtube-%s" % video["id"]) if ia_file_legit(f.name, video["id"]) ] + while True: try: internetarchive.download("youtube-%s" % video["id"], files=flist, - verbose=True, destdir=output, + verbose=True, no_directory=True, ignore_existing=True, retries=9999) break @@ -266,14 +275,29 @@ # # paper/2025-08-30: fixed a bug where video IDs with hyphens # would incorrectly truncate + # + # paper/2026-02-27: an update in the IA python library changed + # the way destdir works, so it just gets entirely ignored. for fname in flist: - # ignore any files whose names are not simply the ID - if os.path.splitext(fname)[0] != video["id"]: + def whitelist(s: str, vidid: str) -> bool: + # special case: .info.json files + if s == ("%s.info.json" % vidid): + return ".info.json" + + spli = os.path.splitext(fname) + if spli is None or len(spli) != 2 or spli[0] != vidid: + return None + + return spli[1] + + if not os.path.exists(fname): continue - - if os.path.exists("%s/%s" % (output, fname)): - os.replace("%s/%s" % (output, fname), - "%s.%s" % (basename, os.path.splitext(fname))[1]) + + ext = whitelist(fname, video["id"]) + if ext is None: + continue + + os.replace(fname, "%s%s" % (basename, ext)) return 0 @@ -351,36 +375,41 @@ def main(): - # generator; creates a list of files, and returns the parsed form of - # each. 
def load_split_files(path: str):
    """Yield the parsed form of each split-database shard found at *path*.

    *path* may be one of:
      - a single .json file,
      - a directory containing ``vids*.json`` shard files,
      - a .zip archive of shard files (only when the ``zipfile`` module
        imported successfully, i.e. ``zipfile_works`` is true) — members
        are read in place, saving the time of decompressing first.

    Yields one parsed JSON document per shard.  Depends on the module
    globals ``simdjson`` (truthy when pysimdjson is available, in which
    case ``json`` is the pysimdjson module) and ``zipfile_works``.
    """

    # Shard filenames look like vids0001.json, vids-0002.json, etc.
    shard_re = re.compile(r"vids[0-9\-]+?\.json")

    def _parse(infile):
        # Parse one already-opened text stream and yield the result.
        if simdjson:
            # pysimdjson is much faster here: key/value pairs stay in the
            # parser's internal state instead of being converted to native
            # Python objects, so we only pay for the keys we actually read
            # (the uploader ID) and copy the rest only on a match.
            parser = json.Parser()
            yield parser.parse(infile.read())
            # NOTE: parsed values borrow the parser's buffer; drop the
            # parser only after the consumer is done with the document.
            del parser
        else:
            yield json.load(infile)

    is_dir = os.path.isdir(path)

    # Zip archive case.  Previously this was attempted inside a
    # try/raise-Exception dance with a bare ``except Exception`` fallback;
    # besides swallowing real errors, a failure occurring *mid-iteration*
    # would restart yielding from the filesystem fallback and process
    # shards twice.  Deciding upfront with is_zipfile avoids both.
    if zipfile_works and not is_dir and zipfile.is_zipfile(path):
        with zipfile.ZipFile(path, "r") as archive:
            for name in archive.namelist():
                if not shard_re.search(name):
                    continue
                print(name)
                # ZipFile.open returns a binary stream; wrap for text JSON.
                with io.TextIOWrapper(archive.open(name, mode="r"),
                                      encoding="utf-8") as infile:
                    yield from _parse(infile)
        return

    # Filesystem case: build the list of shard paths.
    if is_dir:
        shard_paths = [os.path.join(path, name)
                       for name in os.listdir(path)
                       if shard_re.search(name)]
    else:
        # Single plain .json file.  (Fixes a regression where the path
        # was joined onto itself, opening ``path + "/" + path``.)
        shard_paths = [path]

    for shard in shard_paths:
        print(shard)
        with open(shard, "r", encoding="utf-8") as infile:
            yield from _parse(infile)