# HG changeset patch # User Paper # Date 1772307484 18000 # Node ID 088d9a3a2524f47a915ab8e9b3d814859c12ae38 # Parent 615e1ca0212a7d0307bb9814ce24444fa5d2f7e2 improvements to IA downloader now we explicitly ignore any file not "original". this seems to filter out derivative files (such as ogv and other shit we don't want) but keeps some of the toplevel metadata diff -r 615e1ca0212a -r 088d9a3a2524 channeldownloader.py --- a/channeldownloader.py Fri Feb 27 17:01:18 2026 -0500 +++ b/channeldownloader.py Sat Feb 28 14:38:04 2026 -0500 @@ -45,6 +45,8 @@ import os import ssl import io +import shutil +import xml.etree.ElementTree as XmlET from urllib.error import HTTPError from pathlib import Path @@ -218,9 +220,13 @@ return 1 +# Also captures the ID for comparison +IA_REGEX = re.compile(r"(?:(?P<date>\d{8}) - )?(?P<title>.+?)?(?:-| \[)?(?:(?P<id>[A-z0-9_\-]{11})]?|(?: \((?P<format>(?:(?:(?P<resolution>\d+)p_(?P<fps>\d+)fps_(?P<vcodec>H264)-)?(?P<abitrate>\d+)kbit_(?P<acodec>AAC|Vorbis))|BQ|Description)\)))\.(?P<extension>mp4|info\.json|description|annotations\.xml|webp|mkv|webm|jpg|jpeg|ogg|txt|m4a)$") + + # Internet Archive (tubeup) def ia_dl(video: dict, basename: str, output: str) -> int: - def ia_file_legit(f: str, vidid: str) -> bool: + def ia_file_legit(f: str, vidid: str, vidtitle: str) -> bool: # FIXME: # # There are some items on IA that combine the old tubeup behavior @@ -239,28 +245,80 @@ # # We should also check if whether the copy on IA is higher quality # than a local copy... :) - if not re.search(r"((?:.+?-)?" + vidid + r"\.(?:mp4|jpg|webp|mkv|w" - r"ebm|info\.json|description|annotations.xml))", - f): + + IA_ID = "youtube-%s" % vidid + + # Ignore IA generated thumbnails + if f.startswith("%s.thumbs/" % IA_ID) or f == "__ia_thumb.jpg": + return False + + for i in ["_archive.torrent", "_files.xml", "_meta.sqlite", "_meta.xml"]: + if f == (IA_ID + i): + return False + + # Try to match with our known filename regex + # This properly matches: + # ??????????? 
- YYYYMMDD - TITLE [ID].EXTENSION + # old tubeup - TITLE-ID.EXTENSION + # tubeup - ID.EXTENSION + # JDownloader - TITLE (FORMAT).EXTENSION + # (Possibly we should match other filenames too??) + m = re.match(IA_REGEX, f) + if m is None: return False - return True + if m.group("id"): + return (m.group("id") == vidid) + elif m.group("title") is not None: + def asciify(s: str) -> str: + # Replace all non-ASCII chars with underscores, and get rid of any whitespace + return ''.join([i if ord(i) >= 0x20 and ord(i) < 0x80 and i not in "/\\" else '_' for i in s]).strip() + + if asciify(m.group("title")) == asciify(vidtitle): + return True # Close enough + + # Uh oh + return False + def ia_get_original_files(identifier: str) -> typing.Optional[list]: + def ia_xml(identifier: str) -> typing.Optional[str]: + for _ in range(1, 9999): + try: + with urllib.request.urlopen("https://archive.org/download/%s/%s_files.xml" % (identifier, identifier)) as req: + return req.read().decode("utf-8") + except HTTPError as e: + if e.code == 404 or e.code == 503: + return None + time.sleep(5) - if not internetarchive.get_item("youtube-%s" % video["id"]).exists: + d = ia_xml(identifier) + if d is None: + return None + + try: + # Now parse the XML and make a list of each original file + return [x.attrib["name"] for x in filter(lambda x: x.attrib["source"] == "original", XmlET.fromstring(d))] + except Exception as e: + print(e) + return None + + originalfiles = ia_get_original_files("youtube-%s" % video["id"]) + if not originalfiles: return 2 flist = [ - f.name - for f in internetarchive.get_files("youtube-%s" % video["id"]) - if ia_file_legit(f.name, video["id"]) + f + for f in originalfiles + if ia_file_legit(f, video["id"], video["title"] if not "fulltitle" in video else video["fulltitle"]) ] + if not flist: + return 2 # ?????? 
+ while True: try: internetarchive.download("youtube-%s" % video["id"], files=flist, - verbose=True, - no_directory=True, ignore_existing=True, + verbose=True, ignore_existing=True, retries=9999) break except ConnectTimeout: @@ -279,25 +337,36 @@ # paper/2026-02-27: an update in the IA python library changed # the way destdir works, so it just gets entirely ignored. for fname in flist: - def whitelist(s: str, vidid: str) -> bool: - # special case: .info.json files - if s == ("%s.info.json" % vidid): - return ".info.json" + def getext(s: str, vidid: str) -> typing.Optional[str]: + # special cases + for i in [".info.json", ".annotations.xml"]: + if s.endswith(i): + return i - spli = os.path.splitext(fname) - if spli is None or len(spli) != 2 or spli[0] != vidid: + # Handle JDownloader "TITLE (Description).txt" + if s.endswith(" (Description).txt"): + return ".description" + + # Catch-all for remaining extensions + spli = os.path.splitext(s) + if spli is None or len(spli) != 2: return None return spli[1] - if not os.path.exists(fname): + ondisk = "youtube-%s/%s" % (video["id"], fname) + + if not os.path.exists(ondisk): continue - ext = whitelist(fname, video["id"]) + ext = getext(fname, video["id"]) if ext is None: continue - os.replace(fname, "%s%s" % (basename, ext)) + os.replace(ondisk, "%s%s" % (basename, ext)) + + shutil.rmtree("youtube-%s" % video["id"]) + return 0 @@ -470,9 +539,18 @@ print("%s:" % i["id"]) basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"]) - files = [y + def filenotworthit(f) -> bool: + try: + return bool(os.path.getsize(f)) + except: + return False + + pathoutput = Path(output) + + # This is terrible + files = list(filter(filenotworthit, [y for p in ["mkv", "mp4", "webm"] - for y in Path(output).glob(("*-%s." + p) % i["id"])] + for y in pathoutput.glob(("*-%s." + p) % i["id"])])) if files: print(" video already downloaded!") videos.remove(i)