Mercurial > channeldownloader
changeset 16:088d9a3a2524
Improvements to the IA downloader.
We now explicitly ignore any file not marked "original". This seems to
filter out derivative files (such as ogv and other formats we don't
want) while keeping some of the top-level metadata.
| author | Paper <paper@tflc.us> |
|---|---|
| date | Sat, 28 Feb 2026 14:38:04 -0500 |
| parents | 615e1ca0212a |
| children | 0d10b2ce0140 |
| files | channeldownloader.py |
| diffstat | 1 files changed, 100 insertions(+), 22 deletions(-) [+] |
line wrap: on
line diff
--- a/channeldownloader.py Fri Feb 27 17:01:18 2026 -0500 +++ b/channeldownloader.py Sat Feb 28 14:38:04 2026 -0500 @@ -45,6 +45,8 @@ import os import ssl import io +import shutil +import xml.etree.ElementTree as XmlET from urllib.error import HTTPError from pathlib import Path @@ -218,9 +220,13 @@ return 1 +# Also captures the ID for comparison +IA_REGEX = re.compile(r"(?:(?P<date>\d{8}) - )?(?P<title>.+?)?(?:-| \[)?(?:(?P<id>[A-z0-9_\-]{11})]?|(?: \((?P<format>(?:(?:(?P<resolution>\d+)p_(?P<fps>\d+)fps_(?P<vcodec>H264)-)?(?P<abitrate>\d+)kbit_(?P<acodec>AAC|Vorbis))|BQ|Description)\)))\.(?P<extension>mp4|info\.json|description|annotations\.xml|webp|mkv|webm|jpg|jpeg|ogg|txt|m4a)$") + + # Internet Archive (tubeup) def ia_dl(video: dict, basename: str, output: str) -> int: - def ia_file_legit(f: str, vidid: str) -> bool: + def ia_file_legit(f: str, vidid: str, vidtitle: str) -> bool: # FIXME: # # There are some items on IA that combine the old tubeup behavior @@ -239,28 +245,80 @@ # # We should also check if whether the copy on IA is higher quality # than a local copy... :) - if not re.search(r"((?:.+?-)?" + vidid + r"\.(?:mp4|jpg|webp|mkv|w" - r"ebm|info\.json|description|annotations.xml))", - f): + + IA_ID = "youtube-%s" % vidid + + # Ignore IA generated thumbnails + if f.startswith("%s.thumbs/" % IA_ID) or f == "__ia_thumb.jpg": + return False + + for i in ["_archive.torrent", "_files.xml", "_meta.sqlite", "_meta.xml"]: + if f == (IA_ID + i): + return False + + # Try to match with our known filename regex + # This properly matches: + # ??????????? - YYYYMMDD - TITLE [ID].EXTENSION + # old tubeup - TITLE-ID.EXTENSION + # tubeup - ID.EXTENSION + # JDownloader - TITLE (FORMAT).EXTENSION + # (Possibly we should match other filenames too??) 
+ m = re.match(IA_REGEX, f) + if m is None: return False - return True + if m.group("id"): + return (m.group("id") == vidid) + elif m.group("title") is not None: + def asciify(s: str) -> str: + # Replace all non-ASCII chars with underscores, and get rid of any whitespace + return ''.join([i if ord(i) >= 0x20 and ord(i) < 0x80 and i not in "/\\" else '_' for i in s]).strip() + + if asciify(m.group("title")) == asciify(vidtitle): + return True # Close enough + + # Uh oh + return False + def ia_get_original_files(identifier: str) -> typing.Optional[list]: + def ia_xml(identifier: str) -> typing.Optional[str]: + for _ in range(1, 9999): + try: + with urllib.request.urlopen("https://archive.org/download/%s/%s_files.xml" % (identifier, identifier)) as req: + return req.read().decode("utf-8") + except HTTPError as e: + if e.code == 404 or e.code == 503: + return None + time.sleep(5) - if not internetarchive.get_item("youtube-%s" % video["id"]).exists: + d = ia_xml(identifier) + if d is None: + return None + + try: + # Now parse the XML and make a list of each original file + return [x.attrib["name"] for x in filter(lambda x: x.attrib["source"] == "original", XmlET.fromstring(d))] + except Exception as e: + print(e) + return None + + originalfiles = ia_get_original_files("youtube-%s" % video["id"]) + if not originalfiles: return 2 flist = [ - f.name - for f in internetarchive.get_files("youtube-%s" % video["id"]) - if ia_file_legit(f.name, video["id"]) + f + for f in originalfiles + if ia_file_legit(f, video["id"], video["title"] if not "fulltitle" in video else video["fulltitle"]) ] + if not flist: + return 2 # ?????? 
+ while True: try: internetarchive.download("youtube-%s" % video["id"], files=flist, - verbose=True, - no_directory=True, ignore_existing=True, + verbose=True, ignore_existing=True, retries=9999) break except ConnectTimeout: @@ -279,25 +337,36 @@ # paper/2026-02-27: an update in the IA python library changed # the way destdir works, so it just gets entirely ignored. for fname in flist: - def whitelist(s: str, vidid: str) -> bool: - # special case: .info.json files - if s == ("%s.info.json" % vidid): - return ".info.json" + def getext(s: str, vidid: str) -> typing.Optional[str]: + # special cases + for i in [".info.json", ".annotations.xml"]: + if s.endswith(i): + return i - spli = os.path.splitext(fname) - if spli is None or len(spli) != 2 or spli[0] != vidid: + # Handle JDownloader "TITLE (Description).txt" + if s.endswith(" (Description).txt"): + return ".description" + + # Catch-all for remaining extensions + spli = os.path.splitext(s) + if spli is None or len(spli) != 2: return None return spli[1] - if not os.path.exists(fname): + ondisk = "youtube-%s/%s" % (video["id"], fname) + + if not os.path.exists(ondisk): continue - ext = whitelist(fname, video["id"]) + ext = getext(fname, video["id"]) if ext is None: continue - os.replace(fname, "%s%s" % (basename, ext)) + os.replace(ondisk, "%s%s" % (basename, ext)) + + shutil.rmtree("youtube-%s" % video["id"]) + return 0 @@ -470,9 +539,18 @@ print("%s:" % i["id"]) basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"]) - files = [y + def filenotworthit(f) -> bool: + try: + return bool(os.path.getsize(f)) + except: + return False + + pathoutput = Path(output) + + # This is terrible + files = list(filter(filenotworthit, [y for p in ["mkv", "mp4", "webm"] - for y in Path(output).glob(("*-%s." + p) % i["id"])] + for y in pathoutput.glob(("*-%s." + p) % i["id"])])) if files: print(" video already downloaded!") videos.remove(i)
