Mercurial > channeldownloader
changeset 16:088d9a3a2524
Improvements to the IA downloader.
We now explicitly ignore any file not marked "original". This seems to
filter out derivative files (such as ogv and other formats we don't
want) while keeping some of the top-level metadata.
| author | Paper <paper@tflc.us> |
|---|---|
| date | Sat, 28 Feb 2026 14:38:04 -0500 |
| parents | 615e1ca0212a |
| children | 0d10b2ce0140 |
| files | channeldownloader.py |
| diffstat | 1 files changed, 100 insertions(+), 22 deletions(-) [+] |
line wrap: on
line diff
--- a/channeldownloader.py Fri Feb 27 17:01:18 2026 -0500 +++ b/channeldownloader.py Sat Feb 28 14:38:04 2026 -0500 @@ -45,6 +45,8 @@ import os import ssl import io +import shutil +import xml.etree.ElementTree as XmlET from urllib.error import HTTPError from pathlib import Path @@ -218,9 +220,13 @@ return 1 +# Also captures the ID for comparison +IA_REGEX = re.compile(r"(?:(?P<date>\d{8}) - )?(?P<title>.+?)?(?:-| \[)?(?:(?P<id>[A-z0-9_\-]{11})]?|(?: \((?P<format>(?:(?:(?P<resolution>\d+)p_(?P<fps>\d+)fps_(?P<vcodec>H264)-)?(?P<abitrate>\d+)kbit_(?P<acodec>AAC|Vorbis))|BQ|Description)\)))\.(?P<extension>mp4|info\.json|description|annotations\.xml|webp|mkv|webm|jpg|jpeg|ogg|txt|m4a)$") + + # Internet Archive (tubeup) def ia_dl(video: dict, basename: str, output: str) -> int: - def ia_file_legit(f: str, vidid: str) -> bool: + def ia_file_legit(f: str, vidid: str, vidtitle: str) -> bool: # FIXME: # # There are some items on IA that combine the old tubeup behavior @@ -239,28 +245,80 @@ # # We should also check if whether the copy on IA is higher quality # than a local copy... :) - if not re.search(r"((?:.+?-)?" + vidid + r"\.(?:mp4|jpg|webp|mkv|w" - r"ebm|info\.json|description|annotations.xml))", - f): + + IA_ID = "youtube-%s" % vidid + + # Ignore IA generated thumbnails + if f.startswith("%s.thumbs/" % IA_ID) or f == "__ia_thumb.jpg": + return False + + for i in ["_archive.torrent", "_files.xml", "_meta.sqlite", "_meta.xml"]: + if f == (IA_ID + i): + return False + + # Try to match with our known filename regex + # This properly matches: + # ??????????? - YYYYMMDD - TITLE [ID].EXTENSION + # old tubeup - TITLE-ID.EXTENSION + # tubeup - ID.EXTENSION + # JDownloader - TITLE (FORMAT).EXTENSION + # (Possibly we should match other filenames too??) 
+ m = re.match(IA_REGEX, f) + if m is None: return False - return True + if m.group("id"): + return (m.group("id") == vidid) + elif m.group("title") is not None: + def asciify(s: str) -> str: + # Replace all non-ASCII chars with underscores, and get rid of any whitespace + return ''.join([i if ord(i) >= 0x20 and ord(i) < 0x80 and i not in "/\\" else '_' for i in s]).strip() + + if asciify(m.group("title")) == asciify(vidtitle): + return True # Close enough + + # Uh oh + return False + def ia_get_original_files(identifier: str) -> typing.Optional[list]: + def ia_xml(identifier: str) -> typing.Optional[str]: + for _ in range(1, 9999): + try: + with urllib.request.urlopen("https://archive.org/download/%s/%s_files.xml" % (identifier, identifier)) as req: + return req.read().decode("utf-8") + except HTTPError as e: + if e.code == 404 or e.code == 503: + return None + time.sleep(5) - if not internetarchive.get_item("youtube-%s" % video["id"]).exists: + d = ia_xml(identifier) + if d is None: + return None + + try: + # Now parse the XML and make a list of each original file + return [x.attrib["name"] for x in filter(lambda x: x.attrib["source"] == "original", XmlET.fromstring(d))] + except Exception as e: + print(e) + return None + + originalfiles = ia_get_original_files("youtube-%s" % video["id"]) + if not originalfiles: return 2 flist = [ - f.name - for f in internetarchive.get_files("youtube-%s" % video["id"]) - if ia_file_legit(f.name, video["id"]) + f + for f in originalfiles + if ia_file_legit(f, video["id"], video["title"] if not "fulltitle" in video else video["fulltitle"]) ] + if not flist: + return 2 # ?????? 
+ while True: try: internetarchive.download("youtube-%s" % video["id"], files=flist, - verbose=True, - no_directory=True, ignore_existing=True, + verbose=True, ignore_existing=True, retries=9999) break except ConnectTimeout: @@ -279,25 +337,36 @@ # paper/2026-02-27: an update in the IA python library changed # the way destdir works, so it just gets entirely ignored. for fname in flist: - def whitelist(s: str, vidid: str) -> bool: - # special case: .info.json files - if s == ("%s.info.json" % vidid): - return ".info.json" + def getext(s: str, vidid: str) -> typing.Optional[str]: + # special cases + for i in [".info.json", ".annotations.xml"]: + if s.endswith(i): + return i - spli = os.path.splitext(fname) - if spli is None or len(spli) != 2 or spli[0] != vidid: + # Handle JDownloader "TITLE (Description).txt" + if s.endswith(" (Description).txt"): + return ".description" + + # Catch-all for remaining extensions + spli = os.path.splitext(s) + if spli is None or len(spli) != 2: return None return spli[1] - if not os.path.exists(fname): + ondisk = "youtube-%s/%s" % (video["id"], fname) + + if not os.path.exists(ondisk): continue - ext = whitelist(fname, video["id"]) + ext = getext(fname, video["id"]) if ext is None: continue - os.replace(fname, "%s%s" % (basename, ext)) + os.replace(ondisk, "%s%s" % (basename, ext)) + + shutil.rmtree("youtube-%s" % video["id"]) + return 0 @@ -470,9 +539,18 @@ print("%s:" % i["id"]) basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"]) - files = [y + def filenotworthit(f) -> bool: + try: + return bool(os.path.getsize(f)) + except: + return False + + pathoutput = Path(output) + + # This is terrible + files = list(filter(filenotworthit, [y for p in ["mkv", "mp4", "webm"] - for y in Path(output).glob(("*-%s." + p) % i["id"])] + for y in pathoutput.glob(("*-%s." + p) % i["id"])])) if files: print(" video already downloaded!") videos.remove(i)
