comparison channeldownloader.py @ 16:088d9a3a2524

Improvements to the IA downloader: we now explicitly ignore any file not marked "original". This seems to filter out derivative files (such as ogv and other files we don't want) but keeps some of the top-level metadata.
author Paper <paper@tflc.us>
date Sat, 28 Feb 2026 14:38:04 -0500
parents 615e1ca0212a
children 0d10b2ce0140
comparison
equal deleted inserted replaced
15:615e1ca0212a 16:088d9a3a2524
43 import time 43 import time
44 import urllib.request 44 import urllib.request
45 import os 45 import os
46 import ssl 46 import ssl
47 import io 47 import io
48 import shutil
49 import xml.etree.ElementTree as XmlET
48 from urllib.error import HTTPError 50 from urllib.error import HTTPError
49 from pathlib import Path 51 from pathlib import Path
50 52
51 # We can utilize special simdjson features if it is available 53 # We can utilize special simdjson features if it is available
52 simdjson = False 54 simdjson = False
216 print(" unknown error downloading video!") 218 print(" unknown error downloading video!")
217 print(e) 219 print(e)
218 return 1 220 return 1
219 221
220 222
223 # Also captures the ID for comparison
224 IA_REGEX = re.compile(r"(?:(?P<date>\d{8}) - )?(?P<title>.+?)?(?:-| \[)?(?:(?P<id>[A-z0-9_\-]{11})]?|(?: \((?P<format>(?:(?:(?P<resolution>\d+)p_(?P<fps>\d+)fps_(?P<vcodec>H264)-)?(?P<abitrate>\d+)kbit_(?P<acodec>AAC|Vorbis))|BQ|Description)\)))\.(?P<extension>mp4|info\.json|description|annotations\.xml|webp|mkv|webm|jpg|jpeg|ogg|txt|m4a)$")
225
226
221 # Internet Archive (tubeup) 227 # Internet Archive (tubeup)
222 def ia_dl(video: dict, basename: str, output: str) -> int: 228 def ia_dl(video: dict, basename: str, output: str) -> int:
223 def ia_file_legit(f: str, vidid: str) -> bool: 229 def ia_file_legit(f: str, vidid: str, vidtitle: str) -> bool:
224 # FIXME: 230 # FIXME:
225 # 231 #
226 # There are some items on IA that combine the old tubeup behavior 232 # There are some items on IA that combine the old tubeup behavior
227 # (i.e., including the sanitized video name before the ID) 233 # (i.e., including the sanitized video name before the ID)
228 # and the new tubeup behavior (filename only contains the video ID) 234 # and the new tubeup behavior (filename only contains the video ID)
237 # (from when the owners changed the title). We should ideally only 243 # (from when the owners changed the title). We should ideally only
238 # download unique files. IA seems to provide SHA1 hashes... 244 # download unique files. IA seems to provide SHA1 hashes...
239 # 245 #
240 # We should also check whether the copy on IA is higher quality 246 # We should also check whether the copy on IA is higher quality
241 # than a local copy... :) 247 # than a local copy... :)
242 if not re.search(r"((?:.+?-)?" + vidid + r"\.(?:mp4|jpg|webp|mkv|w" 248
243 r"ebm|info\.json|description|annotations.xml))", 249 IA_ID = "youtube-%s" % vidid
244 f): 250
251 # Ignore IA generated thumbnails
252 if f.startswith("%s.thumbs/" % IA_ID) or f == "__ia_thumb.jpg":
245 return False 253 return False
246 254
247 return True 255 for i in ["_archive.torrent", "_files.xml", "_meta.sqlite", "_meta.xml"]:
248 256 if f == (IA_ID + i):
249 257 return False
250 if not internetarchive.get_item("youtube-%s" % video["id"]).exists: 258
259 # Try to match with our known filename regex
260 # This properly matches:
261 # ??????????? - YYYYMMDD - TITLE [ID].EXTENSION
262 # old tubeup - TITLE-ID.EXTENSION
263 # tubeup - ID.EXTENSION
264 # JDownloader - TITLE (FORMAT).EXTENSION
265 # (Possibly we should match other filenames too??)
266 m = re.match(IA_REGEX, f)
267 if m is None:
268 return False
269
270 if m.group("id"):
271 return (m.group("id") == vidid)
272 elif m.group("title") is not None:
273 def asciify(s: str) -> str:
274 # Replace all non-ASCII chars with underscores, and get rid of any whitespace
275 return ''.join([i if ord(i) >= 0x20 and ord(i) < 0x80 and i not in "/\\" else '_' for i in s]).strip()
276
277 if asciify(m.group("title")) == asciify(vidtitle):
278 return True # Close enough
279
280 # Uh oh
281 return False
282
283 def ia_get_original_files(identifier: str) -> typing.Optional[list]:
284 def ia_xml(identifier: str) -> typing.Optional[str]:
285 for _ in range(1, 9999):
286 try:
287 with urllib.request.urlopen("https://archive.org/download/%s/%s_files.xml" % (identifier, identifier)) as req:
288 return req.read().decode("utf-8")
289 except HTTPError as e:
290 if e.code == 404 or e.code == 503:
291 return None
292 time.sleep(5)
293
294 d = ia_xml(identifier)
295 if d is None:
296 return None
297
298 try:
299 # Now parse the XML and make a list of each original file
300 return [x.attrib["name"] for x in filter(lambda x: x.attrib["source"] == "original", XmlET.fromstring(d))]
301 except Exception as e:
302 print(e)
303 return None
304
305 originalfiles = ia_get_original_files("youtube-%s" % video["id"])
306 if not originalfiles:
251 return 2 307 return 2
252 308
253 flist = [ 309 flist = [
254 f.name 310 f
255 for f in internetarchive.get_files("youtube-%s" % video["id"]) 311 for f in originalfiles
256 if ia_file_legit(f.name, video["id"]) 312 if ia_file_legit(f, video["id"], video["title"] if not "fulltitle" in video else video["fulltitle"])
257 ] 313 ]
314
315 if not flist:
316 return 2 # ??????
258 317
259 while True: 318 while True:
260 try: 319 try:
261 internetarchive.download("youtube-%s" % video["id"], files=flist, 320 internetarchive.download("youtube-%s" % video["id"], files=flist,
262 verbose=True, 321 verbose=True, ignore_existing=True,
263 no_directory=True, ignore_existing=True,
264 retries=9999) 322 retries=9999)
265 break 323 break
266 except ConnectTimeout: 324 except ConnectTimeout:
267 time.sleep(1) 325 time.sleep(1)
268 continue 326 continue
277 # would incorrectly truncate 335 # would incorrectly truncate
278 # 336 #
279 # paper/2026-02-27: an update in the IA python library changed 337 # paper/2026-02-27: an update in the IA python library changed
280 # the way destdir works, so it just gets entirely ignored. 338 # the way destdir works, so it just gets entirely ignored.
281 for fname in flist: 339 for fname in flist:
282 def whitelist(s: str, vidid: str) -> bool: 340 def getext(s: str, vidid: str) -> typing.Optional[str]:
283 # special case: .info.json files 341 # special cases
284 if s == ("%s.info.json" % vidid): 342 for i in [".info.json", ".annotations.xml"]:
285 return ".info.json" 343 if s.endswith(i):
286 344 return i
287 spli = os.path.splitext(fname) 345
288 if spli is None or len(spli) != 2 or spli[0] != vidid: 346 # Handle JDownloader "TITLE (Description).txt"
347 if s.endswith(" (Description).txt"):
348 return ".description"
349
350 # Catch-all for remaining extensions
351 spli = os.path.splitext(s)
352 if spli is None or len(spli) != 2:
289 return None 353 return None
290 354
291 return spli[1] 355 return spli[1]
292 356
293 if not os.path.exists(fname): 357 ondisk = "youtube-%s/%s" % (video["id"], fname)
358
359 if not os.path.exists(ondisk):
294 continue 360 continue
295 361
296 ext = whitelist(fname, video["id"]) 362 ext = getext(fname, video["id"])
297 if ext is None: 363 if ext is None:
298 continue 364 continue
299 365
300 os.replace(fname, "%s%s" % (basename, ext)) 366 os.replace(ondisk, "%s%s" % (basename, ext))
367
368 shutil.rmtree("youtube-%s" % video["id"])
369
301 return 0 370 return 0
302 371
303 372
304 def ytdlp_dl(video: dict, basename: str, output: str) -> int: 373 def ytdlp_dl(video: dict, basename: str, output: str) -> int:
305 # intentionally ignores all messages besides errors 374 # intentionally ignores all messages besides errors
468 output = channel["output"] 537 output = channel["output"]
469 538
470 print("%s:" % i["id"]) 539 print("%s:" % i["id"])
471 basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], 540 basename = "%s/%s-%s" % (output, sanitize_filename(i["title"],
472 restricted=True), i["id"]) 541 restricted=True), i["id"])
473 files = [y 542 def filenotworthit(f) -> bool:
543 try:
544 return bool(os.path.getsize(f))
545 except:
546 return False
547
548 pathoutput = Path(output)
549
550 # This is terrible
551 files = list(filter(filenotworthit, [y
474 for p in ["mkv", "mp4", "webm"] 552 for p in ["mkv", "mp4", "webm"]
475 for y in Path(output).glob(("*-%s." + p) % i["id"])] 553 for y in pathoutput.glob(("*-%s." + p) % i["id"])]))
476 if files: 554 if files:
477 print(" video already downloaded!") 555 print(" video already downloaded!")
478 videos.remove(i) 556 videos.remove(i)
479 write_metadata(i, basename) 557 write_metadata(i, basename)
480 continue 558 continue