Mercurial > channeldownloader
comparison channeldownloader.py @ 15:615e1ca0212a
*: add support for loading the split db from a zip file
saves time by not having to decompress it.
also fixed a couple bugs here and there (notably with the IA downloading)
| author | Paper <paper@tflc.us> |
|---|---|
| date | Fri, 27 Feb 2026 17:01:18 -0500 |
| parents | 03c8fd4069fb |
| children | 088d9a3a2524 |
comparison
equal
deleted
inserted
replaced
| 14:03c8fd4069fb | 15:615e1ca0212a |
|---|---|
| 42 import re | 42 import re |
| 43 import time | 43 import time |
| 44 import urllib.request | 44 import urllib.request |
| 45 import os | 45 import os |
| 46 import ssl | 46 import ssl |
| 47 import io | |
| 47 from urllib.error import HTTPError | 48 from urllib.error import HTTPError |
| 48 from pathlib import Path | 49 from pathlib import Path |
| 49 | 50 |
| 50 # We can utilize special simdjson features if it is available | 51 # We can utilize special simdjson features if it is available |
| 51 simdjson = False | 52 simdjson = False |
| 83 from requests.exceptions import ConnectTimeout | 84 from requests.exceptions import ConnectTimeout |
| 84 ia_works = True | 85 ia_works = True |
| 85 except ImportError: | 86 except ImportError: |
| 86 print("failed to import the Internet Archive's python library!") | 87 print("failed to import the Internet Archive's python library!") |
| 87 print("downloading from IA will not work.") | 88 print("downloading from IA will not work.") |
| 89 | |
| 90 zipfile_works = False | |
| 91 | |
| 92 try: | |
| 93 import zipfile | |
| 94 zipfile_works = True | |
| 95 except ImportError: | |
| 96 print("failed to import zipfile!") | |
| 97 print("loading the database from a .zip file will not work.") | |
| 88 | 98 |
| 89 ############################################################################## | 99 ############################################################################## |
| 90 ## DOWNLOADERS | 100 ## DOWNLOADERS |
| 91 | 101 |
| 92 # All downloaders should be a function under this signature: | 102 # All downloaders should be a function under this signature: |
| 208 return 1 | 218 return 1 |
| 209 | 219 |
| 210 | 220 |
| 211 # Internet Archive (tubeup) | 221 # Internet Archive (tubeup) |
| 212 def ia_dl(video: dict, basename: str, output: str) -> int: | 222 def ia_dl(video: dict, basename: str, output: str) -> int: |
| 213 def ia_file_legit(file: internetarchive.File, vidid: str) -> bool: | 223 def ia_file_legit(f: str, vidid: str) -> bool: |
| 214 # FIXME: | 224 # FIXME: |
| 215 # | 225 # |
| 216 # There are some items on IA that combine the old tubeup behavior | 226 # There are some items on IA that combine the old tubeup behavior |
| 217 # (i.e., including the sanitized video name before the ID) | 227 # (i.e., including the sanitized video name before the ID) |
| 218 # and the new tubeup behavior (filename only contains the video ID) | 228 # and the new tubeup behavior (filename only contains the video ID) |
| 229 # | 239 # |
| 230 # We should also check whether the copy on IA is higher quality | 240 # We should also check whether the copy on IA is higher quality |
| 231 # than a local copy... :) | 241 # than a local copy... :) |
| 232 if not re.search(r"((?:.+?-)?" + vidid + r"\.(?:mp4|jpg|webp|mkv|w" | 242 if not re.search(r"((?:.+?-)?" + vidid + r"\.(?:mp4|jpg|webp|mkv|w" |
| 233 r"ebm|info\.json|description|annotations.xml))", | 243 r"ebm|info\.json|description|annotations.xml))", |
| 234 f.name): | 244 f): |
| 235 return False | 245 return False |
| 236 | 246 |
| 237 # now, check the metadata | |
| 238 print(f) | |
| 239 return True | 247 return True |
| 240 | 248 |
| 241 | 249 |
| 242 if not internetarchive.get_item("youtube-%s" % video["id"]).exists: | 250 if not internetarchive.get_item("youtube-%s" % video["id"]).exists: |
| 243 return 2 | 251 return 2 |
| 245 flist = [ | 253 flist = [ |
| 246 f.name | 254 f.name |
| 247 for f in internetarchive.get_files("youtube-%s" % video["id"]) | 255 for f in internetarchive.get_files("youtube-%s" % video["id"]) |
| 248 if ia_file_legit(f.name, video["id"]) | 256 if ia_file_legit(f.name, video["id"]) |
| 249 ] | 257 ] |
| 258 | |
| 250 while True: | 259 while True: |
| 251 try: | 260 try: |
| 252 internetarchive.download("youtube-%s" % video["id"], files=flist, | 261 internetarchive.download("youtube-%s" % video["id"], files=flist, |
| 253 verbose=True, destdir=output, | 262 verbose=True, |
| 254 no_directory=True, ignore_existing=True, | 263 no_directory=True, ignore_existing=True, |
| 255 retries=9999) | 264 retries=9999) |
| 256 break | 265 break |
| 257 except ConnectTimeout: | 266 except ConnectTimeout: |
| 258 time.sleep(1) | 267 time.sleep(1) |
| 264 # Newer versions of tubeup save only the video ID. | 273 # Newer versions of tubeup save only the video ID. |
| 265 # Account for this by replacing it. | 274 # Account for this by replacing it. |
| 266 # | 275 # |
| 267 # paper/2025-08-30: fixed a bug where video IDs with hyphens | 276 # paper/2025-08-30: fixed a bug where video IDs with hyphens |
| 268 # would incorrectly truncate | 277 # would incorrectly truncate |
| 278 # | |
| 279 # paper/2026-02-27: an update in the IA python library changed | |
| 280 # the way destdir works, so it just gets entirely ignored. | |
| 269 for fname in flist: | 281 for fname in flist: |
| 270 # ignore any files whose names are not simply the ID | 282 def whitelist(s: str, vidid: str) -> bool: |
| 271 if os.path.splitext(fname)[0] != video["id"]: | 283 # special case: .info.json files |
| 284 if s == ("%s.info.json" % vidid): | |
| 285 return ".info.json" | |
| 286 | |
| 287 spli = os.path.splitext(fname) | |
| 288 if spli is None or len(spli) != 2 or spli[0] != vidid: | |
| 289 return None | |
| 290 | |
| 291 return spli[1] | |
| 292 | |
| 293 if not os.path.exists(fname): | |
| 272 continue | 294 continue |
| 273 | 295 |
| 274 if os.path.exists("%s/%s" % (output, fname)): | 296 ext = whitelist(fname, video["id"]) |
| 275 os.replace("%s/%s" % (output, fname), | 297 if ext is None: |
| 276 "%s.%s" % (basename, os.path.splitext(fname))[1]) | 298 continue |
| 299 | |
| 300 os.replace(fname, "%s%s" % (basename, ext)) | |
| 277 return 0 | 301 return 0 |
| 278 | 302 |
| 279 | 303 |
| 280 def ytdlp_dl(video: dict, basename: str, output: str) -> int: | 304 def ytdlp_dl(video: dict, basename: str, output: str) -> int: |
| 281 # intentionally ignores all messages besides errors | 305 # intentionally ignores all messages besides errors |
| 349 | 373 |
| 350 ############################################################################## | 374 ############################################################################## |
| 351 | 375 |
| 352 | 376 |
| 353 def main(): | 377 def main(): |
| 354 # generator; creates a list of files, and returns the parsed form of | |
| 355 # each. note that the parser is not necessarily | |
| 356 def load_split_files(path: str): | 378 def load_split_files(path: str): |
| 357 list_files = [] | 379 def cruft(isdir: bool, listdir, openf): |
| 358 | 380 # build the path list |
| 359 # build the path list | 381 if not isdir: |
| 360 if not os.path.isdir(path): | 382 list_files = [path] |
| 361 list_files.append(path) | 383 else: |
| 362 else: | 384 list_files = filter(lambda x: re.search(r"vids[0-9\-]+?\.json", x), listdir()) |
| 363 for fi in os.listdir(path): | 385 |
| 364 if re.search(r"vids[0-9\-]+?\.json", fi): | 386 # now open each as a json |
| 365 list_files.append(path + "/" + fi) | 387 for fi in list_files: |
| 366 | 388 print(fi) |
| 367 # now open each as a json | 389 with openf(fi, "r") as infile: |
| 368 for fi in list_files: | 390 if simdjson: |
| 369 print(fi) | 391 # Using this is a lot faster in SIMDJSON, since instead |
| 370 with open(fi, "r", encoding="utf-8") as infile: | 392 # of converting all of the JSON key/value pairs into |
| 371 if simdjson: | 393 # native Python objects, they stay in an internal state. |
| 372 # Using this is a lot faster in SIMDJSON, since instead | 394 # |
| 373 # of converting all of the JSON key/value pairs into | 395 # This means we only get the stuff we absolutely need, |
| 374 # native Python objects, they stay in an internal state. | 396 # which is the uploader ID, and copy everything else |
| 375 # | 397 # if the ID is one we are looking for. |
| 376 # This means we only get the stuff we absolutely need, | 398 parser = json.Parser() |
| 377 # which is the uploader ID, and copy everything else | 399 yield parser.parse(infile.read()) |
| 378 # if the ID is one we are looking for. | 400 del parser |
| 379 parser = json.Parser() | 401 else: |
| 380 yield parser.parse(infile.read()) | 402 yield json.load(infile) |
| 381 del parser | 403 |
| 382 else: | 404 |
| 383 yield json.load(infile) | 405 try: |
| 406 if not zipfile_works or os.path.isdir(path): | |
| 407 raise Exception | |
| 408 | |
| 409 with zipfile.ZipFile(path, "r") as myzip: | |
| 410 yield from cruft(True, lambda: myzip.namelist(), lambda f, m: io.TextIOWrapper(myzip.open(f, mode=m), encoding="utf-8")) | |
| 411 except Exception as e: | |
| 412 yield from cruft(os.path.isdir(path), lambda: os.listdir(path), lambda f, m: open(path + "/" + f, m, encoding="utf-8")) | |
| 384 | 413 |
| 385 | 414 |
| 386 def write_metadata(i: dict, basename: str) -> None: | 415 def write_metadata(i: dict, basename: str) -> None: |
| 387 # ehhh | 416 # ehhh |
| 388 if not os.path.exists(basename + ".info.json"): | 417 if not os.path.exists(basename + ".info.json"): |
