comparison channeldownloader.py @ 15:615e1ca0212a

*: add support for loading the split db from a zip file; saves time having to decompress it. Also fixed a couple of bugs here and there (notably with the IA downloading)
author Paper <paper@tflc.us>
date Fri, 27 Feb 2026 17:01:18 -0500
parents 03c8fd4069fb
children 088d9a3a2524
comparison
equal deleted inserted replaced
14:03c8fd4069fb 15:615e1ca0212a
42 import re 42 import re
43 import time 43 import time
44 import urllib.request 44 import urllib.request
45 import os 45 import os
46 import ssl 46 import ssl
47 import io
47 from urllib.error import HTTPError 48 from urllib.error import HTTPError
48 from pathlib import Path 49 from pathlib import Path
49 50
50 # We can utilize special simdjson features if it is available 51 # We can utilize special simdjson features if it is available
51 simdjson = False 52 simdjson = False
83 from requests.exceptions import ConnectTimeout 84 from requests.exceptions import ConnectTimeout
84 ia_works = True 85 ia_works = True
85 except ImportError: 86 except ImportError:
86 print("failed to import the Internet Archive's python library!") 87 print("failed to import the Internet Archive's python library!")
87 print("downloading from IA will not work.") 88 print("downloading from IA will not work.")
89
90 zipfile_works = False
91
92 try:
93 import zipfile
94 zipfile_works = True
95 except ImportError:
96 print("failed to import zipfile!")
97 print("loading the database from a .zip file will not work.")
88 98
89 ############################################################################## 99 ##############################################################################
90 ## DOWNLOADERS 100 ## DOWNLOADERS
91 101
92 # All downloaders should be a function under this signature: 102 # All downloaders should be a function under this signature:
208 return 1 218 return 1
209 219
210 220
211 # Internet Archive (tubeup) 221 # Internet Archive (tubeup)
212 def ia_dl(video: dict, basename: str, output: str) -> int: 222 def ia_dl(video: dict, basename: str, output: str) -> int:
213 def ia_file_legit(file: internetarchive.File, vidid: str) -> bool: 223 def ia_file_legit(f: str, vidid: str) -> bool:
214 # FIXME: 224 # FIXME:
215 # 225 #
216 # There are some items on IA that combine the old tubeup behavior 226 # There are some items on IA that combine the old tubeup behavior
217 # (i.e., including the sanitized video name before the ID) 227 # (i.e., including the sanitized video name before the ID)
218 # and the new tubeup behavior (filename only contains the video ID) 228 # and the new tubeup behavior (filename only contains the video ID)
229 # 239 #
230 We should also check whether the copy on IA is higher quality 240 We should also check whether the copy on IA is higher quality
231 # than a local copy... :) 241 # than a local copy... :)
232 if not re.search(r"((?:.+?-)?" + vidid + r"\.(?:mp4|jpg|webp|mkv|w" 242 if not re.search(r"((?:.+?-)?" + vidid + r"\.(?:mp4|jpg|webp|mkv|w"
233 r"ebm|info\.json|description|annotations.xml))", 243 r"ebm|info\.json|description|annotations.xml))",
234 f.name): 244 f):
235 return False 245 return False
236 246
237 # now, check the metadata
238 print(f)
239 return True 247 return True
240 248
241 249
242 if not internetarchive.get_item("youtube-%s" % video["id"]).exists: 250 if not internetarchive.get_item("youtube-%s" % video["id"]).exists:
243 return 2 251 return 2
245 flist = [ 253 flist = [
246 f.name 254 f.name
247 for f in internetarchive.get_files("youtube-%s" % video["id"]) 255 for f in internetarchive.get_files("youtube-%s" % video["id"])
248 if ia_file_legit(f.name, video["id"]) 256 if ia_file_legit(f.name, video["id"])
249 ] 257 ]
258
250 while True: 259 while True:
251 try: 260 try:
252 internetarchive.download("youtube-%s" % video["id"], files=flist, 261 internetarchive.download("youtube-%s" % video["id"], files=flist,
253 verbose=True, destdir=output, 262 verbose=True,
254 no_directory=True, ignore_existing=True, 263 no_directory=True, ignore_existing=True,
255 retries=9999) 264 retries=9999)
256 break 265 break
257 except ConnectTimeout: 266 except ConnectTimeout:
258 time.sleep(1) 267 time.sleep(1)
264 # Newer versions of tubeup save only the video ID. 273 # Newer versions of tubeup save only the video ID.
265 # Account for this by replacing it. 274 # Account for this by replacing it.
266 # 275 #
267 # paper/2025-08-30: fixed a bug where video IDs with hyphens 276 # paper/2025-08-30: fixed a bug where video IDs with hyphens
268 # would be incorrectly truncated 277
278 #
279 # paper/2026-02-27: an update in the IA python library changed
280 # the way destdir works, so it just gets entirely ignored.
269 for fname in flist: 281 for fname in flist:
270 # ignore any files whose names are not simply the ID 282 def whitelist(s: str, vidid: str) -> bool:
271 if os.path.splitext(fname)[0] != video["id"]: 283 # special case: .info.json files
284 if s == ("%s.info.json" % vidid):
285 return ".info.json"
286
287 spli = os.path.splitext(fname)
288 if spli is None or len(spli) != 2 or spli[0] != vidid:
289 return None
290
291 return spli[1]
292
293 if not os.path.exists(fname):
272 continue 294 continue
273 295
274 if os.path.exists("%s/%s" % (output, fname)): 296 ext = whitelist(fname, video["id"])
275 os.replace("%s/%s" % (output, fname), 297 if ext is None:
276 "%s.%s" % (basename, os.path.splitext(fname))[1]) 298 continue
299
300 os.replace(fname, "%s%s" % (basename, ext))
277 return 0 301 return 0
278 302
279 303
280 def ytdlp_dl(video: dict, basename: str, output: str) -> int: 304 def ytdlp_dl(video: dict, basename: str, output: str) -> int:
281 # intentionally ignores all messages besides errors 305 # intentionally ignores all messages besides errors
349 373
350 ############################################################################## 374 ##############################################################################
351 375
352 376
353 def main(): 377 def main():
354 # generator; creates a list of files, and returns the parsed form of
355 # each. note that the parser is not necessarily
356 def load_split_files(path: str): 378 def load_split_files(path: str):
357 list_files = [] 379 def cruft(isdir: bool, listdir, openf):
358 380 # build the path list
359 # build the path list 381 if not isdir:
360 if not os.path.isdir(path): 382 list_files = [path]
361 list_files.append(path) 383 else:
362 else: 384 list_files = filter(lambda x: re.search(r"vids[0-9\-]+?\.json", x), listdir())
363 for fi in os.listdir(path): 385
364 if re.search(r"vids[0-9\-]+?\.json", fi): 386 # now open each as a json
365 list_files.append(path + "/" + fi) 387 for fi in list_files:
366 388 print(fi)
367 # now open each as a json 389 with openf(fi, "r") as infile:
368 for fi in list_files: 390 if simdjson:
369 print(fi) 391 # Using this is a lot faster in SIMDJSON, since instead
370 with open(fi, "r", encoding="utf-8") as infile: 392 # of converting all of the JSON key/value pairs into
371 if simdjson: 393 # native Python objects, they stay in an internal state.
372 # Using this is a lot faster in SIMDJSON, since instead 394 #
373 # of converting all of the JSON key/value pairs into 395 # This means we only get the stuff we absolutely need,
374 # native Python objects, they stay in an internal state. 396 # which is the uploader ID, and copy everything else
375 # 397 # if the ID is one we are looking for.
376 # This means we only get the stuff we absolutely need, 398 parser = json.Parser()
377 # which is the uploader ID, and copy everything else 399 yield parser.parse(infile.read())
378 # if the ID is one we are looking for. 400 del parser
379 parser = json.Parser() 401 else:
380 yield parser.parse(infile.read()) 402 yield json.load(infile)
381 del parser 403
382 else: 404
383 yield json.load(infile) 405 try:
406 if not zipfile_works or os.path.isdir(path):
407 raise Exception
408
409 with zipfile.ZipFile(path, "r") as myzip:
410 yield from cruft(True, lambda: myzip.namelist(), lambda f, m: io.TextIOWrapper(myzip.open(f, mode=m), encoding="utf-8"))
411 except Exception as e:
412 yield from cruft(os.path.isdir(path), lambda: os.listdir(path), lambda f, m: open(path + "/" + f, m, encoding="utf-8"))
384 413
385 414
386 def write_metadata(i: dict, basename: str) -> None: 415 def write_metadata(i: dict, basename: str) -> None:
387 # ehhh 416 # ehhh
388 if not os.path.exists(basename + ".info.json"): 417 if not os.path.exists(basename + ".info.json"):