comparison channeldownloader.py @ 18:05e71dd6b6ca default tip

no more ia python library
author Paper <paper@tflc.us>
date Sat, 28 Feb 2026 22:31:59 -0500
parents 0d10b2ce0140
children
comparison
equal deleted inserted replaced
17:0d10b2ce0140 18:05e71dd6b6ca
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details. 15 # GNU General Public License for more details.
16 # 16 #
17 # You should have received a copy of the GNU General Public License 17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. 18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20 # Okay, this is a bit of a clusterfuck.
21 #
22 # This originated as a script that simply helped me scrape a bunch
23 of videos off some deleted channels (in fact, that's still its main
24 # purpose) and was very lackluster (hardcoded shite everywhere).
25 # Fortunately in recent times I've cleaned up the code and added some
26 # other mirrors, as well as improved the archive.org scraper to not
27 # shoot itself when it encounters an upload that's not from tubeup.
28 #
29 # Nevertheless, I still consider much of this file to be dirty hacks,
30 # especially some of the HTTP stuff.
19 31
20 """ 32 """
21 Usage: 33 Usage:
22 channeldownloader.py <url>... (--database <file>) 34 channeldownloader.py <url>... (--database <file>)
23 [--output <folder>] 35 [--output <folder>]
40 import docopt 52 import docopt
41 import os 53 import os
42 import re 54 import re
43 import time 55 import time
44 import urllib.request 56 import urllib.request
57 import urllib.parse
45 import os 58 import os
46 import ssl 59 import ssl
47 import io 60 import io
48 import shutil 61 import shutil
49 import xml.etree.ElementTree as XmlET 62 import xml.etree.ElementTree as XmlET
63 import enum
50 from urllib.error import HTTPError 64 from urllib.error import HTTPError
51 from pathlib import Path 65 from pathlib import Path
52 66
53 # We can utilize special simdjson features if it is available 67 # We can utilize special simdjson features if it is available
54 simdjson = False 68 simdjson = False
77 ytdlp_works = True 91 ytdlp_works = True
78 except ImportError: 92 except ImportError:
79 print("failed to import yt-dlp!") 93 print("failed to import yt-dlp!")
80 print("downloading from YouTube directly will not work.") 94 print("downloading from YouTube directly will not work.")
81 95
82 ia_works = False
83
84 try:
85 import internetarchive
86 from requests.exceptions import ConnectTimeout
87 ia_works = True
88 except ImportError:
89 print("failed to import the Internet Archive's python library!")
90 print("downloading from IA will not work.")
91
# Loading databases out of .zip files is optional; if the zipfile module
# is somehow missing we simply disable that feature instead of dying.
zipfile_works = False

try:
    import zipfile
except ImportError:
    print("failed to import zipfile!")
    print("loading the database from a .zip file will not work.")
else:
    # Import succeeded; zip-backed databases are usable.
    zipfile_works = True
104
100 105
101 ############################################################################## 106 ##############################################################################
102 ## DOWNLOADERS 107 ## DOWNLOADERS
103 108
104 # All downloaders should be a function under this signature: 109 # All downloaders should be a function under this signature:
106 # where: 111 # where:
107 # 'video': the .info.json scraped from the YTPMV metadata archive. 112 # 'video': the .info.json scraped from the YTPMV metadata archive.
108 # 'basename': the basename output to write as. 113 # 'basename': the basename output to write as.
109 # 'output': the output directory. 114 # 'output': the output directory.
110 # yes, it's weird, but I don't care ;) 115 # yes, it's weird, but I don't care ;)
111 # 116
class DownloaderStatus(enum.Enum):
    """Result of a single download attempt."""
    # Download finished successfully.
    SUCCESS = 0
    # Download failed.
    # Note that this should NOT be used for when the video is unavailable
    # (i.e. error 404); it should only be used when the video cannot be
    # downloaded *at this time*, indicating a server problem. This is very
    # common for the Internet Archive, not sure about others.
    ERROR = 1
    # Video is unavailable from this provider.
    UNAVAILABLE = 2


def download_file(url: str, path: str, guessext: bool = False, length: typing.Optional[int] = None) -> DownloaderStatus:
    """
    Downloads a file from `url` to `path`, and prints the progress to the
    screen.

    Parameters:
        url: URL to download from.
        path: destination path; missing parent directories are created.
        guessext: if True, guess the file extension (.mp4/.webm) from the
            response's Content-Type header and append it to `path`.
        length: expected size in bytes; when None, it is taken from the
            server's Content-Length header (if present).

    Returns a DownloaderStatus.
    """
    # Download in 32KiB chunks
    CHUNK_SIZE = 32768

    try:
        with urllib.request.urlopen(url) as http:
            if length is None:
                # Check whether the URL gives us Content-Length.
                # This lets us preallocate the file before writing and
                # display how much we've downloaded overall as a percent.
                #
                # (Read via the `headers` mapping rather than getheader()
                # so this also works for non-HTTP handlers, e.g. file://.)
                try:
                    length = int(http.headers.get("Content-Length"))
                except (TypeError, ValueError):
                    # Header missing or malformed; proceed without it.
                    length = None

            if guessext:
                # Guess file extension from MIME type
                mime = http.headers.get("Content-Type")
                if not mime:
                    return DownloaderStatus.ERROR

                if mime == "video/mp4":
                    path += ".mp4"
                elif mime == "video/webm":
                    path += ".webm"
                else:
                    return DownloaderStatus.ERROR

            par = os.path.dirname(path)
            # BUGFIX: guard against an empty dirname (bare filename), which
            # would make os.makedirs() raise.
            if par and not os.path.isdir(par):
                os.makedirs(par)

            with open(path, "wb") as f:
                if length is not None:
                    # Tell the filesystem how much we will be downloading
                    # before we start writing.
                    #
                    # BUGFIX: this used to run *before* the file was opened,
                    # so it always raised NameError (swallowed by a bare
                    # except) and the length info was silently lost.
                    f.truncate(length)

                # Download the entire file.
                # (Progress lines deliberately stay under 79 chars.)
                while True:
                    data = http.read(CHUNK_SIZE)
                    if not data:
                        break

                    f.write(data)
                    print("\r downloading to %s, " % path, end="")
                    if length:
                        print("%.2f%%" % (f.tell() / length * 100.0), end="")
                    else:
                        print("%.2f MiB" % (f.tell() / (1 << 20)), end="")

                print("\r downloaded to %s " % path)

                if length is not None and length != f.tell():
                    # Server lied about what the length was?
                    print(" INFO: HTTP server's Content-Length header lied??")
    except TimeoutError:
        return DownloaderStatus.ERROR
    except HTTPError:
        return DownloaderStatus.UNAVAILABLE
    except Exception as e:
        print(" unknown error downloading video;", e)
        return DownloaderStatus.ERROR

    return DownloaderStatus.SUCCESS
116 201
117 202
# Basic downloader template.
#
# This does a brute-force of all extensions within vexts and iexts
# against the mirror until one of them downloads.
#
# linktemplate is a template to be created using the video ID and
# extension. For example:
#     https://cdn.ytarchiver.com/%s.%s
def basic_dl_template(video: dict, basename: str, output: str,
                      linktemplate: str, vexts: list, iexts: list) -> DownloaderStatus:
    # Try downloading a single (id, extension) combination.
    def basic_dl_impl(vid: str, ext: str) -> DownloaderStatus:
        url = (linktemplate % (vid, ext))
        return download_file(url, "%s.%s" % (basename, ext))

    for exts in [vexts, iexts]:
        # BUGFIX: an empty extension list (e.g. a mirror with no
        # thumbnails) must be skipped. A `for ... else` over an empty
        # iterable runs the `else` branch immediately, which used to
        # misreport a successfully-downloaded video as UNAVAILABLE.
        if not exts:
            continue

        for ext in exts:
            r = basic_dl_impl(video["id"], ext)
            if r == DownloaderStatus.SUCCESS:
                break  # done!
            elif r == DownloaderStatus.ERROR:
                # timeout; try again later?
                return DownloaderStatus.ERROR
            elif r == DownloaderStatus.UNAVAILABLE:
                continue
        else:
            # we did not break out of the loop
            # which means all extensions were unavailable
            return DownloaderStatus.UNAVAILABLE

    # video was downloaded successfully
    return DownloaderStatus.SUCCESS
163 235
164 236
# GhostArchive, basic...
def ghostarchive_dl(video: dict, basename: str, output: str) -> DownloaderStatus:
    """Download from GhostArchive's CDN (video files only, no thumbnails)."""
    video_exts = ["mp4", "webm", "mkv"]
    image_exts = []  # none
    return basic_dl_template(
        video, basename, output,
        "https://ghostvideo.b-cdn.net/chimurai/%s.%s",
        video_exts,
        image_exts,
    )
176 # holds PRIMARILY popular videos (i.e. no niche internet microcelebrities) 248 # holds PRIMARILY popular videos (i.e. no niche internet microcelebrities)
177 # or weeb shit, however it seems to be growing to other stuff. 249 # or weeb shit, however it seems to be growing to other stuff.
178 # 250 #
179 # there isn't really a proper API; I've based the scraping off of the HTML 251 # there isn't really a proper API; I've based the scraping off of the HTML
180 # and the public source code. 252 # and the public source code.
def desirintoplaisir_dl(video: dict, basename: str, output: str) -> DownloaderStatus:
    """Download from LMIJLM/DJ Plaisir's archive via the basic template."""
    template = "https://media.desirintoplaisir.net/content/%s.%s"
    video_exts = ["mp4", "webm", "mkv"]
    image_exts = ["webp"]
    return basic_dl_template(video, basename, output, template,
                             video_exts, image_exts)
192 # URL used here. 264 # URL used here.
193 # 265 #
194 # TODO: Download thumbnails through the CDX API: 266 # TODO: Download thumbnails through the CDX API:
195 # https://github.com/TheTechRobo/youtubevideofinder/blob/master/lostmediafinder/finder.py 267 # https://github.com/TheTechRobo/youtubevideofinder/blob/master/lostmediafinder/finder.py
196 # the CDX API is pretty slow though, so it should be used as a last resort. 268 # the CDX API is pretty slow though, so it should be used as a last resort.
def wayback_dl(video: dict, basename: str, output: str) -> DownloaderStatus:
    """
    Download from the Wayback Machine's fake-URL endpoint.

    The extension is not known up front, so download_file is asked to
    guess it (guessext=True) from the response's Content-Type.
    """
    base = "https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/"
    return download_file(base + video["id"], basename, True)
221 272
222 273
# Matches the filenames tubeup (and similar IA uploaders) produce;
# also captures the video ID for comparison.
#
# BUGFIX: the ID character class used to be [A-z0-9_\-]. The A-z range
# also matches the ASCII punctuation between 'Z' and 'a' (brackets,
# backslash, caret, backtick); YouTube IDs only ever contain
# A-Z, a-z, 0-9, '_' and '-'.
IA_REGEX = re.compile(r"(?:(?P<date>\d{8}) - )?(?P<title>.+?)?(?:-| \[)?(?:(?P<id>[A-Za-z0-9_\-]{11})]?|(?: \((?P<format>(?:(?:(?P<resolution>\d+)p_(?P<fps>\d+)fps_(?P<vcodec>H264)-)?(?P<abitrate>\d+)kbit_(?P<acodec>AAC|Vorbis))|BQ|Description)\)))\.(?P<extension>mp4|info\.json|description|annotations\.xml|webp|mkv|webm|jpg|jpeg|ogg|txt|m4a)$")
225 276
226 277
227 # Internet Archive (tubeup) 278 # Internet Archive (tubeup)
228 def ia_dl(video: dict, basename: str, output: str) -> int: 279 #
280 # NOTE: We don't actually need the python library anymore; we already
281 # explicitly download the file listing using our own logic, so there's
282 # really nothing stopping us from going ahead and downloading everything
283 # else using the download_file function.
284 def ia_dl(video: dict, basename: str, output: str) -> DownloaderStatus:
229 def ia_file_legit(f: str, vidid: str, vidtitle: str) -> bool: 285 def ia_file_legit(f: str, vidid: str, vidtitle: str) -> bool:
230 # FIXME: 286 # FIXME:
231 # 287 #
232 # There are some items on IA that combine the old tubeup behavior 288 # There are some items on IA that combine the old tubeup behavior
233 # (i.e., including the sanitized video name before the ID) 289 # (i.e., including the sanitized video name before the ID)
294 d = ia_xml(identifier) 350 d = ia_xml(identifier)
295 if d is None: 351 if d is None:
296 return None 352 return None
297 353
298 try: 354 try:
355 r = []
356
299 # Now parse the XML and make a list of each original file 357 # Now parse the XML and make a list of each original file
300 return [x.attrib["name"] for x in filter(lambda x: x.attrib["source"] == "original", XmlET.fromstring(d))] 358 for x in filter(lambda x: x.attrib["source"] == "original", XmlET.fromstring(d)):
359 l = {"name": x.attrib["name"]}
360
361 sz = x.find("size")
362 if sz is not None:
363 l["size"] = int(sz.text)
364
365 r.append(l)
366
367 return r
368
301 except Exception as e: 369 except Exception as e:
302 print(e) 370 print(e)
303 return None 371 return None
304 372
305 originalfiles = ia_get_original_files("youtube-%s" % video["id"]) 373 IA_IDENTIFIER = "youtube-%s" % video["id"]
374
375 originalfiles = ia_get_original_files(IA_IDENTIFIER)
306 if not originalfiles: 376 if not originalfiles:
307 return 2 377 return DownloaderStatus.UNAVAILABLE
308 378
309 flist = [ 379 flist = [
310 f 380 f
311 for f in originalfiles 381 for f in originalfiles
312 if ia_file_legit(f, video["id"], video["title"] if not "fulltitle" in video else video["fulltitle"]) 382 if ia_file_legit(f["name"], video["id"], video["title"] if not "fulltitle" in video else video["fulltitle"])
313 ] 383 ]
314 384
315 if not flist: 385 if not flist:
316 return 2 # ?????? 386 return DownloaderStatus.UNAVAILABLE # ??????
317 387
318 while True: 388 for i in flist:
319 try: 389 for _ in range(1, 10):
320 internetarchive.download("youtube-%s" % video["id"], files=flist, 390 path = "%s/%s" % (IA_IDENTIFIER, i["name"])
321 verbose=True, ignore_existing=True, 391 r = download_file("https://archive.org/download/" + urllib.parse.quote(path, encoding="utf-8"), path, False, None if not "size" in i else i["size"])
322 retries=9999) 392 if r == DownloaderStatus.SUCCESS:
323 break 393 break
324 except ConnectTimeout: 394 elif r == DownloaderStatus.ERROR:
325 time.sleep(1) 395 # sleep for a bit and retry
326 continue 396 time.sleep(1.0)
327 except Exception as e: 397 continue
328 print(e) 398 elif r == DownloaderStatus.UNAVAILABLE:
329 return 1 399 return DownloaderStatus.UNAVAILABLE
330 400
331 # Newer versions of tubeup save only the video ID. 401 # Newer versions of tubeup save only the video ID.
332 # Account for this by replacing it. 402 # Account for this by replacing it.
333 # 403 #
334 # paper/2025-08-30: fixed a bug where video IDs with hyphens 404 # paper/2025-08-30: fixed a bug where video IDs with hyphens
335 # would incorrectly truncate 405 # would incorrectly truncate
336 # 406 #
337 # paper/2026-02-27: an update in the IA python library changed 407 # paper/2026-02-27: an update in the IA python library changed
338 # the way destdir works, so it just gets entirely ignored. 408 # the way destdir works, so it just gets entirely ignored.
339 for fname in flist: 409 for f in flist:
340 def getext(s: str, vidid: str) -> typing.Optional[str]: 410 def getext(s: str, vidid: str) -> typing.Optional[str]:
341 # special cases 411 # special cases
342 for i in [".info.json", ".annotations.xml"]: 412 for i in [".info.json", ".annotations.xml"]:
343 if s.endswith(i): 413 if s.endswith(i):
344 return i 414 return i
352 if spli is None or len(spli) != 2: 422 if spli is None or len(spli) != 2:
353 return None 423 return None
354 424
355 return spli[1] 425 return spli[1]
356 426
357 ondisk = "youtube-%s/%s" % (video["id"], fname) 427 ondisk = "youtube-%s/%s" % (video["id"], f["name"])
358 428
359 if not os.path.exists(ondisk): 429 if not os.path.exists(ondisk):
360 continue 430 continue
361 431
362 ext = getext(fname, video["id"]) 432 ext = getext(f["name"], video["id"])
363 if ext is None: 433 if ext is None:
364 continue 434 continue
365 435
366 os.replace(ondisk, "%s%s" % (basename, ext)) 436 os.replace(ondisk, "%s%s" % (basename, ext))
367 437
368 shutil.rmtree("youtube-%s" % video["id"]) 438 shutil.rmtree("youtube-%s" % video["id"])
369 439
370 return 0 440 return DownloaderStatus.SUCCESS
371 441
372 442
373 def ytdlp_dl(video: dict, basename: str, output: str) -> int: 443 def ytdlp_dl(video: dict, basename: str, output: str) -> DownloaderStatus:
374 # intentionally ignores all messages besides errors 444 # intentionally ignores all messages besides errors
375 class MyLogger(object): 445 class MyLogger(object):
376 def debug(self, msg): 446 def debug(self, msg):
377 pass 447 pass
378 448
412 "restrictfilenames": True, 482 "restrictfilenames": True,
413 "no_warnings": True, 483 "no_warnings": True,
414 "progress_hooks": [ytdl_hook], 484 "progress_hooks": [ytdl_hook],
415 "logger": MyLogger(), 485 "logger": MyLogger(),
416 "ignoreerrors": False, 486 "ignoreerrors": False,
417 487 # yummy
418 #mm, output template
419 "outtmpl": output + "/%(title)s-%(id)s.%(ext)s", 488 "outtmpl": output + "/%(title)s-%(id)s.%(ext)s",
420 } 489 }
421 490
422 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: 491 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
423 try: 492 try:
424 ytdl.extract_info("https://youtube.com/watch?v=%s" % video["id"]) 493 ytdl.extract_info("https://youtube.com/watch?v=%s" % video["id"])
425 return 0 494 return DownloaderStatus.SUCCESS
426 except DownloadError: 495 except DownloadError:
427 return 2 496 return DownloaderStatus.UNAVAILABLE
428 except Exception as e: 497 except Exception as e:
429 print(" unknown error downloading video!\n") 498 print(" unknown error downloading video!\n")
430 print(e) 499 print(e)
431 500
432 return 1 501 return DownloaderStatus.ERROR
433 502
434 503
435 # TODO: There are multiple other youtube archival websites available. 504 # TODO: There are multiple other youtube archival websites available.
436 # Most notable is https://findyoutubevideo.thetechrobo.ca . 505 # Most notable is https://findyoutubevideo.thetechrobo.ca .
437 # This combines a lot of sparse youtube archival services, and has 506 # This combines a lot of sparse youtube archival services, and has
565 dls.append({ 634 dls.append({
566 "func": ytdlp_dl, 635 "func": ytdlp_dl,
567 "name": "using yt-dlp", 636 "name": "using yt-dlp",
568 }) 637 })
569 638
570 if ia_works: 639 dls.append({
571 dls.append({ 640 "func": ia_dl,
572 "func": ia_dl, 641 "name": "from the Internet Archive",
573 "name": "from the Internet Archive", 642 })
574 })
575 643
576 dls.append({ 644 dls.append({
577 "func": desirintoplaisir_dl, 645 "func": desirintoplaisir_dl,
578 "name": "from LMIJLM/DJ Plaisir's archive", 646 "name": "from LMIJLM/DJ Plaisir's archive",
579 }) 647 })
587 }) 655 })
588 656
589 for dl in dls: 657 for dl in dls:
590 print(" attempting to download %s" % dl["name"]) 658 print(" attempting to download %s" % dl["name"])
591 r = dl["func"](i, basename, output) 659 r = dl["func"](i, basename, output)
592 if r == 0: 660 if r == DownloaderStatus.SUCCESS:
593 # all good, video's downloaded 661 # all good, video's downloaded
594 return 0 662 return DownloaderStatus.SUCCESS
595 elif r == 2: 663 elif r == DownloaderStatus.UNAVAILABLE:
596 # video is unavailable here 664 # video is unavailable here
597 print(" oops, video is not available there...") 665 print(" oops, video is not available there...")
598 continue 666 continue
599 elif r == 1: 667 elif r == DownloaderStatus.ERROR:
600 # error while downloading; likely temporary. 668 # error while downloading; likely temporary.
601 # TODO we should save which downloader the video 669 # TODO we should save which downloader the video
602 # was on, so we can continue back at it later. 670 # was on, so we can continue back at it later.
603 return 1 671 return DownloaderStatus.ERROR
604 # video is unavailable everywhere 672
605 return 2 673 return DownloaderStatus.UNAVAILABLE
606 674
607 r = dl(i, basename, output) 675 r = dl(i, basename, output)
608 if r == 1: 676 if r == DownloaderStatus.ERROR:
609 continue 677 continue
610 678
611 # video is downloaded, or it's totally unavailable, so 679 # video is downloaded, or it's totally unavailable, so
612 # remove it from being checked again. 680 # remove it from being checked again.
613 videos.remove(i) 681 videos.remove(i)
614 # ... and then dump the metadata, if there isn't any on disk. 682 # ... and then dump the metadata, if there isn't any on disk.
615 write_metadata(i, basename) 683 write_metadata(i, basename)
616 684
617 if r == 0: 685 if r == DownloaderStatus.SUCCESS:
618 # video is downloaded 686 # video is downloaded
619 continue 687 continue
620 688
621 # video is unavailable; write out the metadata. 689 # video is unavailable; write out the metadata.
622 print(" video is unavailable everywhere; dumping out metadata only") 690 print(" video is unavailable everywhere; dumping out metadata only")