comparison channeldownloader.py @ 18:05e71dd6b6ca default tip

no more ia python library
author Paper <paper@tflc.us>
date Sat, 28 Feb 2026 22:31:59 -0500
parents 0d10b2ce0140
children
comparison
equal deleted inserted replaced
17:0d10b2ce0140 18:05e71dd6b6ca
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details. 15 # GNU General Public License for more details.
16 # 16 #
17 # You should have received a copy of the GNU General Public License 17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. 18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20 # Okay, this is a bit of a clusterfuck.
21 #
22 # This originated as a script that simply helped me scrape a bunch
23 of videos off some deleted channels (in fact, that's still its main
24 # purpose) and was very lackluster (hardcoded shite everywhere).
25 # Fortunately in recent times I've cleaned up the code and added some
26 # other mirrors, as well as improved the archive.org scraper to not
27 # shoot itself when it encounters an upload that's not from tubeup.
28 #
29 # Nevertheless, I still consider much of this file to be dirty hacks,
30 # especially some of the HTTP stuff.
19 31
20 """ 32 """
21 Usage: 33 Usage:
22 channeldownloader.py <url>... (--database <file>) 34 channeldownloader.py <url>... (--database <file>)
23 [--output <folder>] 35 [--output <folder>]
40 import docopt 52 import docopt
41 import os 53 import os
42 import re 54 import re
43 import time 55 import time
44 import urllib.request 56 import urllib.request
57 import urllib.parse
45 import os 58 import os
46 import ssl 59 import ssl
47 import io 60 import io
48 import shutil 61 import shutil
49 import xml.etree.ElementTree as XmlET 62 import xml.etree.ElementTree as XmlET
63 import enum
50 from urllib.error import HTTPError 64 from urllib.error import HTTPError
51 from pathlib import Path 65 from pathlib import Path
52 66
53 # We can utilize special simdjson features if it is available 67 # We can utilize special simdjson features if it is available
54 simdjson = False 68 simdjson = False
77 ytdlp_works = True 91 ytdlp_works = True
78 except ImportError: 92 except ImportError:
79 print("failed to import yt-dlp!") 93 print("failed to import yt-dlp!")
80 print("downloading from YouTube directly will not work.") 94 print("downloading from YouTube directly will not work.")
81 95
82 ia_works = False
83
84 try:
85 import internetarchive
86 from requests.exceptions import ConnectTimeout
87 ia_works = True
88 except ImportError:
89 print("failed to import the Internet Archive's python library!")
90 print("downloading from IA will not work.")
91
# Loading databases out of .zip files is optional; if the zipfile module
# is somehow missing we simply disable that feature instead of dying.
zipfile_works = False

try:
    import zipfile
except ImportError:
    print("failed to import zipfile!")
    print("loading the database from a .zip file will not work.")
else:
    # Import succeeded; zip-backed databases are usable.
    zipfile_works = True
104
100 105
101 ############################################################################## 106 ##############################################################################
102 ## DOWNLOADERS 107 ## DOWNLOADERS
103 108
104 # All downloaders should be a function under this signature: 109 # All downloaders should be a function under this signature:
106 # where: 111 # where:
107 # 'video': the .info.json scraped from the YTPMV metadata archive. 112 # 'video': the .info.json scraped from the YTPMV metadata archive.
108 # 'basename': the basename output to write as. 113 # 'basename': the basename output to write as.
109 # 'output': the output directory. 114 # 'output': the output directory.
110 # yes, it's weird, but I don't care ;) 115 # yes, it's weird, but I don't care ;)
111 # 116
class DownloaderStatus(enum.Enum):
    """Result of a single download attempt."""
    # Download finished successfully.
    SUCCESS = 0
    # Download failed.
    # Note that this should NOT be used for when the video is unavailable
    # (i.e. error 404); it should only be used when the video cannot be
    # downloaded *at this time*, indicating a server problem. This is very
    # common for the Internet Archive, not sure about others.
    ERROR = 1
    # Video is unavailable from this provider.
    UNAVAILABLE = 2


def download_file(url: str, path: str, guessext: bool = False, length: typing.Optional[int] = None) -> DownloaderStatus:
    """
    Downloads a file from `url` to `path`, and prints the progress to the
    screen.

    Parameters:
        url: URL to download from.
        path: destination path; missing parent directories are created.
        guessext: if True, guess the file extension (.mp4/.webm) from the
            response's Content-Type header and append it to `path`.
        length: expected size in bytes; when None, it is taken from the
            server's Content-Length header (if present).

    Returns a DownloaderStatus.
    """
    # Download in 32KiB chunks
    CHUNK_SIZE = 32768

    try:
        with urllib.request.urlopen(url) as http:
            if length is None:
                # Check whether the URL gives us Content-Length.
                # This lets us preallocate the file before writing and
                # display how much we've downloaded overall as a percent.
                #
                # (Read via the `headers` mapping rather than getheader()
                # so this also works for non-HTTP handlers, e.g. file://.)
                try:
                    length = int(http.headers.get("Content-Length"))
                except (TypeError, ValueError):
                    # Header missing or malformed; proceed without it.
                    length = None

            if guessext:
                # Guess file extension from MIME type
                mime = http.headers.get("Content-Type")
                if not mime:
                    return DownloaderStatus.ERROR

                if mime == "video/mp4":
                    path += ".mp4"
                elif mime == "video/webm":
                    path += ".webm"
                else:
                    return DownloaderStatus.ERROR

            par = os.path.dirname(path)
            # BUGFIX: guard against an empty dirname (bare filename), which
            # would make os.makedirs() raise.
            if par and not os.path.isdir(par):
                os.makedirs(par)

            with open(path, "wb") as f:
                if length is not None:
                    # Tell the filesystem how much we will be downloading
                    # before we start writing.
                    #
                    # BUGFIX: this used to run *before* the file was opened,
                    # so it always raised NameError (swallowed by a bare
                    # except) and the length info was silently lost.
                    f.truncate(length)

                # Download the entire file.
                # (Progress lines deliberately stay under 79 chars.)
                while True:
                    data = http.read(CHUNK_SIZE)
                    if not data:
                        break

                    f.write(data)
                    print("\r downloading to %s, " % path, end="")
                    if length:
                        print("%.2f%%" % (f.tell() / length * 100.0), end="")
                    else:
                        print("%.2f MiB" % (f.tell() / (1 << 20)), end="")

                print("\r downloaded to %s " % path)

                if length is not None and length != f.tell():
                    # Server lied about what the length was?
                    print(" INFO: HTTP server's Content-Length header lied??")
    except TimeoutError:
        return DownloaderStatus.ERROR
    except HTTPError:
        return DownloaderStatus.UNAVAILABLE
    except Exception as e:
        print(" unknown error downloading video;", e)
        return DownloaderStatus.ERROR

    return DownloaderStatus.SUCCESS
116 201
117 202
# Basic downloader template.
#
# This does a brute-force of all extensions within vexts and iexts
# against the mirror until one of them downloads.
#
# linktemplate is a template to be created using the video ID and
# extension. For example:
#     https://cdn.ytarchiver.com/%s.%s
def basic_dl_template(video: dict, basename: str, output: str,
                      linktemplate: str, vexts: list, iexts: list) -> DownloaderStatus:
    # Try downloading a single (id, extension) combination.
    def basic_dl_impl(vid: str, ext: str) -> DownloaderStatus:
        url = (linktemplate % (vid, ext))
        return download_file(url, "%s.%s" % (basename, ext))

    for exts in [vexts, iexts]:
        # BUGFIX: an empty extension list (e.g. a mirror with no
        # thumbnails) must be skipped. A `for ... else` over an empty
        # iterable runs the `else` branch immediately, which used to
        # misreport a successfully-downloaded video as UNAVAILABLE.
        if not exts:
            continue

        for ext in exts:
            r = basic_dl_impl(video["id"], ext)
            if r == DownloaderStatus.SUCCESS:
                break  # done!
            elif r == DownloaderStatus.ERROR:
                # timeout; try again later?
                return DownloaderStatus.ERROR
            elif r == DownloaderStatus.UNAVAILABLE:
                continue
        else:
            # we did not break out of the loop
            # which means all extensions were unavailable
            return DownloaderStatus.UNAVAILABLE

    # video was downloaded successfully
    return DownloaderStatus.SUCCESS
163 235
164 236
# GhostArchive, basic...
def ghostarchive_dl(video: dict, basename: str, output: str) -> DownloaderStatus:
    """Download from GhostArchive's CDN (video files only, no thumbnails)."""
    video_exts = ["mp4", "webm", "mkv"]
    image_exts = []  # none
    return basic_dl_template(
        video, basename, output,
        "https://ghostvideo.b-cdn.net/chimurai/%s.%s",
        video_exts,
        image_exts,
    )
176 # holds PRIMARILY popular videos (i.e. no niche internet microcelebrities) 248 # holds PRIMARILY popular videos (i.e. no niche internet microcelebrities)
177 # or weeb shit, however it seems to be growing to other stuff. 249 # or weeb shit, however it seems to be growing to other stuff.
178 # 250 #
179 # there isn't really a proper API; I've based the scraping off of the HTML 251 # there isn't really a proper API; I've based the scraping off of the HTML
180 # and the public source code. 252 # and the public source code.
def desirintoplaisir_dl(video: dict, basename: str, output: str) -> DownloaderStatus:
    """Download from LMIJLM/DJ Plaisir's archive via the basic template."""
    template = "https://media.desirintoplaisir.net/content/%s.%s"
    video_exts = ["mp4", "webm", "mkv"]
    image_exts = ["webp"]
    return basic_dl_template(video, basename, output, template,
                             video_exts, image_exts)
192 # URL used here. 264 # URL used here.
193 # 265 #
194 # TODO: Download thumbnails through the CDX API: 266 # TODO: Download thumbnails through the CDX API:
195 # https://github.com/TheTechRobo/youtubevideofinder/blob/master/lostmediafinder/finder.py 267 # https://github.com/TheTechRobo/youtubevideofinder/blob/master/lostmediafinder/finder.py
196 # the CDX API is pretty slow though, so it should be used as a last resort. 268 # the CDX API is pretty slow though, so it should be used as a last resort.
def wayback_dl(video: dict, basename: str, output: str) -> DownloaderStatus:
    """
    Download from the Wayback Machine's fake-URL endpoint.

    The extension is not known up front, so download_file is asked to
    guess it (guessext=True) from the response's Content-Type.
    """
    base = "https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/"
    return download_file(base + video["id"], basename, True)
221 272
222 273
# Matches the filenames tubeup (and similar IA uploaders) produce;
# also captures the video ID for comparison.
#
# BUGFIX: the ID character class used to be [A-z0-9_\-]. The A-z range
# also matches the ASCII punctuation between 'Z' and 'a' (brackets,
# backslash, caret, backtick); YouTube IDs only ever contain
# A-Z, a-z, 0-9, '_' and '-'.
IA_REGEX = re.compile(r"(?:(?P<date>\d{8}) - )?(?P<title>.+?)?(?:-| \[)?(?:(?P<id>[A-Za-z0-9_\-]{11})]?|(?: \((?P<format>(?:(?:(?P<resolution>\d+)p_(?P<fps>\d+)fps_(?P<vcodec>H264)-)?(?P<abitrate>\d+)kbit_(?P<acodec>AAC|Vorbis))|BQ|Description)\)))\.(?P<extension>mp4|info\.json|description|annotations\.xml|webp|mkv|webm|jpg|jpeg|ogg|txt|m4a)$")
225 276
226 277
227 # Internet Archive (tubeup) 278 # Internet Archive (tubeup)
228 def ia_dl(video: dict, basename: str, output: str) -> int: 279 #
280 # NOTE: We don't actually need the python library anymore; we already
281 # explicitly download the file listing using our own logic, so there's
282 # really nothing stopping us from going ahead and downloading everything
283 # else using the download_file function.
284 def ia_dl(video: dict, basename: str, output: str) -> DownloaderStatus:
229 def ia_file_legit(f: str, vidid: str, vidtitle: str) -> bool: 285 def ia_file_legit(f: str, vidid: str, vidtitle: str) -> bool:
230 # FIXME: 286 # FIXME:
231 # 287 #
232 # There are some items on IA that combine the old tubeup behavior 288 # There are some items on IA that combine the old tubeup behavior
233 # (i.e., including the sanitized video name before the ID) 289 # (i.e., including the sanitized video name before the ID)
294 d = ia_xml(identifier) 350 d = ia_xml(identifier)
295 if d is None: 351 if d is None:
296 return None 352 return None
297 353
298 try: 354 try:
355 r = []
356
299 # Now parse the XML and make a list of each original file 357 # Now parse the XML and make a list of each original file
300 return [x.attrib["name"] for x in filter(lambda x: x.attrib["source"] == "original", XmlET.fromstring(d))] 358 for x in filter(lambda x: x.attrib["source"] == "original", XmlET.fromstring(d)):
359 l = {"name": x.attrib["name"]}
360
361 sz = x.find("size")
362 if sz is not None:
363 l["size"] = int(sz.text)
364
365 r.append(l)
366
367 return r
368
301 except Exception as e: 369 except Exception as e:
302 print(e) 370 print(e)
303 return None 371 return None
304 372
305 originalfiles = ia_get_original_files("youtube-%s" % video["id"]) 373 IA_IDENTIFIER = "youtube-%s" % video["id"]
374
375 originalfiles = ia_get_original_files(IA_IDENTIFIER)
306 if not originalfiles: 376 if not originalfiles:
307 return 2 377 return DownloaderStatus.UNAVAILABLE
308 378
309 flist = [ 379 flist = [
310 f 380 f
311 for f in originalfiles 381 for f in originalfiles
312 if ia_file_legit(f, video["id"], video["title"] if not "fulltitle" in video else video["fulltitle"]) 382 if ia_file_legit(f["name"], video["id"], video["title"] if not "fulltitle" in video else video["fulltitle"])
313 ] 383 ]
314 384
315 if not flist: 385 if not flist:
316 return 2 # ?????? 386 return DownloaderStatus.UNAVAILABLE # ??????
317 387
318 while True: 388 for i in flist:
319 try: 389 for _ in range(1, 10):
320 internetarchive.download("youtube-%s" % video["id"], files=flist, 390 path = "%s/%s" % (IA_IDENTIFIER, i["name"])
321 verbose=True, ignore_existing=True, 391 r = download_file("https://archive.org/download/" + urllib.parse.quote(path, encoding="utf-8"), path, False, None if not "size" in i else i["size"])
322 retries=9999) 392 if r == DownloaderStatus.SUCCESS:
323 break 393 break
324 except ConnectTimeout: 394 elif r == DownloaderStatus.ERROR:
325 time.sleep(1) 395 # sleep for a bit and retry
326 continue 396 time.sleep(1.0)
327 except Exception as e: 397 continue
328 print(e) 398 elif r == DownloaderStatus.UNAVAILABLE:
329 return 1 399 return DownloaderStatus.UNAVAILABLE
330 400
331 # Newer versions of tubeup save only the video ID. 401 # Newer versions of tubeup save only the video ID.
332 # Account for this by replacing it. 402 # Account for this by replacing it.
333 # 403 #
334 # paper/2025-08-30: fixed a bug where video IDs with hyphens 404 # paper/2025-08-30: fixed a bug where video IDs with hyphens
335 # would incorrectly truncate 405 # would incorrectly truncate
336 # 406 #
337 # paper/2026-02-27: an update in the IA python library changed 407 # paper/2026-02-27: an update in the IA python library changed
338 # the way destdir works, so it just gets entirely ignored. 408 # the way destdir works, so it just gets entirely ignored.
339 for fname in flist: 409 for f in flist:
340 def getext(s: str, vidid: str) -> typing.Optional[str]: 410 def getext(s: str, vidid: str) -> typing.Optional[str]:
341 # special cases 411 # special cases
342 for i in [".info.json", ".annotations.xml"]: 412 for i in [".info.json", ".annotations.xml"]:
343 if s.endswith(i): 413 if s.endswith(i):
344 return i 414 return i
352 if spli is None or len(spli) != 2: 422 if spli is None or len(spli) != 2:
353 return None 423 return None
354 424
355 return spli[1] 425 return spli[1]
356 426
357 ondisk = "youtube-%s/%s" % (video["id"], fname) 427 ondisk = "youtube-%s/%s" % (video["id"], f["name"])
358 428
359 if not os.path.exists(ondisk): 429 if not os.path.exists(ondisk):
360 continue 430 continue
361 431
362 ext = getext(fname, video["id"]) 432 ext = getext(f["name"], video["id"])
363 if ext is None: 433 if ext is None:
364 continue 434 continue
365 435
366 os.replace(ondisk, "%s%s" % (basename, ext)) 436 os.replace(ondisk, "%s%s" % (basename, ext))
367 437
368 shutil.rmtree("youtube-%s" % video["id"]) 438 shutil.rmtree("youtube-%s" % video["id"])
369 439
370 return 0 440 return DownloaderStatus.SUCCESS
371 441
372 442
373 def ytdlp_dl(video: dict, basename: str, output: str) -> int: 443 def ytdlp_dl(video: dict, basename: str, output: str) -> DownloaderStatus:
374 # intentionally ignores all messages besides errors 444 # intentionally ignores all messages besides errors
375 class MyLogger(object): 445 class MyLogger(object):
376 def debug(self, msg): 446 def debug(self, msg):
377 pass 447 pass
378 448
412 "restrictfilenames": True, 482 "restrictfilenames": True,
413 "no_warnings": True, 483 "no_warnings": True,
414 "progress_hooks": [ytdl_hook], 484 "progress_hooks": [ytdl_hook],
415 "logger": MyLogger(), 485 "logger": MyLogger(),
416 "ignoreerrors": False, 486 "ignoreerrors": False,
417 487 # yummy
418 #mm, output template
419 "outtmpl": output + "/%(title)s-%(id)s.%(ext)s", 488 "outtmpl": output + "/%(title)s-%(id)s.%(ext)s",
420 } 489 }
421 490
422 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: 491 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
423 try: 492 try:
424 ytdl.extract_info("https://youtube.com/watch?v=%s" % video["id"]) 493 ytdl.extract_info("https://youtube.com/watch?v=%s" % video["id"])
425 return 0 494 return DownloaderStatus.SUCCESS
426 except DownloadError: 495 except DownloadError:
427 return 2 496 return DownloaderStatus.UNAVAILABLE
428 except Exception as e: 497 except Exception as e:
429 print(" unknown error downloading video!\n") 498 print(" unknown error downloading video!\n")
430 print(e) 499 print(e)
431 500
432 return 1 501 return DownloaderStatus.ERROR
433 502
434 503
435 # TODO: There are multiple other youtube archival websites available. 504 # TODO: There are multiple other youtube archival websites available.
436 # Most notable is https://findyoutubevideo.thetechrobo.ca . 505 # Most notable is https://findyoutubevideo.thetechrobo.ca .
437 # This combines a lot of sparse youtube archival services, and has 506 # This combines a lot of sparse youtube archival services, and has
565 dls.append({ 634 dls.append({
566 "func": ytdlp_dl, 635 "func": ytdlp_dl,
567 "name": "using yt-dlp", 636 "name": "using yt-dlp",
568 }) 637 })
569 638
570 if ia_works: 639 dls.append({
571 dls.append({ 640 "func": ia_dl,
572 "func": ia_dl, 641 "name": "from the Internet Archive",
573 "name": "from the Internet Archive", 642 })
574 })
575 643
576 dls.append({ 644 dls.append({
577 "func": desirintoplaisir_dl, 645 "func": desirintoplaisir_dl,
578 "name": "from LMIJLM/DJ Plaisir's archive", 646 "name": "from LMIJLM/DJ Plaisir's archive",
579 }) 647 })
587 }) 655 })
588 656
589 for dl in dls: 657 for dl in dls:
590 print(" attempting to download %s" % dl["name"]) 658 print(" attempting to download %s" % dl["name"])
591 r = dl["func"](i, basename, output) 659 r = dl["func"](i, basename, output)
592 if r == 0: 660 if r == DownloaderStatus.SUCCESS:
593 # all good, video's downloaded 661 # all good, video's downloaded
594 return 0 662 return DownloaderStatus.SUCCESS
595 elif r == 2: 663 elif r == DownloaderStatus.UNAVAILABLE:
596 # video is unavailable here 664 # video is unavailable here
597 print(" oops, video is not available there...") 665 print(" oops, video is not available there...")
598 continue 666 continue
599 elif r == 1: 667 elif r == DownloaderStatus.ERROR:
600 # error while downloading; likely temporary. 668 # error while downloading; likely temporary.
601 # TODO we should save which downloader the video 669 # TODO we should save which downloader the video
602 # was on, so we can continue back at it later. 670 # was on, so we can continue back at it later.
603 return 1 671 return DownloaderStatus.ERROR
604 # video is unavailable everywhere 672
605 return 2 673 return DownloaderStatus.UNAVAILABLE
606 674
607 r = dl(i, basename, output) 675 r = dl(i, basename, output)
608 if r == 1: 676 if r == DownloaderStatus.ERROR:
609 continue 677 continue
610 678
611 # video is downloaded, or it's totally unavailable, so 679 # video is downloaded, or it's totally unavailable, so
612 # remove it from being checked again. 680 # remove it from being checked again.
613 videos.remove(i) 681 videos.remove(i)
614 # ... and then dump the metadata, if there isn't any on disk. 682 # ... and then dump the metadata, if there isn't any on disk.
615 write_metadata(i, basename) 683 write_metadata(i, basename)
616 684
617 if r == 0: 685 if r == DownloaderStatus.SUCCESS:
618 # video is downloaded 686 # video is downloaded
619 continue 687 continue
620 688
621 # video is unavailable; write out the metadata. 689 # video is unavailable; write out the metadata.
622 print(" video is unavailable everywhere; dumping out metadata only") 690 print(" video is unavailable everywhere; dumping out metadata only")