Mercurial > codedump
comparison channeldownloader.py @ 118:eac6dae753ca
*: major cleanup
committer: GitHub <noreply@github.com>
author | Paper <37962225+mrpapersonic@users.noreply.github.com> |
---|---|
date | Fri, 03 Mar 2023 22:51:28 +0000 |
parents | 80bd4a99ea00 |
children | 196cf2e3d96e |
comparison
equal
deleted
inserted
replaced
117:40a7b6d9bd3b | 118:eac6dae753ca |
---|---|
23 except ImportError: | 23 except ImportError: |
24 import json | 24 import json |
25 import os | 25 import os |
26 import re | 26 import re |
27 import time | 27 import time |
28 try: | 28 import urllib.request |
29 import urllib.request as compat_urllib | 29 import requests # need this for ONE (1) exception |
30 from urllib.error import HTTPError | 30 import yt_dlp as youtube_dl |
31 except ImportError: # Python 2 | 31 from urllib.error import HTTPError |
32 import urllib as compat_urllib | 32 from yt_dlp.utils import sanitize_filename, DownloadError |
33 from urllib2 import HTTPError | 33 from pathlib import Path |
34 try: | 34 from requests.exceptions import ConnectTimeout |
35 import yt_dlp as youtube_dl | 35 |
36 from yt_dlp.utils import sanitize_filename, DownloadError | |
37 except ImportError: | |
38 try: | |
39 import youtube_dl | |
40 from youtube_dl.utils import sanitize_filename, DownloadError | |
41 except ImportError: | |
42 print("ERROR: youtube-dl/yt-dlp not installed!") | |
43 exit(1) | |
44 from io import open # for Python 2 compatibility, in Python 3 this | |
45 # just maps to the built-in function | |
46 | 36 |
47 class MyLogger(object): | 37 class MyLogger(object): |
    def debug(self, msg):
        # Intentionally silent: suppress yt-dlp's debug chatter so only the
        # script's own progress lines reach the console.
        pass
50 | 40 |
53 | 43 |
    def error(self, msg):
        # Surface yt-dlp error messages, indented to match our output style.
        print(" " + msg)
        pass  # NOTE(review): redundant after print — kept as-is
57 | 47 |
def ytdl_hook(d) -> None:
    """Progress hook handed to yt-dlp via ytdl_opts["progress_hooks"].

    Prints a one-line progress message for the "finished", "downloading"
    and "error" states; any other status is ignored.
    """
    status = d["status"]
    if status not in ("finished", "downloading", "error"):
        return
    name = os.path.basename(d["filename"])
    if status == "finished":
        print(" downloaded %s: 100%% " % (name,))
    elif status == "downloading":
        # carriage return (no newline) keeps the percentage on one line
        print(" downloading %s: %s\r" % (name, d["_percent_str"]), end="")
    elif status == "error":
        print("\n an error occurred downloading %s!" % (name,))
66 def load_split_files(path): | 58 |
59 | |
60 def load_split_files(path: str) -> dict: | |
67 if os.path.isdir(path): | 61 if os.path.isdir(path): |
68 result = {"videos": []} | 62 result = {"videos": []} |
69 for fi in os.listdir(path): | 63 for fi in os.listdir(path): |
70 for f in re.findall(r"vids[0-9\-]+?\.json", fi): | 64 for f in re.findall(r"vids[0-9\-]+?\.json", fi): |
71 with open(path + "/" + f, "r", encoding="utf-8") as infile: | 65 with open(path + "/" + f, "r", encoding="utf-8") as infile: |
73 result["videos"].extend(jsonnn) | 67 result["videos"].extend(jsonnn) |
74 return result | 68 return result |
75 else: | 69 else: |
76 return json.loads(open(path, "r", encoding="utf-8").read()) | 70 return json.loads(open(path, "r", encoding="utf-8").read()) |
77 | 71 |
def reporthook(count: int, block_size: int, total_size: int) -> None:
    """Progress callback for urllib.request.urlretrieve.

    Prints an in-place download percentage.  The first call (count == 0)
    only records the transfer start time in the module-global start_time.

    count: number of blocks transferred so far
    block_size: size of each block in bytes
    total_size: total size in bytes, or <= 0 when the server did not
        send a Content-Length (urlretrieve passes -1 in that case)
    """
    global start_time
    if count == 0:
        start_time = time.time()
        return
    if total_size <= 0:
        # Unknown content length: a percentage cannot be computed
        # (previously this divided by zero / printed a negative percent).
        return
    percent = int(count * block_size * 100 / total_size)
    print(" downloading %d%% \r" % (percent), end="")
86 | 80 |
87 args = docopt.docopt(__doc__) | 81 |
def write_metadata(i: dict, basename: str) -> None:
    """Write the .info.json and .description sidecar files for a video.

    i: the video's metadata dict from the database; must contain a
       "description" key for the .description file to be written.
    basename: output path prefix (directory + sanitized title-id stem).

    Existing files are never overwritten.
    """
    if not os.path.exists(basename + ".info.json"):
        with open(basename + ".info.json", "w", encoding="utf-8") as jsonfile:
            # json.dumps always returns str on Python 3, so write it
            # directly; the old `.decode("utf-8")` / AttributeError
            # fallback was a Python 2 leftover that always raised.
            jsonfile.write(json.dumps(i))
        print(" saved %s" % os.path.basename(jsonfile.name))
    if not os.path.exists(basename + ".description"):
        with open(basename + ".description", "w",
                  encoding="utf-8") as descfile:
            descfile.write(i["description"])
        print(" saved %s" % os.path.basename(descfile.name))
95 | |
88 | 96 |
89 ytdl_opts = { | 97 ytdl_opts = { |
90 "retries": 100, | 98 "retries": 100, |
91 "nooverwrites": True, | 99 "nooverwrites": True, |
92 "call_home": False, | 100 "call_home": False, |
95 "writedescription": True, | 103 "writedescription": True, |
96 "writethumbnail": True, | 104 "writethumbnail": True, |
97 "writeannotations": True, | 105 "writeannotations": True, |
98 "writesubtitles": True, | 106 "writesubtitles": True, |
99 "allsubtitles": True, | 107 "allsubtitles": True, |
100 "ignoreerrors": True, | |
101 "addmetadata": True, | 108 "addmetadata": True, |
102 "continuedl": True, | 109 "continuedl": True, |
103 "embedthumbnail": True, | 110 "embedthumbnail": True, |
104 "format": "bestvideo+bestaudio/best", | 111 "format": "bestvideo+bestaudio/best", |
105 "restrictfilenames": True, | 112 "restrictfilenames": True, |
107 "progress_hooks": [ytdl_hook], | 114 "progress_hooks": [ytdl_hook], |
108 "logger": MyLogger(), | 115 "logger": MyLogger(), |
109 "ignoreerrors": False, | 116 "ignoreerrors": False, |
110 } | 117 } |
111 | 118 |
112 if not os.path.exists(args["--output"]): | 119 |
def wayback_machine_dl(video: dict, basename: str) -> int:
    """Download a video via the Wayback Machine's fake-URL redirect.

    video: metadata dict; only video["id"] is used.
    basename: output path prefix; the extension is chosen from the
        response Content-Type.

    Returns 0 when the caller should stop (success, not archived, or an
    unknown error) and 1 when it should retry (timeout).
    """
    try:
        url = ''.join(["https://web.archive.org/web/2oe_/http://wayback-fakeu",
                       "rl.archive.org/yt/%s"])
        # Probe first: the Content-Type tells us which container the
        # archived copy uses.  Close the response instead of leaking it.
        with urllib.request.urlopen(url % video["id"]) as headers:
            contenttype = headers.getheader("Content-Type")
        if contenttype == "video/webm":
            ext = "webm"
        elif contenttype == "video/mp4":
            ext = "mp4"
        else:
            # Not a video payload at all — treat it like a 404.
            raise HTTPError(url=None, code=None, msg=None,
                            hdrs=None, fp=None)
        urllib.request.urlretrieve(url % video["id"],
                                   "%s.%s" % (basename, ext), reporthook)
        print(" downloaded %s.%s" % (basename, ext))
        return 0
    except TimeoutError:
        return 1  # transient: let the caller sleep and retry
    except HTTPError:
        print(" video not available on the Wayback Machine!")
        return 0
    except Exception as e:
        print(" unknown error downloading video!\n")
        print(e)
        return 0
146 | |
def internet_archive_dl(video: dict, basename: str) -> int:
    """Download a video's files from its Internet Archive item, if any.

    video: metadata dict; only video["id"] is used.
    basename: output path prefix; files are downloaded into its
        directory, and bare `<id>.*` files are renamed to carry the
        sanitized-title prefix taken from basename.

    Returns 1 when the item existed and files were fetched, 0 otherwise.
    """
    if not internetarchive.get_item("youtube-%s" % video["id"]).exists:
        return 0
    # FIX: this previously used `output`, a local variable of main(),
    # which is not visible here (NameError at runtime).
    destdir = os.path.dirname(basename)
    # FIX: `info\\.json` in a raw string matched a literal backslash;
    # also escape the dot in `annotations.xml`.  Compile once.
    pattern = re.compile(''.join([r"((?:.+?-)?", video["id"],
                                  r"\.(?:mp4|jpg|webp|mkv|webm|info\.json|des",
                                  r"cription|annotations\.xml))"]))
    flist = [name for name in
             (f.name for f in
              internetarchive.get_files("youtube-%s" % video["id"]))
             if pattern.search(name)]
    if not flist:
        # FIX: without this guard, files=[] asked internetarchive for the
        # whole item and flist[0] below raised IndexError.
        return 0
    while True:
        try:
            internetarchive.download("youtube-%s" % video["id"],
                                     files=flist, verbose=True,
                                     destdir=destdir,
                                     no_directory=True,
                                     ignore_existing=True,
                                     retries=9999)
            break
        except ConnectTimeout:
            continue  # keep retrying on connect timeouts
        except Exception:
            return 0
    if flist[0][:len(video["id"])] == video["id"]:
        # Files are named `<id>.*` (no title prefix): rename them to
        # match our `<sanitized title>-<id>.*` convention.
        for fname in flist:
            if os.path.exists("%s/%s" % (destdir, fname)):
                os.replace("%s/%s" % (destdir, fname),
                           "%s-%s" % (basename.rsplit("-", 1)[0],
                                      fname))
    return 1
179 | |
def main():
    """Walk the video database and download every video whose uploader
    matches one of the channel URLs given on the command line.

    Download strategy per video, in order: yt-dlp, then the Internet
    Archive item, then the Wayback Machine (retried until it gives up).
    """
    args = docopt.docopt(__doc__)

    if not os.path.exists(args["--output"]):
        os.mkdir(args["--output"])

    for i in load_split_files(args["--database"])["videos"]:
        uploader = i["uploader_id"] if "uploader_id" in i else None
        for url in args["<url>"]:
            # channel name is the last path component of the URL
            channel = url.split("/")[-1]

            output = "%s/%s" % (args["--output"], channel)
            if not os.path.exists(output):
                os.mkdir(output)
            # mutates the module-level ytdl_opts for this channel's dir
            ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s"

            if uploader == channel:
                print("%s:" % i["id"])
                basename = "%s/%s-%s" % (output, sanitize_filename(i["title"],
                                         restricted=True), i["id"])
                # Skip the download when any container variant exists.
                path = Path(output)
                files = list(path.glob("*-%s.mkv" % i["id"]))
                files.extend(list(path.glob("*-%s.mp4" % i["id"])))
                files.extend(list(path.glob("*-%s.webm" % i["id"])))
                if files:
                    print(" video already downloaded!")
                    write_metadata(i, basename)
                    continue
                # this code is *really* ugly... todo a rewrite?
                with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
                    try:
                        ytdl.extract_info("https://youtube.com/watch?v=%s"
                                          % i["id"])
                        continue
                    except DownloadError:
                        print(" video is not available! attempting to find In"
                              "ternet Archive pages of it...")
                    except Exception as e:
                        print(" unknown error downloading video!\n")
                        print(e)
                if internet_archive_dl(i, basename) == 0:  # if we can't download from IA
                    print(" video does not have a Internet Archive page! attem"
                          "pting to download from the Wayback Machine...")
                    while True:
                        if wayback_machine_dl(i, basename) == 0:  # success
                            break
                        # retry timeouts after a short pause
                        time.sleep(5)
                    continue
                # NOTE(review): metadata is only written when the Internet
                # Archive path succeeded; the Wayback path `continue`s past
                # this call — confirm that is intended.
                write_metadata(i, basename)


if __name__ == "__main__":
    main()