Mercurial > codedump
changeset 67:9636d5dee08c
[channeldownloader.py] Python 2.7 compatibility
Also make the code a *lot* more optimized
(e.g. removing the unnecessary double for-loop)
committer: GitHub <noreply@github.com>
author:   Paper <37962225+mrpapersonic@users.noreply.github.com>
date:     Wed, 18 May 2022 18:57:58 -0400
parents:  ff473892908c
children: a43ed076b28f
files:    channeldownloader.py
diffstat: 1 files changed, 61 insertions(+), 51 deletions(-)
line wrap: on
line diff
--- a/channeldownloader.py Mon Apr 25 02:11:46 2022 -0400 +++ b/channeldownloader.py Wed May 18 18:57:58 2022 -0400 @@ -1,14 +1,36 @@ +#!/usr/bin/env python3 +# +# download deleted vids from old yt channels +# script by paper + +from __future__ import print_function import argparse -import internetarchive # pip install internetarchive -import json +import internetarchive +try: + import orjson as json +except ImportError: + import json import glob import os import re -import urllib.request -import yt_dlp # pip install yt-dlp -import itertools -from urllib.error import HTTPError -from yt_dlp.utils import sanitize_filename +try: + import urllib.request as compat_urllib + from urllib.error import HTTPError +except ImportError: # Python 2 + import urllib as compat_urllib + from urllib2 import HTTPError +try: + import yt_dlp as youtube_dl + from yt_dlp.utils import sanitize_filename +except ImportError: + try: + import youtube_dl + from youtube_dl.utils import sanitize_filename + except ImportError: + print("ERROR: youtube-dl/yt-dlp not installed!") + exit(1) +from io import open # for Python 2 compatibility, in Python 3 this + # just maps to the built-in function class MyLogger(object): def debug(self, msg): @@ -20,17 +42,11 @@ def error(self, msg): pass -def matroska_find(filelist): - for myfile in filelist: - if os.path.splitext(myfile)[1] == ".mkv" or os.path.splitext(myfile)[1] == ".webm": - return True - return False - def ytdl_hook(d): if d["status"] == "finished": - print(" downloaded {0}: 100% ".format(os.path.basename(d["filename"]))) + print(" downloaded %s: 100% " % (os.path.basename(d["filename"]))) if d["status"] == "downloading": - print(" downloading {0}: {1}\r".format(os.path.basename(d["filename"]), d["_percent_str"]), end="") + print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="") if d["status"] == "error": print(" an error occurred downloading {0}!") @@ -43,7 +59,7 @@ result["videos"].append(i) return result 
else: - return json.loads(open(path, "r", encoding="utf-8")) + return json.loads(open(path, "r", encoding="utf-8").read()) parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") @@ -66,7 +82,7 @@ os.mkdir(output) ytdl_opts = { - "outtmpl": "{0}/%(title)s-%(id)s.%(ext)s".format(output), + "outtmpl": "%s/%(title)s-%(id)s.%(ext)s" % (output), "retries": 100, "nooverwrites": True, "call_home": False, @@ -89,68 +105,62 @@ "ignoreerrors": False, } -for i in load_split_files(args.database): - try: - uploader = i["uploader_id"] - except Exception: - uploader = "unknown" +for i in load_split_files(args.database)["videos"]: + uploader = i["uploader_id"] if "uploader_id" in i else None if uploader == channel: - print("{0}:".format(i["id"])) - isalreadydownloaded = 0 - for file in os.listdir(output): - if os.path.splitext(file)[1] == ".json": - if file.find("-" + i["id"] + ".info.json") != -1: - isalreadydownloaded = 1 - if isalreadydownloaded == 1: # not sure how to bypass this without having to go out of the for loop, if anyone could tell me how that would be great! + print("%s:" % i["id"]) + if os.path.exists(uploader + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"): print(" video already downloaded!") continue - with yt_dlp.YoutubeDL(ytdl_opts) as ytdl: + # this code is *really* ugly... todo a rewrite? + with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: try: - result = ytdl.download(["https://youtube.com/watch?v={0}".format(i["id"])]) # TODO: add check for existing downloaded items and don't download them + result = ytdl.download(["https://youtube.com/watch?v=%s" % i["id"]]) # TODO: add check for existing downloaded items and don't download them continue except Exception: print(" video is not available! 
attempting to find Internet Archive pages of it...") - if internetarchive.get_item("youtube-{0}".format(i["id"])).exists: # download from internetarchive if available - fnames = [f.name for f in internetarchive.get_files("youtube-{0}".format(i["id"]))] - disallowednames = ["__ia_thumb.jpg", "youtube-{0}_archive.torrent".format(i["id"]), "youtube-{0}_files.xml".format(i["id"]), "youtube-{0}_meta.sqlite".format(i["id"]), "youtube-{0}_meta.xml".format(i["id"])] # list of IA-created files we don't need + if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available + fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])] + disallowednames = ["__ia_thumb.jpg", "youtube-%s_archive.torrent" % i["id"], "youtube-%s_files.xml" % i["id"], "youtube-%s_meta.sqlite" % i["id"], "youtube-%s_meta.xml" % i["id"]] # list of IA-created files we don't need flist = [] for fname in fnames: - if matroska_find(fnames): + if os.path.splitext(fname)[1] in [".mkv", ".webm"]: if fname[-4:] == ".mp4": continue else: if fname[-7:] == ".ia.mp4": continue if fname.find("/") == -1: - if fname not in disallowednames and fname[-21:] != "{0}_thumb.jpg".format(i["id"]) and fname[-15:] != "{0}.ogv".format(i["id"]): + if fname not in disallowednames and fname[-21:] != "%s_thumb.jpg" % i["id"] and fname[-15:] != "%s.ogv" % i["id"]: flist.append(fname) if len(flist) >= 1: - internetarchive.download("youtube-{0}".format(i["id"]), files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True) + internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True) else: print(" video already downloaded!") continue - if os.path.exists(output + "\\" + i["id"] + ".info.json"): # will always exist no matter which setting was used to download + if os.path.exists(output + "/" + i["id"] + ".info.json"): # will always exist no matter which setting was used to download 
for fname in flist: - if os.path.exists(output + "\\" + fname) and not os.path.exists(output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname): - os.rename(output + "\\" + fname, output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname) + if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname): + os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname) else: print("ID file not found!") - else: # download the vid from waybackmachine (NOTE: only tested with youtube links after polymer, however SHOULD work with links created before then) + else: print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...") try: - contenttype = urllib.request.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"])).getheader("Content-Type") - if contenttype == "video/webm": - ext = "webm" + headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"]) + if hasattr(headers.info(), "getheader"): + contenttype = headers.info().getheader("Content-Type") else: - ext = "mp4" - urllib.request.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"]), "{3}\\{0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext, output)) - print(" downloaded {0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext)) + contenttype = headers.getheader("Content-Type") + ext = "webm" if contenttype == "video/webm" else "mp4" + compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext)) + print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], 
restricted=True), i["id"], ext)) except HTTPError: print(" video not available on the Wayback Machine!") except Exception as e: - print(" unknown error downloading video!") + print(" unknown error downloading video!\n") print(e) # metadata - with open("{2}\\{0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output), "w") as jsonfile: - print(json.dumps(i), end="", file=jsonfile) - print(" saved {0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output)) + with open("%s/%s-%s.info.json" % (output, sanitize_filename(i["title"], restricted=True), i["id"]), "w", encoding="utf-8") as jsonfile: + jsonfile.write(json.dumps(i, ensure_ascii=False).decode('utf-8')) + print(" saved %s" % os.path.basename(jsonfile.name))