Mercurial > codedump
comparison channeldownloader.py @ 67:9636d5dee08c
[channeldownloader.py] Python 2.7 compatibility
Also make the code a *lot* more optimized
(e.g. removing the unnecessary double for-loop)
committer: GitHub <noreply@github.com>
| author | Paper <37962225+mrpapersonic@users.noreply.github.com> | 
|---|---|
| date | Wed, 18 May 2022 18:57:58 -0400 | 
| parents | c615532e6572 | 
| children | a43ed076b28f | 
   comparison legend: equal · deleted · inserted · replaced
| 66:ff473892908c | 67:9636d5dee08c | 
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 # | |
| 3 # download deleted vids from old yt channels | |
| 4 # script by paper | |
| 5 | |
| 6 from __future__ import print_function | |
| 1 import argparse | 7 import argparse | 
| 2 import internetarchive # pip install internetarchive | 8 import internetarchive | 
| 3 import json | 9 try: | 
| 10 import orjson as json | |
| 11 except ImportError: | |
| 12 import json | |
| 4 import glob | 13 import glob | 
| 5 import os | 14 import os | 
| 6 import re | 15 import re | 
| 7 import urllib.request | 16 try: | 
| 8 import yt_dlp # pip install yt-dlp | 17 import urllib.request as compat_urllib | 
| 9 import itertools | 18 from urllib.error import HTTPError | 
| 10 from urllib.error import HTTPError | 19 except ImportError: # Python 2 | 
| 11 from yt_dlp.utils import sanitize_filename | 20 import urllib as compat_urllib | 
| 21 from urllib2 import HTTPError | |
| 22 try: | |
| 23 import yt_dlp as youtube_dl | |
| 24 from yt_dlp.utils import sanitize_filename | |
| 25 except ImportError: | |
| 26 try: | |
| 27 import youtube_dl | |
| 28 from youtube_dl.utils import sanitize_filename | |
| 29 except ImportError: | |
| 30 print("ERROR: youtube-dl/yt-dlp not installed!") | |
| 31 exit(1) | |
| 32 from io import open # for Python 2 compatibility, in Python 3 this | |
| 33 # just maps to the built-in function | |
| 12 | 34 | 
| 13 class MyLogger(object): | 35 class MyLogger(object): | 
| 14 def debug(self, msg): | 36 def debug(self, msg): | 
| 15 pass | 37 pass | 
| 16 | 38 | 
| 18 pass | 40 pass | 
| 19 | 41 | 
| 20 def error(self, msg): | 42 def error(self, msg): | 
| 21 pass | 43 pass | 
| 22 | 44 | 
| 23 def matroska_find(filelist): | |
| 24 for myfile in filelist: | |
| 25 if os.path.splitext(myfile)[1] == ".mkv" or os.path.splitext(myfile)[1] == ".webm": | |
| 26 return True | |
| 27 return False | |
| 28 | |
| 29 def ytdl_hook(d): | 45 def ytdl_hook(d): | 
| 30 if d["status"] == "finished": | 46 if d["status"] == "finished": | 
| 31 print(" downloaded {0}: 100% ".format(os.path.basename(d["filename"]))) | 47 print(" downloaded %s: 100% " % (os.path.basename(d["filename"]))) | 
| 32 if d["status"] == "downloading": | 48 if d["status"] == "downloading": | 
| 33 print(" downloading {0}: {1}\r".format(os.path.basename(d["filename"]), d["_percent_str"]), end="") | 49 print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="") | 
| 34 if d["status"] == "error": | 50 if d["status"] == "error": | 
| 35 print(" an error occurred downloading {0}!") | 51 print(" an error occurred downloading {0}!") | 
| 36 | 52 | 
| 37 def load_split_files(path): | 53 def load_split_files(path): | 
| 38 if os.path.isdir(path): | 54 if os.path.isdir(path): | 
| 41 with open(f, "r", encoding="utf-8") as infile: | 57 with open(f, "r", encoding="utf-8") as infile: | 
| 42 for i in json.loads(infile.read())["videos"]: | 58 for i in json.loads(infile.read())["videos"]: | 
| 43 result["videos"].append(i) | 59 result["videos"].append(i) | 
| 44 return result | 60 return result | 
| 45 else: | 61 else: | 
| 46 return json.loads(open(path, "r", encoding="utf-8")) | 62 return json.loads(open(path, "r", encoding="utf-8").read()) | 
| 47 | 63 | 
| 48 | 64 | 
| 49 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") | 65 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") | 
| 50 parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True) | 66 parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True) | 
| 51 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar='<path>', required=True) | 67 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar='<path>', required=True) | 
| 64 | 80 | 
| 65 if not os.path.exists(output): | 81 if not os.path.exists(output): | 
| 66 os.mkdir(output) | 82 os.mkdir(output) | 
| 67 | 83 | 
| 68 ytdl_opts = { | 84 ytdl_opts = { | 
| 69 "outtmpl": "{0}/%(title)s-%(id)s.%(ext)s".format(output), | 85 "outtmpl": "%s/%(title)s-%(id)s.%(ext)s" % (output), | 
| 70 "retries": 100, | 86 "retries": 100, | 
| 71 "nooverwrites": True, | 87 "nooverwrites": True, | 
| 72 "call_home": False, | 88 "call_home": False, | 
| 73 "quiet": True, | 89 "quiet": True, | 
| 74 "writeinfojson": True, | 90 "writeinfojson": True, | 
| 87 "progress_hooks": [ytdl_hook], | 103 "progress_hooks": [ytdl_hook], | 
| 88 "logger": MyLogger(), | 104 "logger": MyLogger(), | 
| 89 "ignoreerrors": False, | 105 "ignoreerrors": False, | 
| 90 } | 106 } | 
| 91 | 107 | 
| 92 for i in load_split_files(args.database): | 108 for i in load_split_files(args.database)["videos"]: | 
| 93 try: | 109 uploader = i["uploader_id"] if "uploader_id" in i else None | 
| 94 uploader = i["uploader_id"] | |
| 95 except Exception: | |
| 96 uploader = "unknown" | |
| 97 if uploader == channel: | 110 if uploader == channel: | 
| 98 print("{0}:".format(i["id"])) | 111 print("%s:" % i["id"]) | 
| 99 isalreadydownloaded = 0 | 112 if os.path.exists(uploader + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"): | 
| 100 for file in os.listdir(output): | |
| 101 if os.path.splitext(file)[1] == ".json": | |
| 102 if file.find("-" + i["id"] + ".info.json") != -1: | |
| 103 isalreadydownloaded = 1 | |
| 104 if isalreadydownloaded == 1: # not sure how to bypass this without having to go out of the for loop, if anyone could tell me how that would be great! | |
| 105 print(" video already downloaded!") | 113 print(" video already downloaded!") | 
| 106 continue | 114 continue | 
| 107 with yt_dlp.YoutubeDL(ytdl_opts) as ytdl: | 115 # this code is *really* ugly... todo a rewrite? | 
| 116 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: | |
| 108 try: | 117 try: | 
| 109 result = ytdl.download(["https://youtube.com/watch?v={0}".format(i["id"])]) # TODO: add check for existing downloaded items and don't download them | 118 result = ytdl.download(["https://youtube.com/watch?v=%s" % i["id"]]) # TODO: add check for existing downloaded items and don't download them | 
| 110 continue | 119 continue | 
| 111 except Exception: | 120 except Exception: | 
| 112 print(" video is not available! attempting to find Internet Archive pages of it...") | 121 print(" video is not available! attempting to find Internet Archive pages of it...") | 
| 113 if internetarchive.get_item("youtube-{0}".format(i["id"])).exists: # download from internetarchive if available | 122 if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available | 
| 114 fnames = [f.name for f in internetarchive.get_files("youtube-{0}".format(i["id"]))] | 123 fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])] | 
| 115 disallowednames = ["__ia_thumb.jpg", "youtube-{0}_archive.torrent".format(i["id"]), "youtube-{0}_files.xml".format(i["id"]), "youtube-{0}_meta.sqlite".format(i["id"]), "youtube-{0}_meta.xml".format(i["id"])] # list of IA-created files we don't need | 124 disallowednames = ["__ia_thumb.jpg", "youtube-%s_archive.torrent" % i["id"], "youtube-%s_files.xml" % i["id"], "youtube-%s_meta.sqlite" % i["id"], "youtube-%s_meta.xml" % i["id"]] # list of IA-created files we don't need | 
| 116 flist = [] | 125 flist = [] | 
| 117 for fname in fnames: | 126 for fname in fnames: | 
| 118 if matroska_find(fnames): | 127 if os.path.splitext(fname)[1] in [".mkv", ".webm"]: | 
| 119 if fname[-4:] == ".mp4": | 128 if fname[-4:] == ".mp4": | 
| 120 continue | 129 continue | 
| 121 else: | 130 else: | 
| 122 if fname[-7:] == ".ia.mp4": | 131 if fname[-7:] == ".ia.mp4": | 
| 123 continue | 132 continue | 
| 124 if fname.find("/") == -1: | 133 if fname.find("/") == -1: | 
| 125 if fname not in disallowednames and fname[-21:] != "{0}_thumb.jpg".format(i["id"]) and fname[-15:] != "{0}.ogv".format(i["id"]): | 134 if fname not in disallowednames and fname[-21:] != "%s_thumb.jpg" % i["id"] and fname[-15:] != "%s.ogv" % i["id"]: | 
| 126 flist.append(fname) | 135 flist.append(fname) | 
| 127 if len(flist) >= 1: | 136 if len(flist) >= 1: | 
| 128 internetarchive.download("youtube-{0}".format(i["id"]), files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True) | 137 internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True) | 
| 129 else: | 138 else: | 
| 130 print(" video already downloaded!") | 139 print(" video already downloaded!") | 
| 131 continue | 140 continue | 
| 132 if os.path.exists(output + "\\" + i["id"] + ".info.json"): # will always exist no matter which setting was used to download | 141 if os.path.exists(output + "/" + i["id"] + ".info.json"): # will always exist no matter which setting was used to download | 
| 133 for fname in flist: | 142 for fname in flist: | 
| 134 if os.path.exists(output + "\\" + fname) and not os.path.exists(output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname): | 143 if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname): | 
| 135 os.rename(output + "\\" + fname, output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname) | 144 os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname) | 
| 136 else: | 145 else: | 
| 137 print("ID file not found!") | 146 print("ID file not found!") | 
| 138 else: # download the vid from waybackmachine (NOTE: only tested with youtube links after polymer, however SHOULD work with links created before then) | 147 else: | 
| 139 print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...") | 148 print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...") | 
| 140 try: | 149 try: | 
| 141 contenttype = urllib.request.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"])).getheader("Content-Type") | 150 headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"]) | 
| 142 if contenttype == "video/webm": | 151 if hasattr(headers.info(), "getheader"): | 
| 143 ext = "webm" | 152 contenttype = headers.info().getheader("Content-Type") | 
| 144 else: | 153 else: | 
| 145 ext = "mp4" | 154 contenttype = headers.getheader("Content-Type") | 
| 146 urllib.request.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"]), "{3}\\{0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext, output)) | 155 ext = "webm" if contenttype == "video/webm" else "mp4" | 
| 147 print(" downloaded {0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext)) | 156 compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext)) | 
| 157 print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext)) | |
| 148 except HTTPError: | 158 except HTTPError: | 
| 149 print(" video not available on the Wayback Machine!") | 159 print(" video not available on the Wayback Machine!") | 
| 150 except Exception as e: | 160 except Exception as e: | 
| 151 print(" unknown error downloading video!") | 161 print(" unknown error downloading video!\n") | 
| 152 print(e) | 162 print(e) | 
| 153 # metadata | 163 # metadata | 
| 154 with open("{2}\\{0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output), "w") as jsonfile: | 164 with open("%s/%s-%s.info.json" % (output, sanitize_filename(i["title"], restricted=True), i["id"]), "w", encoding="utf-8") as jsonfile: | 
| 155 print(json.dumps(i), end="", file=jsonfile) | 165 jsonfile.write(json.dumps(i, ensure_ascii=False).decode('utf-8')) | 
| 156 print(" saved {0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output)) | 166 print(" saved %s" % os.path.basename(jsonfile.name)) | 
