comparison channeldownloader.py @ 118:eac6dae753ca
*: major cleanup
| author | Paper <37962225+mrpapersonic@users.noreply.github.com> |
|---|---|
| committer | GitHub <noreply@github.com> |
| date | Fri, 03 Mar 2023 22:51:28 +0000 |
| parents | 80bd4a99ea00 |
| children | 196cf2e3d96e |
| 117:40a7b6d9bd3b | 118:eac6dae753ca |
|---|---|
| 23 except ImportError: | 23 except ImportError: |
| 24 import json | 24 import json |
| 25 import os | 25 import os |
| 26 import re | 26 import re |
| 27 import time | 27 import time |
| 28 try: | 28 import urllib.request |
| 29 import urllib.request as compat_urllib | 29 import requests # need this for ONE (1) exception |
| 30 from urllib.error import HTTPError | 30 import yt_dlp as youtube_dl |
| 31 except ImportError: # Python 2 | 31 from urllib.error import HTTPError |
| 32 import urllib as compat_urllib | 32 from yt_dlp.utils import sanitize_filename, DownloadError |
| 33 from urllib2 import HTTPError | 33 from pathlib import Path |
| 34 try: | 34 from requests.exceptions import ConnectTimeout |
| 35 import yt_dlp as youtube_dl | 35 |
| 36 from yt_dlp.utils import sanitize_filename, DownloadError | |
| 37 except ImportError: | |
| 38 try: | |
| 39 import youtube_dl | |
| 40 from youtube_dl.utils import sanitize_filename, DownloadError | |
| 41 except ImportError: | |
| 42 print("ERROR: youtube-dl/yt-dlp not installed!") | |
| 43 exit(1) | |
| 44 from io import open # for Python 2 compatibility, in Python 3 this | |
| 45 # just maps to the built-in function | |
| 46 | 36 |
| 47 class MyLogger(object): | 37 class MyLogger(object): |
| 48 def debug(self, msg): | 38 def debug(self, msg): |
| 49 pass | 39 pass |
| 50 | 40 |
| 53 | 43 |
| 54 def error(self, msg): | 44 def error(self, msg): |
| 55 print(" " + msg) | 45 print(" " + msg) |
| 56 pass | 46 pass |
| 57 | 47 |
| 58 def ytdl_hook(d): | 48 |
| | 49 def ytdl_hook(d) -> None: |
| 59 if d["status"] == "finished": | 50 if d["status"] == "finished": |
| 60 print(" downloaded %s: 100%% " % (os.path.basename(d["filename"]))) | 51 print(" downloaded %s: 100%% " % (os.path.basename(d["filename"]))) |
| 61 if d["status"] == "downloading": | 52 if d["status"] == "downloading": |
| 62 print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="") | 53 print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), |
| | 54 d["_percent_str"]), end="") |
| 63 if d["status"] == "error": | 55 if d["status"] == "error": |
| 64 print("\n an error occurred downloading %s!" % (os.path.basename(d["filename"]))) | 56 print("\n an error occurred downloading %s!" |
| 65 | 57 % (os.path.basename(d["filename"]))) |
| 66 def load_split_files(path): | 58 |
| | 59 |
| | 60 def load_split_files(path: str) -> dict: |
| 67 if os.path.isdir(path): | 61 if os.path.isdir(path): |
| 68 result = {"videos": []} | 62 result = {"videos": []} |
| 69 for fi in os.listdir(path): | 63 for fi in os.listdir(path): |
| 70 for f in re.findall(r"vids[0-9\-]+?\.json", fi): | 64 for f in re.findall(r"vids[0-9\-]+?\.json", fi): |
| 71 with open(path + "/" + f, "r", encoding="utf-8") as infile: | 65 with open(path + "/" + f, "r", encoding="utf-8") as infile: |
| 73 result["videos"].extend(jsonnn) | 67 result["videos"].extend(jsonnn) |
| 74 return result | 68 return result |
| 75 else: | 69 else: |
| 76 return json.loads(open(path, "r", encoding="utf-8").read()) | 70 return json.loads(open(path, "r", encoding="utf-8").read()) |
| 77 | 71 |
| 78 def reporthook(count, block_size, total_size): | 72 |
| | 73 def reporthook(count: int, block_size: int, total_size: int) -> None: |
| 79 global start_time | 74 global start_time |
| 80 if count == 0: | 75 if count == 0: |
| 81 start_time = time.time() | 76 start_time = time.time() |
| 82 return | 77 return |
| 83 duration = time.time() - start_time | |
| 84 percent = int(count * block_size * 100 / total_size) | 78 percent = int(count * block_size * 100 / total_size) |
| 85 print(" downloading %d%% \r" % (percent), end="") | 79 print(" downloading %d%% \r" % (percent), end="") |
| 86 | 80 |
| 87 args = docopt.docopt(__doc__) | 81 |
| | 82 def write_metadata(i: dict, basename: str) -> None: |
| | 83 if not os.path.exists(basename + ".info.json"): |
| | 84 with open(basename + ".info.json", "w", encoding="utf-8") as jsonfile: |
| | 85 try: |
| | 86 jsonfile.write(json.dumps(i).decode("utf-8")) |
| | 87 except AttributeError: |
| | 88 jsonfile.write(json.dumps(i)) |
| | 89 print(" saved %s" % os.path.basename(jsonfile.name)) |
| | 90 if not os.path.exists(basename + ".description"): |
| | 91 with open(basename + ".description", "w", |
| | 92 encoding="utf-8") as descfile: |
| | 93 descfile.write(i["description"]) |
| | 94 print(" saved %s" % os.path.basename(descfile.name)) |
| | 95 |
| 88 | 96 |
| 89 ytdl_opts = { | 97 ytdl_opts = { |
| 90 "retries": 100, | 98 "retries": 100, |
| 91 "nooverwrites": True, | 99 "nooverwrites": True, |
| 92 "call_home": False, | 100 "call_home": False, |
| 95 "writedescription": True, | 103 "writedescription": True, |
| 96 "writethumbnail": True, | 104 "writethumbnail": True, |
| 97 "writeannotations": True, | 105 "writeannotations": True, |
| 98 "writesubtitles": True, | 106 "writesubtitles": True, |
| 99 "allsubtitles": True, | 107 "allsubtitles": True, |
| 100 "ignoreerrors": True, | |
| 101 "addmetadata": True, | 108 "addmetadata": True, |
| 102 "continuedl": True, | 109 "continuedl": True, |
| 103 "embedthumbnail": True, | 110 "embedthumbnail": True, |
| 104 "format": "bestvideo+bestaudio/best", | 111 "format": "bestvideo+bestaudio/best", |
| 105 "restrictfilenames": True, | 112 "restrictfilenames": True, |
| 107 "progress_hooks": [ytdl_hook], | 114 "progress_hooks": [ytdl_hook], |
| 108 "logger": MyLogger(), | 115 "logger": MyLogger(), |
| 109 "ignoreerrors": False, | 116 "ignoreerrors": False, |
| 110 } | 117 } |
| 111 | 118 |
| 112 if not os.path.exists(args["--output"]): | 119 |
| 113 os.mkdir(args["--output"]) | 120 def wayback_machine_dl(video: dict, basename: str) -> int: |
| 114 | 121 try: |
| 115 for i in load_split_files(args["--database"])["videos"]: | 122 url = ''.join(["https://web.archive.org/web/2oe_/http://wayback-fakeu", |
| 116 uploader = i["uploader_id"] if "uploader_id" in i else None | 123 "rl.archive.org/yt/%s"]) |
| 117 for url in args["<url>"]: | 124 headers = urllib.request.urlopen(url % video["id"]) |
| 118 channel = url.split("/")[-1] | 125 contenttype = headers.getheader("Content-Type") |
| 119 | 126 if contenttype == "video/webm": |
| 120 output = "%s/%s" % (args["--output"], channel) | 127 ext = "webm" |
| 121 if not os.path.exists(output): | 128 elif contenttype == "video/mp4": |
| 122 os.mkdir(output) | 129 ext = "mp4" |
| 123 ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s" | 130 else: |
| 124 | 131 raise HTTPError(url=None, code=None, msg=None, |
| 125 | 132 hdrs=None, fp=None) |
| 126 if uploader == channel: | 133 urllib.request.urlretrieve(url % video["id"], "%s.%s" % (basename, ext), |
| 127 print("%s:" % i["id"]) | 134 reporthook) |
| 128 # :skull: | 135 print(" downloaded %s.%s" % (basename, ext)) |
| 129 # todo: put this in a function? | 136 return 0 |
| 130 if any(x in os.listdir(output) for x in [sanitize_filename("%s-%s.mp4" % (i["title"], i["id"]), restricted=True), | 137 except TimeoutError: |
| 131 sanitize_filename("%s-%s.mkv" % (i["title"], i["id"]), restricted=True), | 138 return 1 |
| 132 sanitize_filename("%s-%s.webm" % (i["title"], i["id"]), restricted=True)]): | 139 except HTTPError: |
| 133 print(" video already downloaded!") | 140 print(" video not available on the Wayback Machine!") |
| | 141 return 0 |
| | 142 except Exception as e: |
| | 143 print(" unknown error downloading video!\n") |
| | 144 print(e) |
| | 145 return 0 |
| | 146 |
| | 147 def internet_archive_dl(video: dict, basename: str) -> int: |
| | 148 if internetarchive.get_item("youtube-%s" % video["id"]).exists: |
| | 149 fnames = [f.name for f in internetarchive.get_files( |
| | 150 "youtube-%s" % video["id"])] |
| | 151 flist = [] |
| | 152 for fname in range(len(fnames)): |
| | 153 if re.search(''.join([r"((?:.+?-)?", video["id"], |
| | 154 r"\.(?:mp4|jpg|webp|mkv|webm|info\\.json|des" |
| | 155 r"cription|annotations.xml))"]), |
| | 156 fnames[fname]): |
| | 157 flist.append(fnames[fname]) |
| | 158 while True: |
| | 159 try: |
| | 160 internetarchive.download("youtube-%s" % video["id"], |
| | 161 files=flist, verbose=True, |
| | 162 destdir=output, |
| | 163 no_directory=True, |
| | 164 ignore_existing=True, |
| | 165 retries=9999) |
| | 166 break |
| | 167 except ConnectTimeout: |
| 134 continue | 168 continue |
| 135 # this code is *really* ugly... todo a rewrite? | 169 except Exception: |
| 136 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: | 170 return 0 |
| 137 try: | 171 if flist[0][:len(video["id"])] == video["id"]: |
| 138 result = ytdl.extract_info("https://youtube.com/watch?v=%s" % i["id"]) | 172 for fname in flist: |
| | 173 if os.path.exists("%s/%s" % (output, fname)): |
| | 174 os.replace("%s/%s" % (output, fname), |
| | 175 "%s-%s" % (basename.rsplit("-", 1)[0], |
| | 176 fname)) |
| | 177 return 1 |
| | 178 return 0 |
| | 179 |
| | 180 def main(): |
| | 181 args = docopt.docopt(__doc__) |
| | 182 |
| | 183 if not os.path.exists(args["--output"]): |
| | 184 os.mkdir(args["--output"]) |
| | 185 |
| | 186 for i in load_split_files(args["--database"])["videos"]: |
| | 187 uploader = i["uploader_id"] if "uploader_id" in i else None |
| | 188 for url in args["<url>"]: |
| | 189 channel = url.split("/")[-1] |
| | 190 |
| | 191 output = "%s/%s" % (args["--output"], channel) |
| | 192 if not os.path.exists(output): |
| | 193 os.mkdir(output) |
| | 194 ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s" |
| | 195 |
| | 196 if uploader == channel: |
| | 197 print("%s:" % i["id"]) |
| | 198 basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], |
| | 199 restricted=True), i["id"]) |
| | 200 path = Path(output) |
| | 201 files = list(path.glob("*-%s.mkv" % i["id"])) |
| | 202 files.extend(list(path.glob("*-%s.mp4" % i["id"]))) |
| | 203 files.extend(list(path.glob("*-%s.webm" % i["id"]))) |
| | 204 if files: |
| | 205 print(" video already downloaded!") |
| | 206 write_metadata(i, basename) |
| 139 continue | 207 continue |
| 140 except DownloadError: | 208 # this code is *really* ugly... todo a rewrite? |
| 141 print(" video is not available! attempting to find Internet Archive pages of it...") | 209 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: |
| 142 except Exception as e: | |
| 143 print(" unknown error downloading video!\n") | |
| 144 print(e) | |
| 145 if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available | |
| 146 fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])] | |
| 147 flist = [] | |
| 148 for fname in range(len(fnames)): | |
| 149 if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]): | |
| 150 flist.append(fnames[fname]) | |
| 151 if len(flist) >= 1: | |
| 152 internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True, retries=9999) | |
| 153 else: | |
| 154 print(" video already downloaded!") | |
| 155 continue | |
| 156 if os.path.exists("%s/%s.info.json" % (output, i["id"])): # will always exist no matter which setting was used to download | |
| 157 for fname in flist: | |
| 158 if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname): | |
| 159 os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname) | |
| 160 else: | |
| 161 print("ID file not found!") | |
| 162 else: | |
| 163 print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...") | |
| 164 try: # we could use yt-dlp's extractor, but then we would need to craft a fake wayback machine url, | |
| 165 # and we wouldn't even know if it worked. so let's continue using our little "hack" | |
| 166 headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"]) | |
| 167 if hasattr(headers.info(), "getheader"): | |
| 168 contenttype = headers.info().getheader("Content-Type") | |
| 169 else: | |
| 170 contenttype = headers.getheader("Content-Type") | |
| 171 if contenttype == "video/webm": | |
| 172 ext = "webm" | |
| 173 elif contenttype == "video/mp4": | |
| 174 ext = "mp4" | |
| 175 else: | |
| 176 raise HTTPError(url=None, code=None, msg=None, hdrs=None, fp=None) | |
| 177 compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext), reporthook) | |
| 178 print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext)) | |
| 179 except HTTPError: | |
| 180 print(" video not available on the Wayback Machine!") | |
| 181 except Exception as e: | |
| 182 print(" unknown error downloading video!\n") | |
| 183 print(e) | |
| 184 # metadata | |
| 185 basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"]) | |
| 186 if not os.path.exists(basename + ".info.json"): | |
| 187 with open(basename + ".info.json", "w", encoding="utf-8") as jsonfile: | |
| 188 try: | 210 try: |
| 189 jsonfile.write(json.dumps(i).decode("utf-8")) | 211 ytdl.extract_info("https://youtube.com/watch?v=%s" |
| 190 except AttributeError: | 212 % i["id"]) |
| 191 jsonfile.write(json.dumps(i)) | 213 continue |
| 192 print(" saved %s" % os.path.basename(jsonfile.name)) | 214 except DownloadError: |
| 193 if not os.path.exists(basename + ".description"): | 215 print(" video is not available! attempting to find In" |
| 194 with open(basename + ".description", "w", encoding="utf-8") as descfile: | 216 "ternet Archive pages of it...") |
| 195 descfile.write(i["description"]) | 217 except Exception as e: |
| 196 print(" saved %s" % os.path.basename(descfile.name)) | 218 print(" unknown error downloading video!\n") |
| 197 | 219 print(e) |
| | 220 if internet_archive_dl(i, basename) == 0: # if we can't download from IA |
| | 221 print(" video does not have a Internet Archive page! attem" |
| | 222 "pting to download from the Wayback Machine...") |
| | 223 while True: |
| | 224 if wayback_machine_dl(i, basename) == 0: # success |
| | 225 break |
| | 226 time.sleep(5) |
| | 227 continue |
| | 228 write_metadata(i, basename) |
| | 229 |
| | 230 |
| | 231 if __name__ == "__main__": |
| | 232 main() |
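
For quick reference, here is a minimal sketch (Python 3, standard library only) of the Wayback Machine fallback that this changeset factors into `wayback_machine_dl()`: it probes the `wayback-fakeurl.archive.org/yt/<id>` redirect, picks the extension from the `Content-Type` header, and fetches the file through `urllib.request.urlretrieve` with a progress hook. The `probe_and_fetch` and `hook` names are illustrative, not identifiers from the script.

```python
# A sketch, not the script itself: probe the Wayback Machine "fake URL"
# redirect for a YouTube video id, then download it if a copy exists.
import urllib.request

# URL scheme used by channeldownloader.py's wayback_machine_dl()
WAYBACK_URL = ("https://web.archive.org/web/2oe_/"
               "http://wayback-fakeurl.archive.org/yt/%s")


def hook(count: int, block_size: int, total_size: int) -> None:
    # urlretrieve calls this after each block; mirrors the script's reporthook
    if total_size > 0:
        percent = count * block_size * 100 // total_size
        print(" downloading %d%% \r" % percent, end="")


def probe_and_fetch(video_id: str, basename: str) -> str:
    # The archived copy is served with its original MIME type, so the
    # Content-Type header tells us which extension to save it under.
    response = urllib.request.urlopen(WAYBACK_URL % video_id)
    ext = {"video/webm": "webm",
           "video/mp4": "mp4"}.get(response.getheader("Content-Type"))
    if ext is None:
        # the script raises HTTPError here; a plain exception keeps the sketch short
        raise ValueError("no archived copy with a recognised video type")
    urllib.request.urlretrieve(WAYBACK_URL % video_id,
                               "%s.%s" % (basename, ext), hook)
    return ext
```

The script itself wraps this probe in retry logic (returning 1 on `TimeoutError` so `main()` can sleep five seconds and try again), which is omitted here.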
