comparison channeldownloader.py @ 118:eac6dae753ca

*: major cleanup

committer: GitHub <noreply@github.com>
author Paper <37962225+mrpapersonic@users.noreply.github.com>
date Fri, 03 Mar 2023 22:51:28 +0000
parents 80bd4a99ea00
children 196cf2e3d96e
comparison
--- channeldownloader.py  (117:40a7b6d9bd3b)
+++ channeldownloader.py  (118:eac6dae753ca)
@@ -23,28 +23,18 @@
 except ImportError:
     import json
 import os
 import re
 import time
-try:
-    import urllib.request as compat_urllib
-    from urllib.error import HTTPError
-except ImportError:  # Python 2
-    import urllib as compat_urllib
-    from urllib2 import HTTPError
-try:
-    import yt_dlp as youtube_dl
-    from yt_dlp.utils import sanitize_filename, DownloadError
-except ImportError:
-    try:
-        import youtube_dl
-        from youtube_dl.utils import sanitize_filename, DownloadError
-    except ImportError:
-        print("ERROR: youtube-dl/yt-dlp not installed!")
-        exit(1)
-from io import open  # for Python 2 compatibility, in Python 3 this
-                     # just maps to the built-in function
+import urllib.request
+import requests  # need this for ONE (1) exception
+import yt_dlp as youtube_dl
+from urllib.error import HTTPError
+from yt_dlp.utils import sanitize_filename, DownloadError
+from pathlib import Path
+from requests.exceptions import ConnectTimeout
+
 
 class MyLogger(object):
     def debug(self, msg):
         pass
 
@@ -53,19 +43,23 @@
 
     def error(self, msg):
         print(" " + msg)
         pass
 
-def ytdl_hook(d):
+
+def ytdl_hook(d) -> None:
     if d["status"] == "finished":
         print(" downloaded %s: 100%% " % (os.path.basename(d["filename"])))
     if d["status"] == "downloading":
-        print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="")
+        print(" downloading %s: %s\r" % (os.path.basename(d["filename"]),
+                                         d["_percent_str"]), end="")
     if d["status"] == "error":
-        print("\n an error occurred downloading %s!" % (os.path.basename(d["filename"])))
-
-def load_split_files(path):
+        print("\n an error occurred downloading %s!"
+              % (os.path.basename(d["filename"])))
+
+
+def load_split_files(path: str) -> dict:
     if os.path.isdir(path):
         result = {"videos": []}
         for fi in os.listdir(path):
             for f in re.findall(r"vids[0-9\-]+?\.json", fi):
                 with open(path + "/" + f, "r", encoding="utf-8") as infile:
@@ -73,20 +67,34 @@
                     result["videos"].extend(jsonnn)
         return result
     else:
         return json.loads(open(path, "r", encoding="utf-8").read())
 
-def reporthook(count, block_size, total_size):
+
+def reporthook(count: int, block_size: int, total_size: int) -> None:
     global start_time
     if count == 0:
         start_time = time.time()
         return
-    duration = time.time() - start_time
     percent = int(count * block_size * 100 / total_size)
     print(" downloading %d%% \r" % (percent), end="")
 
-args = docopt.docopt(__doc__)
+
+def write_metadata(i: dict, basename: str) -> None:
+    if not os.path.exists(basename + ".info.json"):
+        with open(basename + ".info.json", "w", encoding="utf-8") as jsonfile:
+            try:
+                jsonfile.write(json.dumps(i).decode("utf-8"))
+            except AttributeError:
+                jsonfile.write(json.dumps(i))
+            print(" saved %s" % os.path.basename(jsonfile.name))
+    if not os.path.exists(basename + ".description"):
+        with open(basename + ".description", "w",
+                  encoding="utf-8") as descfile:
+            descfile.write(i["description"])
+            print(" saved %s" % os.path.basename(descfile.name))
+
 
 ytdl_opts = {
     "retries": 100,
     "nooverwrites": True,
     "call_home": False,
95 "writedescription": True, 103 "writedescription": True,
96 "writethumbnail": True, 104 "writethumbnail": True,
97 "writeannotations": True, 105 "writeannotations": True,
98 "writesubtitles": True, 106 "writesubtitles": True,
99 "allsubtitles": True, 107 "allsubtitles": True,
100 "ignoreerrors": True,
101 "addmetadata": True, 108 "addmetadata": True,
102 "continuedl": True, 109 "continuedl": True,
103 "embedthumbnail": True, 110 "embedthumbnail": True,
104 "format": "bestvideo+bestaudio/best", 111 "format": "bestvideo+bestaudio/best",
105 "restrictfilenames": True, 112 "restrictfilenames": True,
107 "progress_hooks": [ytdl_hook], 114 "progress_hooks": [ytdl_hook],
108 "logger": MyLogger(), 115 "logger": MyLogger(),
109 "ignoreerrors": False, 116 "ignoreerrors": False,
110 } 117 }
111 118
112 if not os.path.exists(args["--output"]): 119
113 os.mkdir(args["--output"]) 120 def wayback_machine_dl(video: dict, basename: str) -> int:
114 121 try:
115 for i in load_split_files(args["--database"])["videos"]: 122 url = ''.join(["https://web.archive.org/web/2oe_/http://wayback-fakeu",
116 uploader = i["uploader_id"] if "uploader_id" in i else None 123 "rl.archive.org/yt/%s"])
117 for url in args["<url>"]: 124 headers = urllib.request.urlopen(url % video["id"])
118 channel = url.split("/")[-1] 125 contenttype = headers.getheader("Content-Type")
119 126 if contenttype == "video/webm":
120 output = "%s/%s" % (args["--output"], channel) 127 ext = "webm"
121 if not os.path.exists(output): 128 elif contenttype == "video/mp4":
122 os.mkdir(output) 129 ext = "mp4"
123 ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s" 130 else:
124 131 raise HTTPError(url=None, code=None, msg=None,
125 132 hdrs=None, fp=None)
126 if uploader == channel: 133 urllib.request.urlretrieve(url % video["id"], "%s.%s" % (basename, ext),
127 print("%s:" % i["id"]) 134 reporthook)
128 # :skull: 135 print(" downloaded %s.%s" % (basename, ext))
129 # todo: put this in a function? 136 return 0
130 if any(x in os.listdir(output) for x in [sanitize_filename("%s-%s.mp4" % (i["title"], i["id"]), restricted=True), 137 except TimeoutError:
131 sanitize_filename("%s-%s.mkv" % (i["title"], i["id"]), restricted=True), 138 return 1
132 sanitize_filename("%s-%s.webm" % (i["title"], i["id"]), restricted=True)]): 139 except HTTPError:
133 print(" video already downloaded!") 140 print(" video not available on the Wayback Machine!")
141 return 0
142 except Exception as e:
143 print(" unknown error downloading video!\n")
144 print(e)
145 return 0
146
147 def internet_archive_dl(video: dict, basename: str) -> int:
148 if internetarchive.get_item("youtube-%s" % video["id"]).exists:
149 fnames = [f.name for f in internetarchive.get_files(
150 "youtube-%s" % video["id"])]
151 flist = []
152 for fname in range(len(fnames)):
153 if re.search(''.join([r"((?:.+?-)?", video["id"],
154 r"\.(?:mp4|jpg|webp|mkv|webm|info\\.json|des"
155 r"cription|annotations.xml))"]),
156 fnames[fname]):
157 flist.append(fnames[fname])
158 while True:
159 try:
160 internetarchive.download("youtube-%s" % video["id"],
161 files=flist, verbose=True,
162 destdir=output,
163 no_directory=True,
164 ignore_existing=True,
165 retries=9999)
166 break
167 except ConnectTimeout:
134 continue 168 continue
135 # this code is *really* ugly... todo a rewrite? 169 except Exception:
136 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: 170 return 0
137 try: 171 if flist[0][:len(video["id"])] == video["id"]:
138 result = ytdl.extract_info("https://youtube.com/watch?v=%s" % i["id"]) 172 for fname in flist:
173 if os.path.exists("%s/%s" % (output, fname)):
174 os.replace("%s/%s" % (output, fname),
175 "%s-%s" % (basename.rsplit("-", 1)[0],
176 fname))
177 return 1
178 return 0
179
180 def main():
181 args = docopt.docopt(__doc__)
182
183 if not os.path.exists(args["--output"]):
184 os.mkdir(args["--output"])
185
186 for i in load_split_files(args["--database"])["videos"]:
187 uploader = i["uploader_id"] if "uploader_id" in i else None
188 for url in args["<url>"]:
189 channel = url.split("/")[-1]
190
191 output = "%s/%s" % (args["--output"], channel)
192 if not os.path.exists(output):
193 os.mkdir(output)
194 ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s"
195
196 if uploader == channel:
197 print("%s:" % i["id"])
198 basename = "%s/%s-%s" % (output, sanitize_filename(i["title"],
199 restricted=True), i["id"])
200 path = Path(output)
201 files = list(path.glob("*-%s.mkv" % i["id"]))
202 files.extend(list(path.glob("*-%s.mp4" % i["id"])))
203 files.extend(list(path.glob("*-%s.webm" % i["id"])))
204 if files:
205 print(" video already downloaded!")
206 write_metadata(i, basename)
139 continue 207 continue
140 except DownloadError: 208 # this code is *really* ugly... todo a rewrite?
141 print(" video is not available! attempting to find Internet Archive pages of it...") 209 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
142 except Exception as e:
143 print(" unknown error downloading video!\n")
144 print(e)
145 if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available
146 fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])]
147 flist = []
148 for fname in range(len(fnames)):
149 if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]):
150 flist.append(fnames[fname])
151 if len(flist) >= 1:
152 internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True, retries=9999)
153 else:
154 print(" video already downloaded!")
155 continue
156 if os.path.exists("%s/%s.info.json" % (output, i["id"])): # will always exist no matter which setting was used to download
157 for fname in flist:
158 if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname):
159 os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname)
160 else:
161 print("ID file not found!")
162 else:
163 print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...")
164 try: # we could use yt-dlp's extractor, but then we would need to craft a fake wayback machine url,
165 # and we wouldn't even know if it worked. so let's continue using our little "hack"
166 headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"])
167 if hasattr(headers.info(), "getheader"):
168 contenttype = headers.info().getheader("Content-Type")
169 else:
170 contenttype = headers.getheader("Content-Type")
171 if contenttype == "video/webm":
172 ext = "webm"
173 elif contenttype == "video/mp4":
174 ext = "mp4"
175 else:
176 raise HTTPError(url=None, code=None, msg=None, hdrs=None, fp=None)
177 compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext), reporthook)
178 print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext))
179 except HTTPError:
180 print(" video not available on the Wayback Machine!")
181 except Exception as e:
182 print(" unknown error downloading video!\n")
183 print(e)
184 # metadata
185 basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"])
186 if not os.path.exists(basename + ".info.json"):
187 with open(basename + ".info.json", "w", encoding="utf-8") as jsonfile:
188 try: 210 try:
189 jsonfile.write(json.dumps(i).decode("utf-8")) 211 ytdl.extract_info("https://youtube.com/watch?v=%s"
190 except AttributeError: 212 % i["id"])
191 jsonfile.write(json.dumps(i)) 213 continue
192 print(" saved %s" % os.path.basename(jsonfile.name)) 214 except DownloadError:
193 if not os.path.exists(basename + ".description"): 215 print(" video is not available! attempting to find In"
194 with open(basename + ".description", "w", encoding="utf-8") as descfile: 216 "ternet Archive pages of it...")
195 descfile.write(i["description"]) 217 except Exception as e:
196 print(" saved %s" % os.path.basename(descfile.name)) 218 print(" unknown error downloading video!\n")
197 219 print(e)
220 if internet_archive_dl(i, basename) == 0: # if we can't download from IA
221 print(" video does not have a Internet Archive page! attem"
222 "pting to download from the Wayback Machine...")
223 while True:
224 if wayback_machine_dl(i, basename) == 0: # success
225 break
226 time.sleep(5)
227 continue
228 write_metadata(i, basename)
229
230
231 if __name__ == "__main__":
232 main()