comparison channeldownloader.py @ 119:196cf2e3d96e

channeldownloader: insane memory optimizations — it should now use at most 300 MB if you're using the split JSON files. committer: GitHub <noreply@github.com>
author Paper <37962225+mrpapersonic@users.noreply.github.com>
date Sat, 25 Mar 2023 17:02:23 -0400
parents eac6dae753ca
children 3ecb2e815854
comparison
equal deleted inserted replaced
118:eac6dae753ca 119:196cf2e3d96e
55 if d["status"] == "error": 55 if d["status"] == "error":
56 print("\n an error occurred downloading %s!" 56 print("\n an error occurred downloading %s!"
57 % (os.path.basename(d["filename"]))) 57 % (os.path.basename(d["filename"])))
58 58
59 59
def load_split_files(path: str):
    """Lazily yield parsed JSON documents from *path*.

    If *path* is a directory, every contained ``vids...json`` split file is
    opened, parsed, and yielded one at a time, which keeps peak memory low.
    Otherwise *path* is treated as a single monolithic JSON file and its
    parsed contents are yielded once.

    Yields:
        The object parsed from each JSON file (presumably a list of video
        metadata dicts — confirm against the database format).
    """
    if not os.path.isdir(path):
        # Single-file database: parse, yield, and stop.  The ``with`` block
        # closes the handle deterministically (the original leaked it), and
        # the ``return`` is a bug fix — without it execution fell through to
        # os.listdir(path) below and raised NotADirectoryError on a file.
        with open(path, "r", encoding="utf-8") as infile:
            yield json.load(infile)
        return
    for fi in os.listdir(path):
        # Only pick up the split database files, e.g. "vids0001-0500.json".
        if re.search(r"vids[0-9\-]+?\.json", fi):
            with open(path + "/" + fi, "r", encoding="utf-8") as infile:
                print(fi)  # progress indicator: which split file is loading
                yield json.load(infile)
72 69
73 def reporthook(count: int, block_size: int, total_size: int) -> None: 70 def reporthook(count: int, block_size: int, total_size: int) -> None:
74 global start_time 71 global start_time
75 if count == 0: 72 if count == 0:
90 if not os.path.exists(basename + ".description"): 87 if not os.path.exists(basename + ".description"):
91 with open(basename + ".description", "w", 88 with open(basename + ".description", "w",
92 encoding="utf-8") as descfile: 89 encoding="utf-8") as descfile:
93 descfile.write(i["description"]) 90 descfile.write(i["description"])
94 print(" saved %s" % os.path.basename(descfile.name)) 91 print(" saved %s" % os.path.basename(descfile.name))
95
96
97 ytdl_opts = {
98 "retries": 100,
99 "nooverwrites": True,
100 "call_home": False,
101 "quiet": True,
102 "writeinfojson": True,
103 "writedescription": True,
104 "writethumbnail": True,
105 "writeannotations": True,
106 "writesubtitles": True,
107 "allsubtitles": True,
108 "addmetadata": True,
109 "continuedl": True,
110 "embedthumbnail": True,
111 "format": "bestvideo+bestaudio/best",
112 "restrictfilenames": True,
113 "no_warnings": True,
114 "progress_hooks": [ytdl_hook],
115 "logger": MyLogger(),
116 "ignoreerrors": False,
117 }
118 92
119 93
120 def wayback_machine_dl(video: dict, basename: str) -> int: 94 def wayback_machine_dl(video: dict, basename: str) -> int:
121 try: 95 try:
122 url = ''.join(["https://web.archive.org/web/2oe_/http://wayback-fakeu", 96 url = ''.join(["https://web.archive.org/web/2oe_/http://wayback-fakeu",
142 except Exception as e: 116 except Exception as e:
143 print(" unknown error downloading video!\n") 117 print(" unknown error downloading video!\n")
144 print(e) 118 print(e)
145 return 0 119 return 0
146 120
147 def internet_archive_dl(video: dict, basename: str) -> int: 121
def ia_file_legit(path: str, vidid: str) -> bool:
    """Return True if *path* names a file that belongs to video *vidid*.

    Matches the Internet Archive item layout: an optional "<title>-" prefix,
    the video id, then one of the expected extensions (video containers,
    thumbnail, metadata sidecars).

    Bug fixes vs. the original: ``info\\.json`` inside a raw string compiled
    to regex "literal backslash + any char", so ``*.info.json`` files were
    never matched; the dot in ``annotations.xml`` was unescaped; the id is
    now ``re.escape``d defensively.
    """
    pattern = (r"((?:.+?-)?" + re.escape(vidid) +
               r"\.(?:mp4|jpg|webp|mkv|webm|info\.json|description|"
               r"annotations\.xml))")
    return re.search(pattern, path) is not None
129 def internet_archive_dl(video: dict, basename: str, output: str) -> int:
148 if internetarchive.get_item("youtube-%s" % video["id"]).exists: 130 if internetarchive.get_item("youtube-%s" % video["id"]).exists:
149 fnames = [f.name for f in internetarchive.get_files( 131 flist = [f.name for f in internetarchive.get_files("youtube-%s" % video["id"]) if ia_file_legit(f.name, video["id"])]
150 "youtube-%s" % video["id"])]
151 flist = []
152 for fname in range(len(fnames)):
153 if re.search(''.join([r"((?:.+?-)?", video["id"],
154 r"\.(?:mp4|jpg|webp|mkv|webm|info\\.json|des"
155 r"cription|annotations.xml))"]),
156 fnames[fname]):
157 flist.append(fnames[fname])
158 while True: 132 while True:
159 try: 133 try:
160 internetarchive.download("youtube-%s" % video["id"], 134 internetarchive.download("youtube-%s" % video["id"],
161 files=flist, verbose=True, 135 files=flist, verbose=True,
162 destdir=output, 136 destdir=output,
164 ignore_existing=True, 138 ignore_existing=True,
165 retries=9999) 139 retries=9999)
166 break 140 break
167 except ConnectTimeout: 141 except ConnectTimeout:
168 continue 142 continue
169 except Exception: 143 except Exception as e:
144 print(e)
170 return 0 145 return 0
171 if flist[0][:len(video["id"])] == video["id"]: 146 if flist[0][:len(video["id"])] == video["id"]:
172 for fname in flist: 147 for fname in flist:
173 if os.path.exists("%s/%s" % (output, fname)): 148 if os.path.exists("%s/%s" % (output, fname)):
174 os.replace("%s/%s" % (output, fname), 149 os.replace("%s/%s" % (output, fname),
175 "%s-%s" % (basename.rsplit("-", 1)[0], 150 "%s-%s" % (basename.rsplit("-", 1)[0],
176 fname)) 151 fname))
177 return 1 152 return 1
178 return 0 153 return 0
179 154
155
# youtube-dl option set shared by every download in this script.
# main() mutates it per-channel by assigning "outtmpl" before each download.
ytdl_opts = {
    "retries": 100,            # retry hard on flaky connections
    "nooverwrites": True,      # never clobber files that already exist
    "call_home": False,
    "quiet": True,             # progress is surfaced via ytdl_hook instead
    "writeinfojson": True,     # keep full metadata sidecars next to the video
    "writedescription": True,
    "writethumbnail": True,
    "writeannotations": True,
    "writesubtitles": True,
    "allsubtitles": True,
    "addmetadata": True,
    "continuedl": True,        # resume partially-downloaded files
    "embedthumbnail": True,
    "format": "bestvideo+bestaudio/best",
    "restrictfilenames": True, # ASCII-safe output filenames
    "no_warnings": True,
    "progress_hooks": [ytdl_hook],  # defined earlier in this file
    "logger": MyLogger(),           # defined earlier in this file
    "ignoreerrors": False,
}
177
178
def main():
    """Entry point: download every database video for the given channels.

    For each video record in the (possibly split) JSON database, tries in
    order: youtube-dl from YouTube, then an Internet Archive item, then the
    Wayback Machine.  NOTE(review): indentation below is reconstructed from
    a two-column diff rendering — verify against the original file.
    """
    args = docopt.docopt(__doc__)

    if not os.path.exists(args["--output"]):
        os.mkdir(args["--output"])

    # load_split_files yields one parsed split file at a time (memory cap).
    for f in load_split_files(args["--database"]):
        for i in f:
            # uploader_id is absent on some records; None never matches.
            uploader = i["uploader_id"] if "uploader_id" in i else None
            for url in args["<url>"]:
                channel = url.split("/")[-1]

                output = "%s/%s" % (args["--output"], channel)
                if not os.path.exists(output):
                    os.mkdir(output)
                ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s"

                if uploader == channel:
                    print(uploader, channel)
                    print("%s:" % i["id"])
                    basename = "%s/%s-%s" % (output, sanitize_filename(i["title"],
                                             restricted=True), i["id"])
                    # Skip the video if any container variant already exists.
                    files = [y for p in ["mkv", "mp4", "webm"] for y in list(Path(output).glob(("*-%s." + p) % i["id"]))]
                    if files:
                        print(" video already downloaded!")
                        write_metadata(i, basename)
                        continue
                    # this code is *really* ugly... todo a rewrite?
                    with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
                        try:
                            ytdl.extract_info("https://youtube.com/watch?v=%s"
                                              % i["id"])
                            continue
                        except DownloadError:
                            print(" video is not available! attempting to find In"
                                  "ternet Archive pages of it...")
                        except Exception as e:
                            print(" unknown error downloading video!\n")
                            print(e)
                    if internet_archive_dl(i, basename, output):  # if we can't download from IA
                        continue
                    print(" video does not have a Internet Archive page! attem"
                          "pting to download from the Wayback Machine...")
                    # Retry the Wayback Machine until it reports success (0).
                    while True:
                        if wayback_machine_dl(i, basename) == 0:  # success
                            break
                        time.sleep(5)
                    continue
                    # NOTE(review): unreachable after the `continue` above in
                    # both diff columns — dead code preserved; confirm intent.
                    write_metadata(i, basename)


if __name__ == "__main__":
    main()