Mercurial > codedump
comparison channeldownloader.py @ 119:196cf2e3d96e
channeldownloader: insane memory optimizations
it should now use at most 300 MB of memory if you're using the split JSON files
committer: GitHub <noreply@github.com>
author | Paper <37962225+mrpapersonic@users.noreply.github.com> |
---|---|
date | Sat, 25 Mar 2023 17:02:23 -0400 |
parents | eac6dae753ca |
children | 3ecb2e815854 |
comparison
equal
deleted
inserted
replaced
118:eac6dae753ca | 119:196cf2e3d96e |
---|---|
55 if d["status"] == "error": | 55 if d["status"] == "error": |
56 print("\n an error occurred downloading %s!" | 56 print("\n an error occurred downloading %s!" |
57 % (os.path.basename(d["filename"]))) | 57 % (os.path.basename(d["filename"]))) |
58 | 58 |
59 | 59 |
60 def load_split_files(path: str) -> dict: | 60 def load_split_files(path: str): |
61 if os.path.isdir(path): | 61 if not os.path.isdir(path): |
62 result = {"videos": []} | 62 yield json.load(open(path, "r", encoding="utf-8")) |
63 for fi in os.listdir(path): | 63 for fi in os.listdir(path): |
64 for f in re.findall(r"vids[0-9\-]+?\.json", fi): | 64 if re.search(r"vids[0-9\-]+?\.json", fi): |
65 with open(path + "/" + f, "r", encoding="utf-8") as infile: | 65 with open(path + "/" + fi, "r", encoding="utf-8") as infile: |
66 jsonnn = json.loads(infile.read()) | 66 print(fi) |
67 result["videos"].extend(jsonnn) | 67 yield json.load(infile) |
68 return result | |
69 else: | |
70 return json.loads(open(path, "r", encoding="utf-8").read()) | |
71 | 68 |
72 | 69 |
73 def reporthook(count: int, block_size: int, total_size: int) -> None: | 70 def reporthook(count: int, block_size: int, total_size: int) -> None: |
74 global start_time | 71 global start_time |
75 if count == 0: | 72 if count == 0: |
90 if not os.path.exists(basename + ".description"): | 87 if not os.path.exists(basename + ".description"): |
91 with open(basename + ".description", "w", | 88 with open(basename + ".description", "w", |
92 encoding="utf-8") as descfile: | 89 encoding="utf-8") as descfile: |
93 descfile.write(i["description"]) | 90 descfile.write(i["description"]) |
94 print(" saved %s" % os.path.basename(descfile.name)) | 91 print(" saved %s" % os.path.basename(descfile.name)) |
95 | |
96 | |
97 ytdl_opts = { | |
98 "retries": 100, | |
99 "nooverwrites": True, | |
100 "call_home": False, | |
101 "quiet": True, | |
102 "writeinfojson": True, | |
103 "writedescription": True, | |
104 "writethumbnail": True, | |
105 "writeannotations": True, | |
106 "writesubtitles": True, | |
107 "allsubtitles": True, | |
108 "addmetadata": True, | |
109 "continuedl": True, | |
110 "embedthumbnail": True, | |
111 "format": "bestvideo+bestaudio/best", | |
112 "restrictfilenames": True, | |
113 "no_warnings": True, | |
114 "progress_hooks": [ytdl_hook], | |
115 "logger": MyLogger(), | |
116 "ignoreerrors": False, | |
117 } | |
118 | 92 |
119 | 93 |
120 def wayback_machine_dl(video: dict, basename: str) -> int: | 94 def wayback_machine_dl(video: dict, basename: str) -> int: |
121 try: | 95 try: |
122 url = ''.join(["https://web.archive.org/web/2oe_/http://wayback-fakeu", | 96 url = ''.join(["https://web.archive.org/web/2oe_/http://wayback-fakeu", |
142 except Exception as e: | 116 except Exception as e: |
143 print(" unknown error downloading video!\n") | 117 print(" unknown error downloading video!\n") |
144 print(e) | 118 print(e) |
145 return 0 | 119 return 0 |
146 | 120 |
147 def internet_archive_dl(video: dict, basename: str) -> int: | 121 |
122 def ia_file_legit(path: str, vidid: str) -> bool: | |
123 return True if re.search(''.join([r"((?:.+?-)?", vidid, r"\.(?:mp4|jpg|web" | |
124 r"p|mkv|webm|info\\.json|description|annotations.xml" | |
125 "))"]), | |
126 path) else False | |
127 | |
128 | |
129 def internet_archive_dl(video: dict, basename: str, output: str) -> int: | |
148 if internetarchive.get_item("youtube-%s" % video["id"]).exists: | 130 if internetarchive.get_item("youtube-%s" % video["id"]).exists: |
149 fnames = [f.name for f in internetarchive.get_files( | 131 flist = [f.name for f in internetarchive.get_files("youtube-%s" % video["id"]) if ia_file_legit(f.name, video["id"])] |
150 "youtube-%s" % video["id"])] | |
151 flist = [] | |
152 for fname in range(len(fnames)): | |
153 if re.search(''.join([r"((?:.+?-)?", video["id"], | |
154 r"\.(?:mp4|jpg|webp|mkv|webm|info\\.json|des" | |
155 r"cription|annotations.xml))"]), | |
156 fnames[fname]): | |
157 flist.append(fnames[fname]) | |
158 while True: | 132 while True: |
159 try: | 133 try: |
160 internetarchive.download("youtube-%s" % video["id"], | 134 internetarchive.download("youtube-%s" % video["id"], |
161 files=flist, verbose=True, | 135 files=flist, verbose=True, |
162 destdir=output, | 136 destdir=output, |
164 ignore_existing=True, | 138 ignore_existing=True, |
165 retries=9999) | 139 retries=9999) |
166 break | 140 break |
167 except ConnectTimeout: | 141 except ConnectTimeout: |
168 continue | 142 continue |
169 except Exception: | 143 except Exception as e: |
144 print(e) | |
170 return 0 | 145 return 0 |
171 if flist[0][:len(video["id"])] == video["id"]: | 146 if flist[0][:len(video["id"])] == video["id"]: |
172 for fname in flist: | 147 for fname in flist: |
173 if os.path.exists("%s/%s" % (output, fname)): | 148 if os.path.exists("%s/%s" % (output, fname)): |
174 os.replace("%s/%s" % (output, fname), | 149 os.replace("%s/%s" % (output, fname), |
175 "%s-%s" % (basename.rsplit("-", 1)[0], | 150 "%s-%s" % (basename.rsplit("-", 1)[0], |
176 fname)) | 151 fname)) |
177 return 1 | 152 return 1 |
178 return 0 | 153 return 0 |
179 | 154 |
155 | |
156 ytdl_opts = { | |
157 "retries": 100, | |
158 "nooverwrites": True, | |
159 "call_home": False, | |
160 "quiet": True, | |
161 "writeinfojson": True, | |
162 "writedescription": True, | |
163 "writethumbnail": True, | |
164 "writeannotations": True, | |
165 "writesubtitles": True, | |
166 "allsubtitles": True, | |
167 "addmetadata": True, | |
168 "continuedl": True, | |
169 "embedthumbnail": True, | |
170 "format": "bestvideo+bestaudio/best", | |
171 "restrictfilenames": True, | |
172 "no_warnings": True, | |
173 "progress_hooks": [ytdl_hook], | |
174 "logger": MyLogger(), | |
175 "ignoreerrors": False, | |
176 } | |
177 | |
178 | |
180 def main(): | 179 def main(): |
181 args = docopt.docopt(__doc__) | 180 args = docopt.docopt(__doc__) |
182 | 181 |
183 if not os.path.exists(args["--output"]): | 182 if not os.path.exists(args["--output"]): |
184 os.mkdir(args["--output"]) | 183 os.mkdir(args["--output"]) |
185 | 184 |
186 for i in load_split_files(args["--database"])["videos"]: | 185 for f in load_split_files(args["--database"]): |
187 uploader = i["uploader_id"] if "uploader_id" in i else None | 186 for i in f: |
188 for url in args["<url>"]: | 187 uploader = i["uploader_id"] if "uploader_id" in i else None |
189 channel = url.split("/")[-1] | 188 for url in args["<url>"]: |
190 | 189 channel = url.split("/")[-1] |
191 output = "%s/%s" % (args["--output"], channel) | 190 |
192 if not os.path.exists(output): | 191 output = "%s/%s" % (args["--output"], channel) |
193 os.mkdir(output) | 192 if not os.path.exists(output): |
194 ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s" | 193 os.mkdir(output) |
195 | 194 ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s" |
196 if uploader == channel: | 195 |
197 print("%s:" % i["id"]) | 196 if uploader == channel: |
198 basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], | 197 print(uploader, channel) |
199 restricted=True), i["id"]) | 198 print("%s:" % i["id"]) |
200 path = Path(output) | 199 basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], |
201 files = list(path.glob("*-%s.mkv" % i["id"])) | 200 restricted=True), i["id"]) |
202 files.extend(list(path.glob("*-%s.mp4" % i["id"]))) | 201 files = [y for p in ["mkv", "mp4", "webm"] for y in list(Path(output).glob(("*-%s." + p) % i["id"]))] |
203 files.extend(list(path.glob("*-%s.webm" % i["id"]))) | 202 if files: |
204 if files: | 203 print(" video already downloaded!") |
205 print(" video already downloaded!") | 204 write_metadata(i, basename) |
206 write_metadata(i, basename) | |
207 continue | |
208 # this code is *really* ugly... todo a rewrite? | |
209 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: | |
210 try: | |
211 ytdl.extract_info("https://youtube.com/watch?v=%s" | |
212 % i["id"]) | |
213 continue | 205 continue |
214 except DownloadError: | 206 # this code is *really* ugly... todo a rewrite? |
215 print(" video is not available! attempting to find In" | 207 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: |
216 "ternet Archive pages of it...") | 208 try: |
217 except Exception as e: | 209 ytdl.extract_info("https://youtube.com/watch?v=%s" |
218 print(" unknown error downloading video!\n") | 210 % i["id"]) |
219 print(e) | 211 continue |
220 if internet_archive_dl(i, basename) == 0: # if we can't download from IA | 212 except DownloadError: |
213 print(" video is not available! attempting to find In" | |
214 "ternet Archive pages of it...") | |
215 except Exception as e: | |
216 print(" unknown error downloading video!\n") | |
217 print(e) | |
218 if internet_archive_dl(i, basename, output): # if we can't download from IA | |
219 continue | |
221 print(" video does not have a Internet Archive page! attem" | 220 print(" video does not have a Internet Archive page! attem" |
222 "pting to download from the Wayback Machine...") | 221 "pting to download from the Wayback Machine...") |
223 while True: | 222 while True: |
224 if wayback_machine_dl(i, basename) == 0: # success | 223 if wayback_machine_dl(i, basename) == 0: # success |
225 break | 224 break |
226 time.sleep(5) | 225 time.sleep(5) |
227 continue | 226 continue |
228 write_metadata(i, basename) | 227 write_metadata(i, basename) |
229 | 228 |
230 | 229 |
231 if __name__ == "__main__": | 230 if __name__ == "__main__": |
232 main() | 231 main() |