Mercurial > codedump
comparison channeldownloader.py @ 119:196cf2e3d96e
channeldownloader: insane memory optimizations
It should now use at most 300 MB of memory if you're using the split JSON files.
committer: GitHub <noreply@github.com>
| author | Paper <37962225+mrpapersonic@users.noreply.github.com> |
|---|---|
| date | Sat, 25 Mar 2023 17:02:23 -0400 |
| parents | eac6dae753ca |
| children | 3ecb2e815854 |
comparison
equal
deleted
inserted
replaced
| 118:eac6dae753ca | 119:196cf2e3d96e |
|---|---|
| 55 if d["status"] == "error": | 55 if d["status"] == "error": |
| 56 print("\n an error occurred downloading %s!" | 56 print("\n an error occurred downloading %s!" |
| 57 % (os.path.basename(d["filename"]))) | 57 % (os.path.basename(d["filename"]))) |
| 58 | 58 |
| 59 | 59 |
| 60 def load_split_files(path: str) -> dict: | 60 def load_split_files(path: str): |
| 61 if os.path.isdir(path): | 61 if not os.path.isdir(path): |
| 62 result = {"videos": []} | 62 yield json.load(open(path, "r", encoding="utf-8")) |
| 63 for fi in os.listdir(path): | 63 for fi in os.listdir(path): |
| 64 for f in re.findall(r"vids[0-9\-]+?\.json", fi): | 64 if re.search(r"vids[0-9\-]+?\.json", fi): |
| 65 with open(path + "/" + f, "r", encoding="utf-8") as infile: | 65 with open(path + "/" + fi, "r", encoding="utf-8") as infile: |
| 66 jsonnn = json.loads(infile.read()) | 66 print(fi) |
| 67 result["videos"].extend(jsonnn) | 67 yield json.load(infile) |
| 68 return result | |
| 69 else: | |
| 70 return json.loads(open(path, "r", encoding="utf-8").read()) | |
| 71 | 68 |
| 72 | 69 |
| 73 def reporthook(count: int, block_size: int, total_size: int) -> None: | 70 def reporthook(count: int, block_size: int, total_size: int) -> None: |
| 74 global start_time | 71 global start_time |
| 75 if count == 0: | 72 if count == 0: |
| 90 if not os.path.exists(basename + ".description"): | 87 if not os.path.exists(basename + ".description"): |
| 91 with open(basename + ".description", "w", | 88 with open(basename + ".description", "w", |
| 92 encoding="utf-8") as descfile: | 89 encoding="utf-8") as descfile: |
| 93 descfile.write(i["description"]) | 90 descfile.write(i["description"]) |
| 94 print(" saved %s" % os.path.basename(descfile.name)) | 91 print(" saved %s" % os.path.basename(descfile.name)) |
| 95 | |
| 96 | |
| 97 ytdl_opts = { | |
| 98 "retries": 100, | |
| 99 "nooverwrites": True, | |
| 100 "call_home": False, | |
| 101 "quiet": True, | |
| 102 "writeinfojson": True, | |
| 103 "writedescription": True, | |
| 104 "writethumbnail": True, | |
| 105 "writeannotations": True, | |
| 106 "writesubtitles": True, | |
| 107 "allsubtitles": True, | |
| 108 "addmetadata": True, | |
| 109 "continuedl": True, | |
| 110 "embedthumbnail": True, | |
| 111 "format": "bestvideo+bestaudio/best", | |
| 112 "restrictfilenames": True, | |
| 113 "no_warnings": True, | |
| 114 "progress_hooks": [ytdl_hook], | |
| 115 "logger": MyLogger(), | |
| 116 "ignoreerrors": False, | |
| 117 } | |
| 118 | 92 |
| 119 | 93 |
| 120 def wayback_machine_dl(video: dict, basename: str) -> int: | 94 def wayback_machine_dl(video: dict, basename: str) -> int: |
| 121 try: | 95 try: |
| 122 url = ''.join(["https://web.archive.org/web/2oe_/http://wayback-fakeu", | 96 url = ''.join(["https://web.archive.org/web/2oe_/http://wayback-fakeu", |
| 142 except Exception as e: | 116 except Exception as e: |
| 143 print(" unknown error downloading video!\n") | 117 print(" unknown error downloading video!\n") |
| 144 print(e) | 118 print(e) |
| 145 return 0 | 119 return 0 |
| 146 | 120 |
| 147 def internet_archive_dl(video: dict, basename: str) -> int: | 121 |
| 122 def ia_file_legit(path: str, vidid: str) -> bool: | |
| 123 return True if re.search(''.join([r"((?:.+?-)?", vidid, r"\.(?:mp4|jpg|web" | |
| 124 r"p|mkv|webm|info\\.json|description|annotations.xml" | |
| 125 "))"]), | |
| 126 path) else False | |
| 127 | |
| 128 | |
| 129 def internet_archive_dl(video: dict, basename: str, output: str) -> int: | |
| 148 if internetarchive.get_item("youtube-%s" % video["id"]).exists: | 130 if internetarchive.get_item("youtube-%s" % video["id"]).exists: |
| 149 fnames = [f.name for f in internetarchive.get_files( | 131 flist = [f.name for f in internetarchive.get_files("youtube-%s" % video["id"]) if ia_file_legit(f.name, video["id"])] |
| 150 "youtube-%s" % video["id"])] | |
| 151 flist = [] | |
| 152 for fname in range(len(fnames)): | |
| 153 if re.search(''.join([r"((?:.+?-)?", video["id"], | |
| 154 r"\.(?:mp4|jpg|webp|mkv|webm|info\\.json|des" | |
| 155 r"cription|annotations.xml))"]), | |
| 156 fnames[fname]): | |
| 157 flist.append(fnames[fname]) | |
| 158 while True: | 132 while True: |
| 159 try: | 133 try: |
| 160 internetarchive.download("youtube-%s" % video["id"], | 134 internetarchive.download("youtube-%s" % video["id"], |
| 161 files=flist, verbose=True, | 135 files=flist, verbose=True, |
| 162 destdir=output, | 136 destdir=output, |
| 164 ignore_existing=True, | 138 ignore_existing=True, |
| 165 retries=9999) | 139 retries=9999) |
| 166 break | 140 break |
| 167 except ConnectTimeout: | 141 except ConnectTimeout: |
| 168 continue | 142 continue |
| 169 except Exception: | 143 except Exception as e: |
| 144 print(e) | |
| 170 return 0 | 145 return 0 |
| 171 if flist[0][:len(video["id"])] == video["id"]: | 146 if flist[0][:len(video["id"])] == video["id"]: |
| 172 for fname in flist: | 147 for fname in flist: |
| 173 if os.path.exists("%s/%s" % (output, fname)): | 148 if os.path.exists("%s/%s" % (output, fname)): |
| 174 os.replace("%s/%s" % (output, fname), | 149 os.replace("%s/%s" % (output, fname), |
| 175 "%s-%s" % (basename.rsplit("-", 1)[0], | 150 "%s-%s" % (basename.rsplit("-", 1)[0], |
| 176 fname)) | 151 fname)) |
| 177 return 1 | 152 return 1 |
| 178 return 0 | 153 return 0 |
| 179 | 154 |
| 155 | |
| 156 ytdl_opts = { | |
| 157 "retries": 100, | |
| 158 "nooverwrites": True, | |
| 159 "call_home": False, | |
| 160 "quiet": True, | |
| 161 "writeinfojson": True, | |
| 162 "writedescription": True, | |
| 163 "writethumbnail": True, | |
| 164 "writeannotations": True, | |
| 165 "writesubtitles": True, | |
| 166 "allsubtitles": True, | |
| 167 "addmetadata": True, | |
| 168 "continuedl": True, | |
| 169 "embedthumbnail": True, | |
| 170 "format": "bestvideo+bestaudio/best", | |
| 171 "restrictfilenames": True, | |
| 172 "no_warnings": True, | |
| 173 "progress_hooks": [ytdl_hook], | |
| 174 "logger": MyLogger(), | |
| 175 "ignoreerrors": False, | |
| 176 } | |
| 177 | |
| 178 | |
| 180 def main(): | 179 def main(): |
| 181 args = docopt.docopt(__doc__) | 180 args = docopt.docopt(__doc__) |
| 182 | 181 |
| 183 if not os.path.exists(args["--output"]): | 182 if not os.path.exists(args["--output"]): |
| 184 os.mkdir(args["--output"]) | 183 os.mkdir(args["--output"]) |
| 185 | 184 |
| 186 for i in load_split_files(args["--database"])["videos"]: | 185 for f in load_split_files(args["--database"]): |
| 187 uploader = i["uploader_id"] if "uploader_id" in i else None | 186 for i in f: |
| 188 for url in args["<url>"]: | 187 uploader = i["uploader_id"] if "uploader_id" in i else None |
| 189 channel = url.split("/")[-1] | 188 for url in args["<url>"]: |
| 190 | 189 channel = url.split("/")[-1] |
| 191 output = "%s/%s" % (args["--output"], channel) | 190 |
| 192 if not os.path.exists(output): | 191 output = "%s/%s" % (args["--output"], channel) |
| 193 os.mkdir(output) | 192 if not os.path.exists(output): |
| 194 ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s" | 193 os.mkdir(output) |
| 195 | 194 ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s" |
| 196 if uploader == channel: | 195 |
| 197 print("%s:" % i["id"]) | 196 if uploader == channel: |
| 198 basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], | 197 print(uploader, channel) |
| 199 restricted=True), i["id"]) | 198 print("%s:" % i["id"]) |
| 200 path = Path(output) | 199 basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], |
| 201 files = list(path.glob("*-%s.mkv" % i["id"])) | 200 restricted=True), i["id"]) |
| 202 files.extend(list(path.glob("*-%s.mp4" % i["id"]))) | 201 files = [y for p in ["mkv", "mp4", "webm"] for y in list(Path(output).glob(("*-%s." + p) % i["id"]))] |
| 203 files.extend(list(path.glob("*-%s.webm" % i["id"]))) | 202 if files: |
| 204 if files: | 203 print(" video already downloaded!") |
| 205 print(" video already downloaded!") | 204 write_metadata(i, basename) |
| 206 write_metadata(i, basename) | |
| 207 continue | |
| 208 # this code is *really* ugly... todo a rewrite? | |
| 209 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: | |
| 210 try: | |
| 211 ytdl.extract_info("https://youtube.com/watch?v=%s" | |
| 212 % i["id"]) | |
| 213 continue | 205 continue |
| 214 except DownloadError: | 206 # this code is *really* ugly... todo a rewrite? |
| 215 print(" video is not available! attempting to find In" | 207 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: |
| 216 "ternet Archive pages of it...") | 208 try: |
| 217 except Exception as e: | 209 ytdl.extract_info("https://youtube.com/watch?v=%s" |
| 218 print(" unknown error downloading video!\n") | 210 % i["id"]) |
| 219 print(e) | 211 continue |
| 220 if internet_archive_dl(i, basename) == 0: # if we can't download from IA | 212 except DownloadError: |
| 213 print(" video is not available! attempting to find In" | |
| 214 "ternet Archive pages of it...") | |
| 215 except Exception as e: | |
| 216 print(" unknown error downloading video!\n") | |
| 217 print(e) | |
| 218 if internet_archive_dl(i, basename, output): # if we can't download from IA | |
| 219 continue | |
| 221 print(" video does not have a Internet Archive page! attem" | 220 print(" video does not have a Internet Archive page! attem" |
| 222 "pting to download from the Wayback Machine...") | 221 "pting to download from the Wayback Machine...") |
| 223 while True: | 222 while True: |
| 224 if wayback_machine_dl(i, basename) == 0: # success | 223 if wayback_machine_dl(i, basename) == 0: # success |
| 225 break | 224 break |
| 226 time.sleep(5) | 225 time.sleep(5) |
| 227 continue | 226 continue |
| 228 write_metadata(i, basename) | 227 write_metadata(i, basename) |
| 229 | 228 |
| 230 | 229 |
| 231 if __name__ == "__main__": | 230 if __name__ == "__main__": |
| 232 main() | 231 main() |
