changeset 119:196cf2e3d96e
channeldownloader: insane memory optimizations
it should now use at most 300 MB if you're using the split JSON files
committer: GitHub <noreply@github.com>
| author   | Paper <37962225+mrpapersonic@users.noreply.github.com> |
| -------- | ------------------------------------------------------- |
| date     | Sat, 25 Mar 2023 17:02:23 -0400                           |
| parents  | eac6dae753ca                                              |
| children | 3ecb2e815854                                              |
| files    | channeldownloader.py                                      |
| diffstat | 1 files changed, 77 insertions(+), 78 deletions(-)       |
--- a/channeldownloader.py	Fri Mar 03 22:51:28 2023 +0000
+++ b/channeldownloader.py	Sat Mar 25 17:02:23 2023 -0400
@@ -57,17 +57,14 @@
               % (os.path.basename(d["filename"])))
 
 
-def load_split_files(path: str) -> dict:
-    if os.path.isdir(path):
-        result = {"videos": []}
-        for fi in os.listdir(path):
-            for f in re.findall(r"vids[0-9\-]+?\.json", fi):
-                with open(path + "/" + f, "r", encoding="utf-8") as infile:
-                    jsonnn = json.loads(infile.read())
-                    result["videos"].extend(jsonnn)
-        return result
-    else:
-        return json.loads(open(path, "r", encoding="utf-8").read())
+def load_split_files(path: str):
+    if not os.path.isdir(path):
+        yield json.load(open(path, "r", encoding="utf-8"))
+    for fi in os.listdir(path):
+        if re.search(r"vids[0-9\-]+?\.json", fi):
+            with open(path + "/" + fi, "r", encoding="utf-8") as infile:
+                print(fi)
+                yield json.load(infile)
 
 
 def reporthook(count: int, block_size: int, total_size: int) -> None:
@@ -94,29 +91,6 @@
     print("    saved %s" % os.path.basename(descfile.name))
 
 
-ytdl_opts = {
-    "retries": 100,
-    "nooverwrites": True,
-    "call_home": False,
-    "quiet": True,
-    "writeinfojson": True,
-    "writedescription": True,
-    "writethumbnail": True,
-    "writeannotations": True,
-    "writesubtitles": True,
-    "allsubtitles": True,
-    "addmetadata": True,
-    "continuedl": True,
-    "embedthumbnail": True,
-    "format": "bestvideo+bestaudio/best",
-    "restrictfilenames": True,
-    "no_warnings": True,
-    "progress_hooks": [ytdl_hook],
-    "logger": MyLogger(),
-    "ignoreerrors": False,
-}
-
-
 def wayback_machine_dl(video: dict, basename: str) -> int:
     try:
         url = ''.join(["https://web.archive.org/web/2oe_/http://wayback-fakeu",
@@ -144,17 +118,17 @@
         print(e)
         return 0
 
-def internet_archive_dl(video: dict, basename: str) -> int:
+
+def ia_file_legit(path: str, vidid: str) -> bool:
+    return True if re.search(''.join([r"((?:.+?-)?", vidid, r"\.(?:mp4|jpg|web"
+                                      r"p|mkv|webm|info\\.json|description|annotations.xml"
+                                      "))"]),
+                             path) else False
+
+
+def internet_archive_dl(video: dict, basename: str, output: str) -> int:
     if internetarchive.get_item("youtube-%s" % video["id"]).exists:
-        fnames = [f.name for f in internetarchive.get_files(
-            "youtube-%s" % video["id"])]
-        flist = []
-        for fname in range(len(fnames)):
-            if re.search(''.join([r"((?:.+?-)?", video["id"],
-                                  r"\.(?:mp4|jpg|webp|mkv|webm|info\\.json|des"
-                                  r"cription|annotations.xml))"]),
-                         fnames[fname]):
-                flist.append(fnames[fname])
+        flist = [f.name for f in internetarchive.get_files("youtube-%s" % video["id"]) if ia_file_legit(f.name, video["id"])]
         while True:
             try:
                 internetarchive.download("youtube-%s" % video["id"],
@@ -166,7 +140,8 @@
                 break
             except ConnectTimeout:
                 continue
-            except Exception:
+            except Exception as e:
+                print(e)
                 return 0
         if flist[0][:len(video["id"])] == video["id"]:
             for fname in flist:
@@ -177,47 +152,71 @@
                 return 1
     return 0
 
+
+ytdl_opts = {
+    "retries": 100,
+    "nooverwrites": True,
+    "call_home": False,
+    "quiet": True,
+    "writeinfojson": True,
+    "writedescription": True,
+    "writethumbnail": True,
+    "writeannotations": True,
+    "writesubtitles": True,
+    "allsubtitles": True,
+    "addmetadata": True,
+    "continuedl": True,
+    "embedthumbnail": True,
+    "format": "bestvideo+bestaudio/best",
+    "restrictfilenames": True,
+    "no_warnings": True,
+    "progress_hooks": [ytdl_hook],
+    "logger": MyLogger(),
+    "ignoreerrors": False,
+}
+
+
 def main():
     args = docopt.docopt(__doc__)
     if not os.path.exists(args["--output"]):
         os.mkdir(args["--output"])
-    for i in load_split_files(args["--database"])["videos"]:
-        uploader = i["uploader_id"] if "uploader_id" in i else None
-        for url in args["<url>"]:
-            channel = url.split("/")[-1]
+    for f in load_split_files(args["--database"]):
+        for i in f:
+            uploader = i["uploader_id"] if "uploader_id" in i else None
+            for url in args["<url>"]:
+                channel = url.split("/")[-1]
 
-            output = "%s/%s" % (args["--output"], channel)
-            if not os.path.exists(output):
-                os.mkdir(output)
-            ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s"
+                output = "%s/%s" % (args["--output"], channel)
+                if not os.path.exists(output):
+                    os.mkdir(output)
+                ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s"
 
-            if uploader == channel:
-                print("%s:" % i["id"])
-                basename = "%s/%s-%s" % (output, sanitize_filename(i["title"],
-                                         restricted=True), i["id"])
-                path = Path(output)
-                files = list(path.glob("*-%s.mkv" % i["id"]))
-                files.extend(list(path.glob("*-%s.mp4" % i["id"])))
-                files.extend(list(path.glob("*-%s.webm" % i["id"])))
-                if files:
-                    print("    video already downloaded!")
-                    write_metadata(i, basename)
-                    continue
-                # this code is *really* ugly... todo a rewrite?
-                with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
-                    try:
-                        ytdl.extract_info("https://youtube.com/watch?v=%s"
-                                          % i["id"])
-                        continue
-                    except DownloadError:
-                        print("    video is not available! attempting to find In"
-                              "ternet Archive pages of it...")
-                    except Exception as e:
-                        print("    unknown error downloading video!\n")
-                        print(e)
-                if internet_archive_dl(i, basename) == 0:  # if we can't download from IA
+                if uploader == channel:
+                    print(uploader, channel)
+                    print("%s:" % i["id"])
+                    basename = "%s/%s-%s" % (output, sanitize_filename(i["title"],
+                                             restricted=True), i["id"])
+                    files = [y for p in ["mkv", "mp4", "webm"] for y in list(Path(output).glob(("*-%s." + p) % i["id"]))]
+                    if files:
+                        print("    video already downloaded!")
+                        write_metadata(i, basename)
+                        continue
+                    # this code is *really* ugly... todo a rewrite?
+                    with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
+                        try:
+                            ytdl.extract_info("https://youtube.com/watch?v=%s"
+                                              % i["id"])
+                            continue
+                        except DownloadError:
+                            print("    video is not available! attempting to find In"
                                  "ternet Archive pages of it...")
+                        except Exception as e:
+                            print("    unknown error downloading video!\n")
+                            print(e)
+                    if internet_archive_dl(i, basename, output):  # if we can't download from IA
+                        continue
                    print("    video does not have a Internet Archive page! attem"
                          "pting to download from the Wayback Machine...")
                    while True:
@@ -225,7 +224,7 @@
                            break
                        time.sleep(5)
                    continue
-        write_metadata(i, basename)
+            write_metadata(i, basename)
 
 
 if __name__ == "__main__":