codedump: channeldownloader.py comparison

comparison channeldownloader.py @ 119:196cf2e3d96e

channeldownloader: insane memory optimizations. It should now use at most 300 MB if you're using the split JSON files. committer: GitHub <noreply@github.com>

author	Paper <37962225+mrpapersonic@users.noreply.github.com>
date	Sat, 25 Mar 2023 17:02:23 -0400
parents	eac6dae753ca
children	3ecb2e815854

comparison

equal deleted inserted replaced

-:eac6dae753ca
+:196cf2e3d96e
 if d["status"] == "error":
 print("\n an error occurred downloading %s!"
 % (os.path.basename(d["filename"])))
-def load_split_files(path: str) -> dict:
+def load_split_files(path: str):
-if os.path.isdir(path):
+if not os.path.isdir(path):
-result = {"videos": []}
+yield json.load(open(path, "r", encoding="utf-8"))
 for fi in os.listdir(path):
-for f in re.findall(r"vids[0-9\-]+?\.json", fi):
+if re.search(r"vids[0-9\-]+?\.json", fi):
-with open(path + "/" + f, "r", encoding="utf-8") as infile:
+with open(path + "/" + fi, "r", encoding="utf-8") as infile:
-jsonnn = json.loads(infile.read())
+print(fi)
-result["videos"].extend(jsonnn)
+yield json.load(infile)
-return result
-else:
-return json.loads(open(path, "r", encoding="utf-8").read())
 def reporthook(count: int, block_size: int, total_size: int) -> None:
 global start_time
 if count == 0:
 if not os.path.exists(basename + ".description"):
 with open(basename + ".description", "w",
 encoding="utf-8") as descfile:
 descfile.write(i["description"])
 print(" saved %s" % os.path.basename(descfile.name))
-ytdl_opts = {
-"retries": 100,
-"nooverwrites": True,
-"call_home": False,
-"quiet": True,
-"writeinfojson": True,
-"writedescription": True,
-"writethumbnail": True,
-"writeannotations": True,
-"writesubtitles": True,
-"allsubtitles": True,
-"addmetadata": True,
-"continuedl": True,
-"embedthumbnail": True,
-"format": "bestvideo+bestaudio/best",
-"restrictfilenames": True,
-"no_warnings": True,
-"progress_hooks": [ytdl_hook],
-"logger": MyLogger(),
-"ignoreerrors": False,
-}
 def wayback_machine_dl(video: dict, basename: str) -> int:
 try:
 url = ''.join(["https://web.archive.org/web/2oe_/http://wayback-fakeu",
 except Exception as e:
 print(" unknown error downloading video!\n")
 print(e)
 return 0
-def internet_archive_dl(video: dict, basename: str) -> int:
+def ia_file_legit(path: str, vidid: str) -> bool:
+return True if re.search(''.join([r"((?:.+?-)?", vidid, r"\.(?:mp4|jpg|web"
+r"p|mkv|webm|info\\.json|description|annotations.xml"
+"))"]),
+path) else False
+def internet_archive_dl(video: dict, basename: str, output: str) -> int:
 if internetarchive.get_item("youtube-%s" % video["id"]).exists:
-fnames = [f.name for f in internetarchive.get_files(
+flist = [f.name for f in internetarchive.get_files("youtube-%s" % video["id"]) if ia_file_legit(f.name, video["id"])]
-"youtube-%s" % video["id"])]
-flist = []
-for fname in range(len(fnames)):
-if re.search(''.join([r"((?:.+?-)?", video["id"],
-r"\.(?:mp4|jpg|webp|mkv|webm|info\\.json|des"
-r"cription|annotations.xml))"]),
-fnames[fname]):
-flist.append(fnames[fname])
 while True:
 try:
 internetarchive.download("youtube-%s" % video["id"],
 files=flist, verbose=True,
 destdir=output,
 ignore_existing=True,
 retries=9999)
 break
 except ConnectTimeout:
 continue
-except Exception:
+except Exception as e:
+print(e)
 return 0
 if flist[0][:len(video["id"])] == video["id"]:
 for fname in flist:
 if os.path.exists("%s/%s" % (output, fname)):
 os.replace("%s/%s" % (output, fname),
 "%s-%s" % (basename.rsplit("-", 1)[0],
 fname))
 return 1
 return 0
+ytdl_opts = {
+"retries": 100,
+"nooverwrites": True,
+"call_home": False,
+"quiet": True,
+"writeinfojson": True,
+"writedescription": True,
+"writethumbnail": True,
+"writeannotations": True,
+"writesubtitles": True,
+"allsubtitles": True,
+"addmetadata": True,
+"continuedl": True,
+"embedthumbnail": True,
+"format": "bestvideo+bestaudio/best",
+"restrictfilenames": True,
+"no_warnings": True,
+"progress_hooks": [ytdl_hook],
+"logger": MyLogger(),
+"ignoreerrors": False,
+}
 def main():
 args = docopt.docopt(__doc__)
 if not os.path.exists(args["--output"]):
 os.mkdir(args["--output"])
-for i in load_split_files(args["--database"])["videos"]:
+for f in load_split_files(args["--database"]):
-uploader = i["uploader_id"] if "uploader_id" in i else None
+for i in f:
-for url in args["<url>"]:
+uploader = i["uploader_id"] if "uploader_id" in i else None
-channel = url.split("/")[-1]
+for url in args["<url>"]:
+channel = url.split("/")[-1]
-output = "%s/%s" % (args["--output"], channel)
-if not os.path.exists(output):
+output = "%s/%s" % (args["--output"], channel)
-os.mkdir(output)
+if not os.path.exists(output):
-ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s"
+os.mkdir(output)
+ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s"
-if uploader == channel:
-print("%s:" % i["id"])
+if uploader == channel:
-basename = "%s/%s-%s" % (output, sanitize_filename(i["title"],
+print(uploader, channel)
-restricted=True), i["id"])
+print("%s:" % i["id"])
-path = Path(output)
+basename = "%s/%s-%s" % (output, sanitize_filename(i["title"],
-files = list(path.glob("*-%s.mkv" % i["id"]))
+restricted=True), i["id"])
-files.extend(list(path.glob("*-%s.mp4" % i["id"])))
+files = [y for p in ["mkv", "mp4", "webm"] for y in list(Path(output).glob(("*-%s." + p) % i["id"]))]
-files.extend(list(path.glob("*-%s.webm" % i["id"])))
+if files:
-if files:
+print(" video already downloaded!")
-print(" video already downloaded!")
+write_metadata(i, basename)
-write_metadata(i, basename)
-continue
-# this code is *really* ugly... todo a rewrite?
-with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
-try:
-ytdl.extract_info("https://youtube.com/watch?v=%s"
-% i["id"])
 continue
-except DownloadError:
+# this code is *really* ugly... todo a rewrite?
-print(" video is not available! attempting to find In"
+with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
-"ternet Archive pages of it...")
+try:
-except Exception as e:
+ytdl.extract_info("https://youtube.com/watch?v=%s"
-print(" unknown error downloading video!\n")
+% i["id"])
-print(e)
+continue
-if internet_archive_dl(i, basename) == 0:  # if we can't download from IA
+except DownloadError:
+print(" video is not available! attempting to find In"
+"ternet Archive pages of it...")
+except Exception as e:
+print(" unknown error downloading video!\n")
+print(e)
+if internet_archive_dl(i, basename, output):  # if we can't download from IA
+continue
 print(" video does not have a Internet Archive page! attem"
 "pting to download from the Wayback Machine...")
 while True:
 if wayback_machine_dl(i, basename) == 0:  # success
 break
 time.sleep(5)
 continue
 write_metadata(i, basename)
 if __name__ == "__main__":
 main()

Mercurial > codedump

comparison channeldownloader.py @ 119:196cf2e3d96e