changeset 119:196cf2e3d96e

channeldownloader: insane memory optimizations; it should now use at maximum 300 MB if you're using the split JSON files. committer: GitHub <noreply@github.com>
author Paper <37962225+mrpapersonic@users.noreply.github.com>
date Sat, 25 Mar 2023 17:02:23 -0400
parents eac6dae753ca
children 3ecb2e815854
files channeldownloader.py
diffstat 1 files changed, 77 insertions(+), 78 deletions(-) [+]
line wrap: on
line diff
--- a/channeldownloader.py	Fri Mar 03 22:51:28 2023 +0000
+++ b/channeldownloader.py	Sat Mar 25 17:02:23 2023 -0400
@@ -57,17 +57,14 @@
               % (os.path.basename(d["filename"])))
 
 
-def load_split_files(path: str) -> dict:
-    if os.path.isdir(path):
-        result = {"videos": []}
-        for fi in os.listdir(path):
-            for f in re.findall(r"vids[0-9\-]+?\.json", fi):
-                with open(path + "/" + f, "r", encoding="utf-8") as infile:
-                    jsonnn = json.loads(infile.read())
-                    result["videos"].extend(jsonnn)
-        return result
-    else:
-        return json.loads(open(path, "r", encoding="utf-8").read())
+def load_split_files(path: str):
+    if not os.path.isdir(path):
+        yield json.load(open(path, "r", encoding="utf-8"))
+    for fi in os.listdir(path):
+        if re.search(r"vids[0-9\-]+?\.json", fi):
+            with open(path + "/" + fi, "r", encoding="utf-8") as infile:
+                print(fi)
+                yield json.load(infile)
 
 
 def reporthook(count: int, block_size: int, total_size: int) -> None:
@@ -94,29 +91,6 @@
             print(" saved %s" % os.path.basename(descfile.name))
 
 
-ytdl_opts = {
-    "retries": 100,
-    "nooverwrites": True,
-    "call_home": False,
-    "quiet": True,
-    "writeinfojson": True,
-    "writedescription": True,
-    "writethumbnail": True,
-    "writeannotations": True,
-    "writesubtitles": True,
-    "allsubtitles": True,
-    "addmetadata": True,
-    "continuedl": True,
-    "embedthumbnail": True,
-    "format": "bestvideo+bestaudio/best",
-    "restrictfilenames": True,
-    "no_warnings": True,
-    "progress_hooks": [ytdl_hook],
-    "logger": MyLogger(),
-    "ignoreerrors": False,
-}
-
-
 def wayback_machine_dl(video: dict, basename: str) -> int:
     try:
         url = ''.join(["https://web.archive.org/web/2oe_/http://wayback-fakeu",
@@ -144,17 +118,17 @@
         print(e)
         return 0
 
-def internet_archive_dl(video: dict, basename: str) -> int:
+
+def ia_file_legit(path: str, vidid: str) -> bool:
+    return True if re.search(''.join([r"((?:.+?-)?", vidid, r"\.(?:mp4|jpg|web"
+                          r"p|mkv|webm|info\\.json|description|annotations.xml"
+                          "))"]),
+                         path) else False
+
+
+def internet_archive_dl(video: dict, basename: str, output: str) -> int:
     if internetarchive.get_item("youtube-%s" % video["id"]).exists:
-        fnames = [f.name for f in internetarchive.get_files(
-                                  "youtube-%s" % video["id"])]
-        flist = []
-        for fname in range(len(fnames)):
-            if re.search(''.join([r"((?:.+?-)?", video["id"],
-                                  r"\.(?:mp4|jpg|webp|mkv|webm|info\\.json|des"
-                                  r"cription|annotations.xml))"]),
-                                 fnames[fname]):
-                flist.append(fnames[fname])
+        flist = [f.name for f in internetarchive.get_files("youtube-%s" % video["id"]) if ia_file_legit(f.name, video["id"])]
         while True:
             try:
                 internetarchive.download("youtube-%s" % video["id"],
@@ -166,7 +140,8 @@
                 break
             except ConnectTimeout:
                 continue
-            except Exception:
+            except Exception as e:
+                print(e)
                 return 0
         if flist[0][:len(video["id"])] == video["id"]:
             for fname in flist:
@@ -177,47 +152,71 @@
         return 1
     return 0
 
+
+ytdl_opts = {
+    "retries": 100,
+    "nooverwrites": True,
+    "call_home": False,
+    "quiet": True,
+    "writeinfojson": True,
+    "writedescription": True,
+    "writethumbnail": True,
+    "writeannotations": True,
+    "writesubtitles": True,
+    "allsubtitles": True,
+    "addmetadata": True,
+    "continuedl": True,
+    "embedthumbnail": True,
+    "format": "bestvideo+bestaudio/best",
+    "restrictfilenames": True,
+    "no_warnings": True,
+    "progress_hooks": [ytdl_hook],
+    "logger": MyLogger(),
+    "ignoreerrors": False,
+}
+
+
 def main():
     args = docopt.docopt(__doc__)
 
     if not os.path.exists(args["--output"]):
         os.mkdir(args["--output"])
 
-    for i in load_split_files(args["--database"])["videos"]:
-        uploader = i["uploader_id"] if "uploader_id" in i else None
-        for url in args["<url>"]:
-            channel = url.split("/")[-1]
+    for f in load_split_files(args["--database"]):
+        for i in f:
+            uploader = i["uploader_id"] if "uploader_id" in i else None
+            for url in args["<url>"]:
+                channel = url.split("/")[-1]
 
-            output = "%s/%s" % (args["--output"], channel)
-            if not os.path.exists(output):
-                os.mkdir(output)
-            ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s"
+                output = "%s/%s" % (args["--output"], channel)
+                if not os.path.exists(output):
+                    os.mkdir(output)
+                ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s"
 
-            if uploader == channel:
-                print("%s:" % i["id"])
-                basename = "%s/%s-%s" % (output, sanitize_filename(i["title"],
-                                         restricted=True), i["id"])
-                path = Path(output)
-                files = list(path.glob("*-%s.mkv" % i["id"]))
-                files.extend(list(path.glob("*-%s.mp4" % i["id"])))
-                files.extend(list(path.glob("*-%s.webm" % i["id"])))
-                if files:
-                    print(" video already downloaded!")
-                    write_metadata(i, basename)
-                    continue
-                # this code is *really* ugly... todo a rewrite?
-                with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
-                    try:
-                        ytdl.extract_info("https://youtube.com/watch?v=%s"
-                                          % i["id"])
+                if uploader == channel:
+                    print(uploader, channel)
+                    print("%s:" % i["id"])
+                    basename = "%s/%s-%s" % (output, sanitize_filename(i["title"],
+                                             restricted=True), i["id"])
+                    files = [y for p in ["mkv", "mp4", "webm"] for y in list(Path(output).glob(("*-%s." + p) % i["id"]))]
+                    if files:
+                        print(" video already downloaded!")
+                        write_metadata(i, basename)
                         continue
-                    except DownloadError:
-                        print(" video is not available! attempting to find In"
-                              "ternet Archive pages of it...")
-                    except Exception as e:
-                        print(" unknown error downloading video!\n")
-                        print(e)
-                if internet_archive_dl(i, basename) == 0:  # if we can't download from IA
+                    # this code is *really* ugly... todo a rewrite?
+                    with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
+                        try:
+                            ytdl.extract_info("https://youtube.com/watch?v=%s"
+                                              % i["id"])
+                            continue
+                        except DownloadError:
+                            print(" video is not available! attempting to find In"
+                                  "ternet Archive pages of it...")
+                        except Exception as e:
+                            print(" unknown error downloading video!\n")
+                            print(e)
+                    if internet_archive_dl(i, basename, output):  # if we can't download from IA
+                        continue
                     print(" video does not have a Internet Archive page! attem"
                           "pting to download from the Wayback Machine...")
                     while True:
@@ -225,7 +224,7 @@
                             break
                         time.sleep(5)
                         continue
-                write_metadata(i, basename)
+                    write_metadata(i, basename)
 
 
 if __name__ == "__main__":