changeset 69:63e6bc911606

Use regex instead of weirdness to filter archive.org names
committer: GitHub <noreply@github.com>
author Paper <37962225+mrpapersonic@users.noreply.github.com>
date Wed, 18 May 2022 23:24:03 -0400
parents a43ed076b28f
children eafe13de3f76
files channeldownloader.py
diffstat 1 files changed, 10 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/channeldownloader.py	Wed May 18 20:05:47 2022 -0400
+++ b/channeldownloader.py	Wed May 18 23:24:03 2022 -0400
@@ -2,6 +2,7 @@
 #
 # download deleted vids from old yt channels
 # script by paper
+# it's pretty old and could definitely use some refining
 
 from __future__ import print_function
 import argparse
@@ -10,7 +11,6 @@
     import orjson as json
 except ImportError:
     import json
-import glob
 import os
 import re
 import time
@@ -54,10 +54,11 @@
 def load_split_files(path):
     if os.path.isdir(path):
         result = {"videos": []}
-        for f in glob.glob(os.path.join(path, "vids*.json")):
-            with open(f, "r", encoding="utf-8") as infile:
-                for i in json.loads(infile.read())["videos"]:
-                    result["videos"].append(i)
+        for fi in os.listdir(path):
+            for f in re.findall(r"vids.+?\.json", fi):
+                with open(path + "/" + f, "r", encoding="utf-8") as infile:
+                    for i in json.loads(infile.read())["videos"]:
+                        result["videos"].append(i)
         return result
     else:
         return json.loads(open(path, "r", encoding="utf-8").read())
@@ -68,8 +69,6 @@
         start_time = time.time()
         return
     duration = time.time() - start_time
-    progress_size = int(count * block_size)
-    speed = int(progress_size / (1024 * duration))
     percent = int(count * block_size * 100 / total_size)
     print(" downloading %d%%        \r" % (percent), end="")
 
@@ -121,7 +120,7 @@
     uploader = i["uploader_id"] if "uploader_id" in i else None
     if uploader == channel:
         print("%s:" % i["id"])
-        if os.path.exists(uploader + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"):
+        if os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"):
             print(" video already downloaded!")
             continue
         # this code is *really* ugly... todo a rewrite?
@@ -133,18 +132,10 @@
                 print(" video is not available! attempting to find Internet Archive pages of it...")
         if internetarchive.get_item("youtube-%s" % i["id"]).exists:  # download from internetarchive if available
             fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])]
-            disallowednames = ["__ia_thumb.jpg", "youtube-%s_archive.torrent" % i["id"], "youtube-%s_files.xml" % i["id"], "youtube-%s_meta.sqlite" % i["id"], "youtube-%s_meta.xml" % i["id"]]  # list of IA-created files we don't need
             flist = []
-            for fname in fnames:
-                if os.path.splitext(fname)[1] in [".mkv", ".webm"]:
-                    if fname[-4:] == ".mp4":
-                        continue
-                else:
-                    if fname[-7:] == ".ia.mp4":
-                        continue
-                if fname.find("/") == -1:
-                    if fname not in disallowednames and fname[-21:] != "%s_thumb.jpg" % i["id"] and fname[-15:] != "%s.ogv" % i["id"]:
-                        flist.append(fname)
+            for fname in range(len(fnames)):
+                if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]):
+                    flist.append(fnames[fname])
             if len(flist) >= 1:
                 internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
             else: