changeset 61:c615532e6572

Update channeldownloader.py: add option to use the split files instead of the full json
committer: GitHub <noreply@github.com>
author Paper <37962225+mrpapersonic@users.noreply.github.com>
date Sun, 02 Jan 2022 07:09:55 -0500
parents 4e7a9c7c0cce
children 8be9281d7ade
files channeldownloader.py
diffstat 1 files changed, 77 insertions(+), 107 deletions(-) [+]
line wrap: on
line diff
--- a/channeldownloader.py	Fri Nov 19 08:47:32 2021 -0500
+++ b/channeldownloader.py	Sun Jan 02 07:09:55 2022 -0500
@@ -1,12 +1,14 @@
 import argparse
 import internetarchive  # pip install internetarchive
 import json
+import glob
 import os
 import re
 import urllib.request
 import yt_dlp  # pip install yt-dlp
 import itertools
 from urllib.error import HTTPError
+from yt_dlp.utils import sanitize_filename
 
 class MyLogger(object):
     def debug(self, msg):
@@ -18,46 +20,6 @@
     def error(self, msg):
         pass
 
-ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
-                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
-                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
-
-def sanitize_filename(s, restricted=False, is_id=False):
-    # from youtube-dl utils
-    def replace_insane(char):
-        if restricted and char in ACCENT_CHARS:
-            return ACCENT_CHARS[char]
-        if char == '?' or ord(char) < 32 or ord(char) == 127:
-            return ''
-        elif char == '"':
-            return '' if restricted else '\''
-        elif char == ':':
-            return '_-' if restricted else ' -'
-        elif char in '\\/|*<>':
-            return '_'
-        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
-            return '_'
-        if restricted and ord(char) > 127:
-            return '_'
-        return char
-
-    # Handle timestamps
-    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
-    result = ''.join(map(replace_insane, s))
-    if not is_id:
-        while '__' in result:
-            result = result.replace('__', '_')
-        result = result.strip('_')
-        # Common case of "Foreign band name - English song title"
-        if restricted and result.startswith('-_'):
-            result = result[2:]
-        if result.startswith('-'):
-            result = '_' + result[len('-'):]
-        result = result.lstrip('.')
-        if not result:
-            result = '_'
-    return result
-
 def matroska_find(filelist):
     for myfile in filelist:
         if os.path.splitext(myfile)[1] == ".mkv" or os.path.splitext(myfile)[1] == ".webm":
@@ -72,6 +34,17 @@
     if d["status"] == "error":
         print(" an error occurred downloading {0}!")
 
+def load_split_files(path):
+    if os.path.isdir(path):
+        result = {"videos": []}
+        for f in glob.glob(os.path.join(path, "vids*.json")):
+            with open(f, "r", encoding="utf-8") as infile:
+                for i in json.loads(infile.read())["videos"]:
+                    result["videos"].append(i)
+        return result
+    else:
+        return json.load(open(path, "r", encoding="utf-8"))
+
 
 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
 parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True)
@@ -116,71 +89,68 @@
     "ignoreerrors": False,
 }
 
-with open(args.database, "r", encoding="utf-8") as f:
-    data = json.load(f)
-    for i in data["videos"]:
-        try:
-            uploader = i["uploader_id"]
-        except Exception:
-            uploader = "unknown"
-        finally:
-            if uploader == channel:
-                print("{0}:".format(i["id"]))
-                isalreadydownloaded = 0
-                for file in os.listdir(output):
-                    if os.path.splitext(file)[1] == ".json":
-                        if file.find("-" + i["id"] + ".info.json") != -1:
-                            isalreadydownloaded = 1
-                if isalreadydownloaded == 1:  # not sure how to bypass this without having to go out of the for loop, if anyone could tell me how that would be great!
-                    print(" video already downloaded!")
-                    continue
-                with yt_dlp.YoutubeDL(ytdl_opts) as ytdl:
-                    try:
-                        result = ytdl.download(["https://youtube.com/watch?v={0}".format(i["id"])])  # TODO: add check for existing downloaded items and don't download them
+for i in load_split_files(args.database)["videos"]:
+    try:
+        uploader = i["uploader_id"]
+    except Exception:
+        uploader = "unknown"
+    if uploader == channel:
+        print("{0}:".format(i["id"]))
+        isalreadydownloaded = 0
+        for file in os.listdir(output):
+            if os.path.splitext(file)[1] == ".json":
+                if file.find("-" + i["id"] + ".info.json") != -1:
+                    isalreadydownloaded = 1
+        if isalreadydownloaded == 1:  # not sure how to bypass this without having to go out of the for loop, if anyone could tell me how that would be great!
+            print(" video already downloaded!")
+            continue
+        with yt_dlp.YoutubeDL(ytdl_opts) as ytdl:
+            try:
+                result = ytdl.download(["https://youtube.com/watch?v={0}".format(i["id"])])  # TODO: add check for existing downloaded items and don't download them
+                continue
+            except Exception:
+                print(" video is not available! attempting to find Internet Archive pages of it...")
+        if internetarchive.get_item("youtube-{0}".format(i["id"])).exists:  # download from internetarchive if available
+            fnames = [f.name for f in internetarchive.get_files("youtube-{0}".format(i["id"]))]
+            disallowednames = ["__ia_thumb.jpg", "youtube-{0}_archive.torrent".format(i["id"]), "youtube-{0}_files.xml".format(i["id"]), "youtube-{0}_meta.sqlite".format(i["id"]), "youtube-{0}_meta.xml".format(i["id"])]  # list of IA-created files we don't need
+            flist = []
+            for fname in fnames:
+                if matroska_find(fnames):
+                    if fname[-4:] == ".mp4":
+                        continue
+                else:
+                    if fname[-7:] == ".ia.mp4":
                         continue
-                    except Exception:
-                        print(" video is not available! attempting to find Internet Archive pages of it...")
-                if internetarchive.get_item("youtube-{0}".format(i["id"])).exists:  # download from internetarchive if available
-                    fnames = [f.name for f in internetarchive.get_files("youtube-{0}".format(i["id"]))]
-                    disallowednames = ["__ia_thumb.jpg", "youtube-{0}_archive.torrent".format(i["id"]), "youtube-{0}_files.xml".format(i["id"]), "youtube-{0}_meta.sqlite".format(i["id"]), "youtube-{0}_meta.xml".format(i["id"])]  # list of IA-created files we don't need
-                    flist = []
-                    for fname in fnames:
-                        if matroska_find(fnames):
-                            if fname[-4:] == ".mp4":
-                                continue
-                        else:
-                            if fname[-7:] == ".ia.mp4":
-                                continue
-                        if fname.find("/") == -1:
-                            if fname not in disallowednames and fname[-21:] != "{0}_thumb.jpg".format(i["id"]) and fname[-15:] != "{0}.ogv".format(i["id"]):
-                                flist.append(fname)
-                    if len(flist) >= 1:
-                        internetarchive.download("youtube-{0}".format(i["id"]), files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
-                    else:
-                        print(" video already downloaded!")
-                        continue
-                    if os.path.exists(output + "\\" + i["id"] + ".info.json"):  # will always exist no matter which setting was used to download
-                        for fname in flist:
-                            if os.path.exists(output + "\\" + fname) and not os.path.exists(output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname):
-                                os.rename(output + "\\" + fname, output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname)
-                    else:
-                        print("ID file not found!")
-                else:  # download the vid from waybackmachine (NOTE: only tested with youtube links after polymer, however SHOULD work with links created before then)
-                    print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...")
-                    try:
-                        contenttype = urllib.request.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"])).getheader("Content-Type")
-                        if contenttype == "video/webm":
-                            ext = "webm"
-                        else:
-                            ext = "mp4"
-                        urllib.request.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"]), "{3}\\{0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext, output))
-                        print(" downloaded {0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext))
-                    except HTTPError:
-                        print(" video not available on the Wayback Machine!")
-                    except Exception as e:
-                        print(" unknown error downloading video!")
-                        print(e)
-                    # metadata
-                    with open("{2}\\{0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output), "w") as jsonfile:
-                        print(json.dumps(i), end="", file=jsonfile)
-                    print(" saved {0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output))
+                if fname.find("/") == -1:
+                    if fname not in disallowednames and fname[-21:] != "{0}_thumb.jpg".format(i["id"]) and fname[-15:] != "{0}.ogv".format(i["id"]):
+                        flist.append(fname)
+            if len(flist) >= 1:
+                internetarchive.download("youtube-{0}".format(i["id"]), files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
+            else:
+                print(" video already downloaded!")
+                continue
+            if os.path.exists(output + "\\" + i["id"] + ".info.json"):  # will always exist no matter which setting was used to download
+                for fname in flist:
+                    if os.path.exists(output + "\\" + fname) and not os.path.exists(output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname):
+                        os.rename(output + "\\" + fname, output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname)
+            else:
+                print("ID file not found!")
+        else:  # download the vid from waybackmachine (NOTE: only tested with youtube links after polymer, however SHOULD work with links created before then)
+            print(" video does not have an Internet Archive page! attempting to download from the Wayback Machine...")
+            try:
+                contenttype = urllib.request.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"])).getheader("Content-Type")
+                if contenttype == "video/webm":
+                    ext = "webm"
+                else:
+                    ext = "mp4"
+                urllib.request.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"]), "{3}\\{0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext, output))
+                print(" downloaded {0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext))
+            except HTTPError:
+                print(" video not available on the Wayback Machine!")
+            except Exception as e:
+                print(" unknown error downloading video!")
+                print(e)
+            # metadata
+            with open("{2}\\{0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output), "w") as jsonfile:
+                print(json.dumps(i), end="", file=jsonfile)
+            print(" saved {0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output))