changeset 114:80bd4a99ea00

Update channeldownloader.py
committer: GitHub <noreply@github.com>
author Paper <37962225+mrpapersonic@users.noreply.github.com>
date Sat, 21 Jan 2023 15:26:34 -0500
parents a972dc788da0
children f10492e8720b
files channeldownloader.py
diffstat 1 files changed, 105 insertions(+), 88 deletions(-) [+]
line wrap: on
line diff
--- a/channeldownloader.py	Sat Jan 21 13:34:04 2023 -0500
+++ b/channeldownloader.py	Sat Jan 21 15:26:34 2023 -0500
@@ -1,11 +1,22 @@
 #!/usr/bin/env python3
-#
-# download deleted vids from old yt channels
-# script by paper
-# it's pretty old and could definitely use some refining
+"""
+Usage:
+  channeldownloader.py <url>... (--database <file>)
+                                [--output <folder>]
+                                [--proxy <proxy>]
+  channeldownloader.py -h | --help
 
+Arguments:
+  <url>                        YouTube channel URL to download from
+
+Options:
+  -h --help                    Show this screen
+  -o --output <folder>         Output folder, relative to the current directory
+                               [default: .]
+  -d --database <file>         JSON database (https://finnrepo.a2hosted.com/YTPMV_Database)
+"""
 from __future__ import print_function
-import argparse
+import docopt
 import internetarchive
 try:
     import orjson as json
@@ -22,11 +33,11 @@
     from urllib2 import HTTPError
 try:
     import yt_dlp as youtube_dl
-    from yt_dlp.utils import sanitize_filename
+    from yt_dlp.utils import sanitize_filename, DownloadError
 except ImportError:
     try:
         import youtube_dl
-        from youtube_dl.utils import sanitize_filename
+        from youtube_dl.utils import sanitize_filename, DownloadError
     except ImportError:
         print("ERROR: youtube-dl/yt-dlp not installed!")
         exit(1)
@@ -41,24 +52,25 @@
         pass
 
     def error(self, msg):
+        print(" " + msg)
         pass
 
 def ytdl_hook(d):
     if d["status"] == "finished":
-        print(" downloaded %s:    100% " % (os.path.basename(d["filename"])))
+        print(" downloaded %s:    100%% " % (os.path.basename(d["filename"])))
     if d["status"] == "downloading":
         print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="")
     if d["status"] == "error":
-        print(" an error occurred downloading {0}!")
+        print("\n an error occurred downloading %s!" % (os.path.basename(d["filename"])))
 
 def load_split_files(path):
     if os.path.isdir(path):
         result = {"videos": []}
         for fi in os.listdir(path):
-            for f in re.findall(r"vids.+?\.json", fi):
+            for f in re.findall(r"vids[0-9\-]+?\.json", fi):
                 with open(path + "/" + f, "r", encoding="utf-8") as infile:
-                    for i in json.loads(infile.read())["videos"]:
-                        result["videos"].append(i)
+                    jsonnn = json.loads(infile.read())
+                    result["videos"].extend(jsonnn)
         return result
     else:
         return json.loads(open(path, "r", encoding="utf-8").read())
@@ -72,28 +84,9 @@
     percent = int(count * block_size * 100 / total_size)
     print(" downloading %d%%        \r" % (percent), end="")
 
-
-parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
-parser.add_argument("-c", "--channel", help="channel URL", metavar="<url>", required=True)
-parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar="<path>", required=True)
-parser.add_argument("-o", "--output", help="output directory, defaults to the channel ID", metavar="<output>")
-args = parser.parse_args()
-
-if args.channel[:8] == "https://" or args.channel[:7] == "http://":
-    channel = args.channel.split("/")[-1]
-else:
-    channel = args.channel
-
-if args.output:
-    output = args.output
-else:
-    output = channel
-
-if not os.path.exists(output):
-    os.mkdir(output)
+args = docopt.docopt(__doc__)
 
 ytdl_opts = {
-    "outtmpl": output + "/%(title)s-%(id)s.%(ext)s",
     "retries": 100,
     "nooverwrites": True,
     "call_home": False,
@@ -116,65 +109,89 @@
     "ignoreerrors": False,
 }
 
-for i in load_split_files(args.database)["videos"]:
+if not os.path.exists(args["--output"]):
+    os.mkdir(args["--output"])
+
+for i in load_split_files(args["--database"])["videos"]:
     uploader = i["uploader_id"] if "uploader_id" in i else None
-    if uploader == channel:
-        print("%s:" % i["id"])
-        # :skull:
-        # todo: put this in a function?
-        if any(x in os.listdir(output) for x in [sanitize_filename(i["title"] + "-" + i["id"] + ".mp4", restricted=True),
-                                                 sanitize_filename(i["title"] + "-" + i["id"] + ".mkv", restricted=True),
-                                                 sanitize_filename(i["title"] + "-" + i["id"] + ".webm", restricted=True)]):
-            print(" video already downloaded!")
-            continue
-        # this code is *really* ugly... todo a rewrite?
-        with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
-            try:
-                result = ytdl.download(["https://youtube.com/watch?v=%s" % i["id"]])  # TODO: add check for existing downloaded items and don't download them
-                continue
-            except Exception:
-                print(" video is not available! attempting to find Internet Archive pages of it...")
-        if internetarchive.get_item("youtube-%s" % i["id"]).exists:  # download from internetarchive if available
-            fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])]
-            flist = []
-            for fname in range(len(fnames)):
-                if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]):
-                    flist.append(fnames[fname])
-            if len(flist) >= 1:
-                internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
-            else:
+    for url in args["<url>"]:
+        channel = url.split("/")[-1]
+
+        output = "%s/%s" % (args["--output"], channel)
+        if not os.path.exists(output):
+            os.mkdir(output)
+        ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s"
+        
+
+        if uploader == channel:
+            print("%s:" % i["id"])
+            # :skull:
+            # todo: put this in a function?
+            if any(x in os.listdir(output) for x in [sanitize_filename("%s-%s.mp4"  % (i["title"], i["id"]), restricted=True),
+                                                     sanitize_filename("%s-%s.mkv"  % (i["title"], i["id"]), restricted=True),
+                                                     sanitize_filename("%s-%s.webm" % (i["title"], i["id"]), restricted=True)]):
                 print(" video already downloaded!")
                 continue
-            if os.path.exists(output + "/" + i["id"] + ".info.json"):  # will always exist no matter which setting was used to download
-                for fname in flist:
-                    if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname):
-                        os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname)
-            else:
-                print("ID file not found!")
-        else:
-            print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...")
-            try:  # we could use yt-dlp's extractor, but then we would need to craft a fake wayback machine url,
-                  # and we wouldn't even know if it worked. so let's continue using our little "hack"
-                headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"])
-                if hasattr(headers.info(), "getheader"):
-                    contenttype = headers.info().getheader("Content-Type")
+            # this code is *really* ugly... todo a rewrite?
+            with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
+                try:
+                    result = ytdl.extract_info("https://youtube.com/watch?v=%s" % i["id"])
+                    continue
+                except DownloadError:
+                    print(" video is not available! attempting to find Internet Archive pages of it...")
+                except Exception as e:
+                    print(" unknown error downloading video!\n")
+                    print(e)
+            if internetarchive.get_item("youtube-%s" % i["id"]).exists:  # download from internetarchive if available
+                fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])]
+                flist = []
+                for fname in range(len(fnames)):
+                    if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]):
+                        flist.append(fnames[fname])
+                if len(flist) >= 1:
+                    internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True, retries=9999)
+                else:
+                    print(" video already downloaded!")
+                    continue
+                if os.path.exists("%s/%s.info.json" % (output, i["id"])):  # will always exist no matter which setting was used to download
+                    for fname in flist:
+                        if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname):
+                            os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname)
                 else:
-                    contenttype = headers.getheader("Content-Type")
-                if contenttype == "video/webm":
-                    ext = "webm"
-                elif contenttype == "video/mp4":
-                    ext = "mp4"
-                else:
-                    raise HTTPError(url=None, code=None, msg=None, hdrs=None, fp=None)
-                compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext), reporthook)
-                print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext))
-            except HTTPError:
-                print(" video not available on the Wayback Machine!")
-            except Exception as e:
-                print(" unknown error downloading video!\n")
-                print(e)
+                    print("ID file not found!")
+            else:
+                print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...")
+                try:  # we could use yt-dlp's extractor, but then we would need to craft a fake wayback machine url,
+                      # and we wouldn't even know if it worked. so let's continue using our little "hack"
+                    headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"])
+                    if hasattr(headers.info(), "getheader"):
+                        contenttype = headers.info().getheader("Content-Type")
+                    else:
+                        contenttype = headers.getheader("Content-Type")
+                    if contenttype == "video/webm":
+                        ext = "webm"
+                    elif contenttype == "video/mp4":
+                        ext = "mp4"
+                    else:
+                        raise HTTPError(url=None, code=None, msg=None, hdrs=None, fp=None)
+                    compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext), reporthook)
+                    print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext))
+                except HTTPError:
+                    print(" video not available on the Wayback Machine!")
+                except Exception as e:
+                    print(" unknown error downloading video!\n")
+                    print(e)
             # metadata
-            with open("%s/%s-%s.info.json" % (output, sanitize_filename(i["title"], restricted=True), i["id"]), "w", encoding="utf-8") as jsonfile:
-                jsonfile.write(json.dumps(i).decode("utf-8"))
-                print(" saved %s" % os.path.basename(jsonfile.name))
+            basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"])
+            if not os.path.exists(basename + ".info.json"):
+                with open(basename + ".info.json", "w", encoding="utf-8") as jsonfile:
+                    try:
+                        jsonfile.write(json.dumps(i).decode("utf-8"))
+                    except AttributeError:
+                        jsonfile.write(json.dumps(i))
+                    print(" saved %s" % os.path.basename(jsonfile.name))
+            if not os.path.exists(basename + ".description"):
+                with open(basename + ".description", "w", encoding="utf-8") as descfile:
+                    descfile.write(i["description"])
+                    print(" saved %s" % os.path.basename(descfile.name))