Mercurial > codedump
changeset 67:9636d5dee08c
[channeldownloader.py] Python 2.7 compatibility
Also make the code a *lot* more optimized
(e.g. removing the unnecessary double for-loop)
committer: GitHub <noreply@github.com>
author:   Paper <37962225+mrpapersonic@users.noreply.github.com>
date:     Wed, 18 May 2022 18:57:58 -0400
parents:  ff473892908c
children: a43ed076b28f
files:    channeldownloader.py
diffstat: 1 files changed, 61 insertions(+), 51 deletions(-)
line wrap: on
line diff
--- a/channeldownloader.py Mon Apr 25 02:11:46 2022 -0400 +++ b/channeldownloader.py Wed May 18 18:57:58 2022 -0400 @@ -1,14 +1,36 @@ +#!/usr/bin/env python3 +# +# download deleted vids from old yt channels +# script by paper + +from __future__ import print_function import argparse -import internetarchive # pip install internetarchive -import json +import internetarchive +try: + import orjson as json +except ImportError: + import json import glob import os import re -import urllib.request -import yt_dlp # pip install yt-dlp -import itertools -from urllib.error import HTTPError -from yt_dlp.utils import sanitize_filename +try: + import urllib.request as compat_urllib + from urllib.error import HTTPError +except ImportError: # Python 2 + import urllib as compat_urllib + from urllib2 import HTTPError +try: + import yt_dlp as youtube_dl + from yt_dlp.utils import sanitize_filename +except ImportError: + try: + import youtube_dl + from youtube_dl.utils import sanitize_filename + except ImportError: + print("ERROR: youtube-dl/yt-dlp not installed!") + exit(1) +from io import open # for Python 2 compatibility, in Python 3 this + # just maps to the built-in function class MyLogger(object): def debug(self, msg): @@ -20,17 +42,11 @@ def error(self, msg): pass -def matroska_find(filelist): - for myfile in filelist: - if os.path.splitext(myfile)[1] == ".mkv" or os.path.splitext(myfile)[1] == ".webm": - return True - return False - def ytdl_hook(d): if d["status"] == "finished": - print(" downloaded {0}: 100% ".format(os.path.basename(d["filename"]))) + print(" downloaded %s: 100% " % (os.path.basename(d["filename"]))) if d["status"] == "downloading": - print(" downloading {0}: {1}\r".format(os.path.basename(d["filename"]), d["_percent_str"]), end="") + print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="") if d["status"] == "error": print(" an error occurred downloading {0}!") @@ -43,7 +59,7 @@ result["videos"].append(i) return result 
else: - return json.loads(open(path, "r", encoding="utf-8")) + return json.loads(open(path, "r", encoding="utf-8").read()) parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") @@ -66,7 +82,7 @@ os.mkdir(output) ytdl_opts = { - "outtmpl": "{0}/%(title)s-%(id)s.%(ext)s".format(output), + "outtmpl": "%s/%(title)s-%(id)s.%(ext)s" % (output), "retries": 100, "nooverwrites": True, "call_home": False, @@ -89,68 +105,62 @@ "ignoreerrors": False, } -for i in load_split_files(args.database): - try: - uploader = i["uploader_id"] - except Exception: - uploader = "unknown" +for i in load_split_files(args.database)["videos"]: + uploader = i["uploader_id"] if "uploader_id" in i else None if uploader == channel: - print("{0}:".format(i["id"])) - isalreadydownloaded = 0 - for file in os.listdir(output): - if os.path.splitext(file)[1] == ".json": - if file.find("-" + i["id"] + ".info.json") != -1: - isalreadydownloaded = 1 - if isalreadydownloaded == 1: # not sure how to bypass this without having to go out of the for loop, if anyone could tell me how that would be great! + print("%s:" % i["id"]) + if os.path.exists(uploader + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"): print(" video already downloaded!") continue - with yt_dlp.YoutubeDL(ytdl_opts) as ytdl: + # this code is *really* ugly... todo a rewrite? + with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: try: - result = ytdl.download(["https://youtube.com/watch?v={0}".format(i["id"])]) # TODO: add check for existing downloaded items and don't download them + result = ytdl.download(["https://youtube.com/watch?v=%s" % i["id"]]) # TODO: add check for existing downloaded items and don't download them continue except Exception: print(" video is not available! 
attempting to find Internet Archive pages of it...") - if internetarchive.get_item("youtube-{0}".format(i["id"])).exists: # download from internetarchive if available - fnames = [f.name for f in internetarchive.get_files("youtube-{0}".format(i["id"]))] - disallowednames = ["__ia_thumb.jpg", "youtube-{0}_archive.torrent".format(i["id"]), "youtube-{0}_files.xml".format(i["id"]), "youtube-{0}_meta.sqlite".format(i["id"]), "youtube-{0}_meta.xml".format(i["id"])] # list of IA-created files we don't need + if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available + fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])] + disallowednames = ["__ia_thumb.jpg", "youtube-%s_archive.torrent" % i["id"], "youtube-%s_files.xml" % i["id"], "youtube-%s_meta.sqlite" % i["id"], "youtube-%s_meta.xml" % i["id"]] # list of IA-created files we don't need flist = [] for fname in fnames: - if matroska_find(fnames): + if os.path.splitext(fname)[1] in [".mkv", ".webm"]: if fname[-4:] == ".mp4": continue else: if fname[-7:] == ".ia.mp4": continue if fname.find("/") == -1: - if fname not in disallowednames and fname[-21:] != "{0}_thumb.jpg".format(i["id"]) and fname[-15:] != "{0}.ogv".format(i["id"]): + if fname not in disallowednames and fname[-21:] != "%s_thumb.jpg" % i["id"] and fname[-15:] != "%s.ogv" % i["id"]: flist.append(fname) if len(flist) >= 1: - internetarchive.download("youtube-{0}".format(i["id"]), files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True) + internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True) else: print(" video already downloaded!") continue - if os.path.exists(output + "\\" + i["id"] + ".info.json"): # will always exist no matter which setting was used to download + if os.path.exists(output + "/" + i["id"] + ".info.json"): # will always exist no matter which setting was used to download 
for fname in flist: - if os.path.exists(output + "\\" + fname) and not os.path.exists(output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname): - os.rename(output + "\\" + fname, output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname) + if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname): + os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname) else: print("ID file not found!") - else: # download the vid from waybackmachine (NOTE: only tested with youtube links after polymer, however SHOULD work with links created before then) + else: print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...") try: - contenttype = urllib.request.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"])).getheader("Content-Type") - if contenttype == "video/webm": - ext = "webm" + headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"]) + if hasattr(headers.info(), "getheader"): + contenttype = headers.info().getheader("Content-Type") else: - ext = "mp4" - urllib.request.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"]), "{3}\\{0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext, output)) - print(" downloaded {0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext)) + contenttype = headers.getheader("Content-Type") + ext = "webm" if contenttype == "video/webm" else "mp4" + compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext)) + print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], 
restricted=True), i["id"], ext)) except HTTPError: print(" video not available on the Wayback Machine!") except Exception as e: - print(" unknown error downloading video!") + print(" unknown error downloading video!\n") print(e) # metadata - with open("{2}\\{0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output), "w") as jsonfile: - print(json.dumps(i), end="", file=jsonfile) - print(" saved {0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output)) + with open("%s/%s-%s.info.json" % (output, sanitize_filename(i["title"], restricted=True), i["id"]), "w", encoding="utf-8") as jsonfile: + jsonfile.write(json.dumps(i, ensure_ascii=False).decode('utf-8')) + print(" saved %s" % os.path.basename(jsonfile.name))