comparison channeldownloader.py @ 67:9636d5dee08c

[channeldownloader.py] Python 2.7 compatibility Also make the code a *lot* more optimized (e.g. removing the unnecessary double for-loop) committer: GitHub <noreply@github.com>
author Paper <37962225+mrpapersonic@users.noreply.github.com>
date Wed, 18 May 2022 18:57:58 -0400
parents c615532e6572
children a43ed076b28f
comparison
equal deleted inserted replaced
66:ff473892908c 67:9636d5dee08c
1 #!/usr/bin/env python3
2 #
3 # download deleted vids from old yt channels
4 # script by paper
5
6 from __future__ import print_function
1 import argparse 7 import argparse
2 import internetarchive # pip install internetarchive 8 import internetarchive
3 import json 9 try:
10 import orjson as json
11 except ImportError:
12 import json
4 import glob 13 import glob
5 import os 14 import os
6 import re 15 import re
7 import urllib.request 16 try:
8 import yt_dlp # pip install yt-dlp 17 import urllib.request as compat_urllib
9 import itertools 18 from urllib.error import HTTPError
10 from urllib.error import HTTPError 19 except ImportError: # Python 2
11 from yt_dlp.utils import sanitize_filename 20 import urllib as compat_urllib
21 from urllib2 import HTTPError
22 try:
23 import yt_dlp as youtube_dl
24 from yt_dlp.utils import sanitize_filename
25 except ImportError:
26 try:
27 import youtube_dl
28 from youtube_dl.utils import sanitize_filename
29 except ImportError:
30 print("ERROR: youtube-dl/yt-dlp not installed!")
31 exit(1)
32 from io import open # for Python 2 compatibility, in Python 3 this
33 # just maps to the built-in function
12 34
class MyLogger(object):
    """Silent logger handed to youtube-dl/yt-dlp: swallows every message
    so download progress is reported only through the progress hook."""

    def debug(self, msg):
        pass

    def warning(self, msg):
        pass

    def error(self, msg):
        pass
22 44
def matroska_find(filelist):
    """Return True if any name in *filelist* uses a Matroska-family
    container extension (.mkv or .webm), else False."""
    return any(os.path.splitext(name)[1] in (".mkv", ".webm") for name in filelist)
28
def ytdl_hook(d):
    """Progress hook for youtube-dl/yt-dlp.

    *d* is the status dict yt-dlp passes to progress hooks; only the
    "status", "filename" and (while downloading) "_percent_str" keys
    are read.  Prints a one-line status to stdout.
    """
    if d["status"] == "finished":
        # "%%" escapes the percent sign: a bare "% " in a %-formatted
        # string raises "unsupported format character ' '".
        print(" downloaded %s: 100%% " % (os.path.basename(d["filename"])))
    if d["status"] == "downloading":
        print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="")
    if d["status"] == "error":
        # The original printed the literal "{0}" placeholder (no .format
        # call); substitute the actual file name instead.
        print(" an error occurred downloading %s!" % os.path.basename(d["filename"]))
36 52
def load_split_files(path):
    """Load the video database from *path*.

    *path* is either a single JSON file or a directory of JSON chunks;
    in the directory case the "videos" lists of every "*.json" file are
    merged.  Returns a dict carrying a "videos" list either way.
    """
    if os.path.isdir(path):
        # NOTE(review): the diff view elides the accumulator/glob setup
        # lines; this assumes a fresh {"videos": []} dict filled from
        # every *.json chunk in the directory -- confirm against the
        # full file.
        result = {"videos": []}
        for chunk in glob.glob(os.path.join(path, "*.json")):
            with open(chunk, "r", encoding="utf-8") as infile:
                result["videos"].extend(json.loads(infile.read())["videos"])
        return result
    # `with` closes the handle; the original leaked an open file object.
    with open(path, "r", encoding="utf-8") as infile:
        return json.loads(infile.read())
47 63
48 64
49 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") 65 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
50 parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True) 66 parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True)
51 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar='<path>', required=True) 67 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar='<path>', required=True)
64 80
65 if not os.path.exists(output): 81 if not os.path.exists(output):
66 os.mkdir(output) 82 os.mkdir(output)
67 83
# Options handed to youtube_dl.YoutubeDL / yt_dlp.YoutubeDL below.
ytdl_opts = {
    # Plain concatenation: the original "%s/%(title)s-%(id)s.%(ext)s" % (output)
    # raises at definition time ("format requires a mapping") because the
    # template itself contains %(name)s mapping keys, so it cannot be built
    # with positional %-formatting.  yt-dlp expands the %(...)s fields later.
    "outtmpl": output + "/%(title)s-%(id)s.%(ext)s",
    "retries": 100,
    "nooverwrites": True,
    "call_home": False,
    "quiet": True,
    "writeinfojson": True,
    # NOTE(review): the diff view elides several ytdl_opts entries between
    # "writeinfojson" and "progress_hooks" (rendered source lines 91-102);
    # retain those entries when applying this fix.
    "progress_hooks": [ytdl_hook],
    "logger": MyLogger(),
    "ignoreerrors": False,
}
91 107
# Walk every record in the database; for videos by the requested channel,
# try youtube-dl/yt-dlp first, then fall back to an Internet Archive item,
# then to the Wayback Machine.
for i in load_split_files(args.database)["videos"]:
    uploader = i["uploader_id"] if "uploader_id" in i else None
    if uploader == channel:
        print("%s:" % i["id"])
        # NOTE(review): this looks in uploader + "/" while downloads are
        # written to `output` -- confirm the two always name the same dir.
        if os.path.exists(uploader + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"):
            print(" video already downloaded!")
            continue
        # this code is *really* ugly... todo a rewrite?
        with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
            try:
                result = ytdl.download(["https://youtube.com/watch?v=%s" % i["id"]])  # TODO: add check for existing downloaded items and don't download them
                continue
            except Exception:
                print(" video is not available! attempting to find Internet Archive pages of it...")
        if internetarchive.get_item("youtube-%s" % i["id"]).exists:  # download from internetarchive if available
            fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])]
            disallowednames = ["__ia_thumb.jpg", "youtube-%s_archive.torrent" % i["id"], "youtube-%s_files.xml" % i["id"], "youtube-%s_meta.sqlite" % i["id"], "youtube-%s_meta.xml" % i["id"]]  # list of IA-created files we don't need
            # Prefer Matroska containers: if ANY mkv/webm exists for this
            # item, skip the plain .mp4 copies; otherwise only skip IA's
            # re-encoded .ia.mp4 derivatives.  (Checking the CURRENT file's
            # own extension here -- as this revision did -- made the .mp4
            # skip unreachable, since a .mkv name never ends in ".mp4".)
            has_matroska = any(os.path.splitext(f)[1] in (".mkv", ".webm") for f in fnames)
            flist = []
            for fname in fnames:
                if has_matroska:
                    if fname[-4:] == ".mp4":
                        continue
                else:
                    if fname[-7:] == ".ia.mp4":
                        continue
                if fname.find("/") == -1:  # skip files inside subdirectories
                    if fname not in disallowednames and fname[-21:] != "%s_thumb.jpg" % i["id"] and fname[-15:] != "%s.ogv" % i["id"]:
                        flist.append(fname)
            if len(flist) >= 1:
                internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
            else:
                print(" video already downloaded!")
                continue
            if os.path.exists(output + "/" + i["id"] + ".info.json"):  # will always exist no matter which setting was used to download
                for fname in flist:
                    if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname):
                        os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname)
            else:
                print("ID file not found!")
        else:
            print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...")
            try:
                headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"])
                if hasattr(headers.info(), "getheader"):  # Python 2 httplib message
                    contenttype = headers.info().getheader("Content-Type")
                else:  # Python 3 http.client.HTTPResponse
                    contenttype = headers.getheader("Content-Type")
                ext = "webm" if contenttype == "video/webm" else "mp4"
                compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext))
                print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext))
            except HTTPError:
                print(" video not available on the Wayback Machine!")
            except Exception as e:
                print(" unknown error downloading video!\n")
                print(e)
            # metadata
            # NOTE(review): the diff view does not show the indentation of
            # this write; it is placed here (Wayback branch only) so it does
            # not clobber the richer .info.json an IA item provides -- confirm
            # against the full file.
            with open("%s/%s-%s.info.json" % (output, sanitize_filename(i["title"], restricted=True), i["id"]), "w", encoding="utf-8") as jsonfile:
                # json.dumps() returns str under the stdlib json module but
                # bytes under orjson (which also rejects the ensure_ascii
                # kwarg), so the original unconditional .decode('utf-8')
                # crashed whenever stdlib json was in use.
                try:
                    serialized = json.dumps(i, ensure_ascii=False)
                except TypeError:  # orjson takes no ensure_ascii argument
                    serialized = json.dumps(i)
                if isinstance(serialized, bytes):
                    serialized = serialized.decode("utf-8")
                jsonfile.write(serialized)
            print(" saved %s" % os.path.basename(jsonfile.name))