Mercurial > codedump
changeset 114:80bd4a99ea00
Update channeldownloader.py
committer: GitHub <noreply@github.com>
| author   | Paper <37962225+mrpapersonic@users.noreply.github.com>   |
|----------|----------------------------------------------------------|
| date     | Sat, 21 Jan 2023 15:26:34 -0500                          |
| parents  | a972dc788da0                                             |
| children | f10492e8720b                                             |
| files    | channeldownloader.py                                     |
| diffstat | 1 files changed, 105 insertions(+), 88 deletions(-) [+]  |
line wrap: on
line diff
--- a/channeldownloader.py Sat Jan 21 13:34:04 2023 -0500 +++ b/channeldownloader.py Sat Jan 21 15:26:34 2023 -0500 @@ -1,11 +1,22 @@ #!/usr/bin/env python3 -# -# download deleted vids from old yt channels -# script by paper -# it's pretty old and could definitely use some refining +""" +Usage: + channeldownloader.py <url>... (--database <file>) + [--output <folder>] + [--proxy <proxy>] + channeldownloader.py -h | --help +Arguments: + <url> YouTube channel URL to download from + +Options: + -h --help Show this screen + -o --output <folder> Output folder, relative to the current directory + [default: .] + -d --database <file> HTTP or HTTPS proxy (SOCKS5 with PySocks) +""" from __future__ import print_function -import argparse +import docopt import internetarchive try: import orjson as json @@ -22,11 +33,11 @@ from urllib2 import HTTPError try: import yt_dlp as youtube_dl - from yt_dlp.utils import sanitize_filename + from yt_dlp.utils import sanitize_filename, DownloadError except ImportError: try: import youtube_dl - from youtube_dl.utils import sanitize_filename + from youtube_dl.utils import sanitize_filename, DownloadError except ImportError: print("ERROR: youtube-dl/yt-dlp not installed!") exit(1) @@ -41,24 +52,25 @@ pass def error(self, msg): + print(" " + msg) pass def ytdl_hook(d): if d["status"] == "finished": - print(" downloaded %s: 100% " % (os.path.basename(d["filename"]))) + print(" downloaded %s: 100%% " % (os.path.basename(d["filename"]))) if d["status"] == "downloading": print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="") if d["status"] == "error": - print(" an error occurred downloading {0}!") + print("\n an error occurred downloading %s!" 
% (os.path.basename(d["filename"]))) def load_split_files(path): if os.path.isdir(path): result = {"videos": []} for fi in os.listdir(path): - for f in re.findall(r"vids.+?\.json", fi): + for f in re.findall(r"vids[0-9\-]+?\.json", fi): with open(path + "/" + f, "r", encoding="utf-8") as infile: - for i in json.loads(infile.read())["videos"]: - result["videos"].append(i) + jsonnn = json.loads(infile.read()) + result["videos"].extend(jsonnn) return result else: return json.loads(open(path, "r", encoding="utf-8").read()) @@ -72,28 +84,9 @@ percent = int(count * block_size * 100 / total_size) print(" downloading %d%% \r" % (percent), end="") - -parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") -parser.add_argument("-c", "--channel", help="channel URL", metavar="<url>", required=True) -parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar="<path>", required=True) -parser.add_argument("-o", "--output", help="output directory, defaults to the channel ID", metavar="<output>") -args = parser.parse_args() - -if args.channel[:8] == "https://" or args.channel[:7] == "http://": - channel = args.channel.split("/")[-1] -else: - channel = args.channel - -if args.output: - output = args.output -else: - output = channel - -if not os.path.exists(output): - os.mkdir(output) +args = docopt.docopt(__doc__) ytdl_opts = { - "outtmpl": output + "/%(title)s-%(id)s.%(ext)s", "retries": 100, "nooverwrites": True, "call_home": False, @@ -116,65 +109,89 @@ "ignoreerrors": False, } -for i in load_split_files(args.database)["videos"]: +if not os.path.exists(args["--output"]): + os.mkdir(args["--output"]) + +for i in load_split_files(args["--database"])["videos"]: uploader = i["uploader_id"] if "uploader_id" in i else None - if uploader == channel: - print("%s:" % i["id"]) - # :skull: - # todo: put this in a function? 
- if any(x in os.listdir(output) for x in [sanitize_filename(i["title"] + "-" + i["id"] + ".mp4", restricted=True), - sanitize_filename(i["title"] + "-" + i["id"] + ".mkv", restricted=True), - sanitize_filename(i["title"] + "-" + i["id"] + ".webm", restricted=True)]): - print(" video already downloaded!") - continue - # this code is *really* ugly... todo a rewrite? - with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: - try: - result = ytdl.download(["https://youtube.com/watch?v=%s" % i["id"]]) # TODO: add check for existing downloaded items and don't download them - continue - except Exception: - print(" video is not available! attempting to find Internet Archive pages of it...") - if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available - fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])] - flist = [] - for fname in range(len(fnames)): - if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]): - flist.append(fnames[fname]) - if len(flist) >= 1: - internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True) - else: + for url in args["<url>"]: + channel = url.split("/")[-1] + + output = "%s/%s" % (args["--output"], channel) + if not os.path.exists(output): + os.mkdir(output) + ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s" + + + if uploader == channel: + print("%s:" % i["id"]) + # :skull: + # todo: put this in a function? 
+ if any(x in os.listdir(output) for x in [sanitize_filename("%s-%s.mp4" % (i["title"], i["id"]), restricted=True), + sanitize_filename("%s-%s.mkv" % (i["title"], i["id"]), restricted=True), + sanitize_filename("%s-%s.webm" % (i["title"], i["id"]), restricted=True)]): print(" video already downloaded!") continue - if os.path.exists(output + "/" + i["id"] + ".info.json"): # will always exist no matter which setting was used to download - for fname in flist: - if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname): - os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname) - else: - print("ID file not found!") - else: - print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...") - try: # we could use yt-dlp's extractor, but then we would need to craft a fake wayback machine url, - # and we wouldn't even know if it worked. so let's continue using our little "hack" - headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"]) - if hasattr(headers.info(), "getheader"): - contenttype = headers.info().getheader("Content-Type") + # this code is *really* ugly... todo a rewrite? + with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: + try: + result = ytdl.extract_info("https://youtube.com/watch?v=%s" % i["id"]) + continue + except DownloadError: + print(" video is not available! 
attempting to find Internet Archive pages of it...") + except Exception as e: + print(" unknown error downloading video!\n") + print(e) + if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available + fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])] + flist = [] + for fname in range(len(fnames)): + if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]): + flist.append(fnames[fname]) + if len(flist) >= 1: + internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True, retries=9999) + else: + print(" video already downloaded!") + continue + if os.path.exists("%s/%s.info.json" % (output, i["id"])): # will always exist no matter which setting was used to download + for fname in flist: + if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname): + os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname) else: - contenttype = headers.getheader("Content-Type") - if contenttype == "video/webm": - ext = "webm" - elif contenttype == "video/mp4": - ext = "mp4" - else: - raise HTTPError(url=None, code=None, msg=None, hdrs=None, fp=None) - compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext), reporthook) - print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext)) - except HTTPError: - print(" video not available on the Wayback Machine!") - except Exception as e: - print(" unknown error downloading video!\n") - print(e) + print("ID file not found!") + else: + print(" video does not have a Internet Archive page! 
attempting to download from the Wayback Machine...") + try: # we could use yt-dlp's extractor, but then we would need to craft a fake wayback machine url, + # and we wouldn't even know if it worked. so let's continue using our little "hack" + headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"]) + if hasattr(headers.info(), "getheader"): + contenttype = headers.info().getheader("Content-Type") + else: + contenttype = headers.getheader("Content-Type") + if contenttype == "video/webm": + ext = "webm" + elif contenttype == "video/mp4": + ext = "mp4" + else: + raise HTTPError(url=None, code=None, msg=None, hdrs=None, fp=None) + compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext), reporthook) + print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext)) + except HTTPError: + print(" video not available on the Wayback Machine!") + except Exception as e: + print(" unknown error downloading video!\n") + print(e) # metadata - with open("%s/%s-%s.info.json" % (output, sanitize_filename(i["title"], restricted=True), i["id"]), "w", encoding="utf-8") as jsonfile: - jsonfile.write(json.dumps(i).decode("utf-8")) - print(" saved %s" % os.path.basename(jsonfile.name)) + basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"]) + if not os.path.exists(basename + ".info.json"): + with open(basename + ".info.json", "w", encoding="utf-8") as jsonfile: + try: + jsonfile.write(json.dumps(i).decode("utf-8")) + except AttributeError: + jsonfile.write(json.dumps(i)) + print(" saved %s" % os.path.basename(jsonfile.name)) + if not os.path.exists(basename + ".description"): + with open(basename + ".description", "w", encoding="utf-8") as descfile: + descfile.write(i["description"]) + print(" saved %s" % 
os.path.basename(descfile.name))