Mercurial > codedump
comparison channeldownloader.py @ 114:80bd4a99ea00
Update channeldownloader.py
committer: GitHub <noreply@github.com>
author | Paper <37962225+mrpapersonic@users.noreply.github.com> |
---|---|
date | Sat, 21 Jan 2023 15:26:34 -0500 |
parents | eafe13de3f76 |
children | eac6dae753ca |
comparison
equal
deleted
inserted
replaced
113:a972dc788da0 | 114:80bd4a99ea00 |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
2 # | 2 """ |
3 # download deleted vids from old yt channels | 3 Usage: |
4 # script by paper | 4 channeldownloader.py <url>... (--database <file>) |
5 # it's pretty old and could definitely use some refining | 5 [--output <folder>] |
6 [--proxy <proxy>] | |
7 channeldownloader.py -h | --help | |
6 | 8 |
9 Arguments: | |
10 <url> YouTube channel URL to download from | |
11 | |
12 Options: | |
13 -h --help Show this screen | |
14 -o --output <folder> Output folder, relative to the current directory | |
15 [default: .] | |
16 -d --database <file> JSON database file or directory of split vids*.json files | |
17 """ | |
7 from __future__ import print_function | 18 from __future__ import print_function |
8 import argparse | 19 import docopt |
9 import internetarchive | 20 import internetarchive |
10 try: | 21 try: |
11 import orjson as json | 22 import orjson as json |
12 except ImportError: | 23 except ImportError: |
13 import json | 24 import json |
20 except ImportError: # Python 2 | 31 except ImportError: # Python 2 |
21 import urllib as compat_urllib | 32 import urllib as compat_urllib |
22 from urllib2 import HTTPError | 33 from urllib2 import HTTPError |
23 try: | 34 try: |
24 import yt_dlp as youtube_dl | 35 import yt_dlp as youtube_dl |
25 from yt_dlp.utils import sanitize_filename | 36 from yt_dlp.utils import sanitize_filename, DownloadError |
26 except ImportError: | 37 except ImportError: |
27 try: | 38 try: |
28 import youtube_dl | 39 import youtube_dl |
29 from youtube_dl.utils import sanitize_filename | 40 from youtube_dl.utils import sanitize_filename, DownloadError |
30 except ImportError: | 41 except ImportError: |
31 print("ERROR: youtube-dl/yt-dlp not installed!") | 42 print("ERROR: youtube-dl/yt-dlp not installed!") |
32 exit(1) | 43 exit(1) |
33 from io import open # for Python 2 compatibility, in Python 3 this | 44 from io import open # for Python 2 compatibility, in Python 3 this |
34 # just maps to the built-in function | 45 # just maps to the built-in function |
39 | 50 |
40 def warning(self, msg): | 51 def warning(self, msg): |
41 pass | 52 pass |
42 | 53 |
43 def error(self, msg): | 54 def error(self, msg): |
55 print(" " + msg) | |
44 pass | 56 pass |
45 | 57 |
def ytdl_hook(d):
    """youtube-dl progress hook: print per-video download status lines.

    ``d`` is the hook dict youtube-dl passes; ``d["status"]`` selects the
    message. The statuses are mutually exclusive, so an elif chain is
    equivalent to the separate ifs.
    """
    status = d["status"]
    if status == "finished":
        print(" downloaded %s: 100%% " % os.path.basename(d["filename"]))
    elif status == "downloading":
        # carriage return + end="" keeps the progress on one line
        print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="")
    elif status == "error":
        print("\n an error occurred downloading %s!" % os.path.basename(d["filename"]))
53 | 65 |
def load_split_files(path):
    """Load the video database from *path*.

    If *path* is a directory, merge every file whose name matches
    ``vids[0-9-]+.json`` into one ``{"videos": [...]}`` dict; otherwise
    parse *path* itself as a JSON database file.

    Fix: the single-file branch previously did ``json.loads(open(...).read())``
    without ever closing the handle; both branches now use a context manager.
    """
    if os.path.isdir(path):
        result = {"videos": []}
        for entry in os.listdir(path):
            for fname in re.findall(r"vids[0-9\-]+?\.json", entry):
                with open(path + "/" + fname, "r", encoding="utf-8") as infile:
                    # NOTE(review): split files are assumed to hold a JSON
                    # *list* of video records — extend() would add only the
                    # keys if a mapping were loaded; confirm the database
                    # layout matches.
                    result["videos"].extend(json.loads(infile.read()))
        return result
    with open(path, "r", encoding="utf-8") as infile:
        return json.loads(infile.read())
65 | 77 |
66 def reporthook(count, block_size, total_size): | 78 def reporthook(count, block_size, total_size): |
70 return | 82 return |
71 duration = time.time() - start_time | 83 duration = time.time() - start_time |
72 percent = int(count * block_size * 100 / total_size) | 84 percent = int(count * block_size * 100 / total_size) |
73 print(" downloading %d%% \r" % (percent), end="") | 85 print(" downloading %d%% \r" % (percent), end="") |
74 | 86 |
75 | 87 args = docopt.docopt(__doc__) |
76 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") | |
77 parser.add_argument("-c", "--channel", help="channel URL", metavar="<url>", required=True) | |
78 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar="<path>", required=True) | |
79 parser.add_argument("-o", "--output", help="output directory, defaults to the channel ID", metavar="<output>") | |
80 args = parser.parse_args() | |
81 | |
82 if args.channel[:8] == "https://" or args.channel[:7] == "http://": | |
83 channel = args.channel.split("/")[-1] | |
84 else: | |
85 channel = args.channel | |
86 | |
87 if args.output: | |
88 output = args.output | |
89 else: | |
90 output = channel | |
91 | |
92 if not os.path.exists(output): | |
93 os.mkdir(output) | |
94 | 88 |
95 ytdl_opts = { | 89 ytdl_opts = { |
96 "outtmpl": output + "/%(title)s-%(id)s.%(ext)s", | |
97 "retries": 100, | 90 "retries": 100, |
98 "nooverwrites": True, | 91 "nooverwrites": True, |
99 "call_home": False, | 92 "call_home": False, |
100 "quiet": True, | 93 "quiet": True, |
101 "writeinfojson": True, | 94 "writeinfojson": True, |
114 "progress_hooks": [ytdl_hook], | 107 "progress_hooks": [ytdl_hook], |
115 "logger": MyLogger(), | 108 "logger": MyLogger(), |
116 "ignoreerrors": False, | 109 "ignoreerrors": False, |
117 } | 110 } |
118 | 111 |
# Main driver. NOTE(review): reconstructed from a side-by-side diff dump;
# indentation is inferred from the control flow. For every record in the
# database, try yt-dlp first, then an Internet Archive item, then the
# Wayback Machine, and finally write metadata sidecar files.
if not os.path.exists(args["--output"]):
    os.mkdir(args["--output"])

for i in load_split_files(args["--database"])["videos"]:
    # uploader_id is optional in database records
    uploader = i["uploader_id"] if "uploader_id" in i else None
    for url in args["<url>"]:
        # the channel ID is the last path component of the URL
        channel = url.split("/")[-1]

        # per-channel subfolder under --output
        output = "%s/%s" % (args["--output"], channel)
        if not os.path.exists(output):
            os.mkdir(output)
        ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s"

        if uploader == channel:
            print("%s:" % i["id"])
            # skip if any known container of this video already exists
            # todo: put this in a function?
            if any(x in os.listdir(output) for x in [sanitize_filename("%s-%s.mp4" % (i["title"], i["id"]), restricted=True),
                                                     sanitize_filename("%s-%s.mkv" % (i["title"], i["id"]), restricted=True),
                                                     sanitize_filename("%s-%s.webm" % (i["title"], i["id"]), restricted=True)]):
                print(" video already downloaded!")
                continue
            # this code is *really* ugly... todo a rewrite?
            with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
                try:
                    # NOTE(review): `result` is unused; the `continue`
                    # skips the fallback paths on success
                    result = ytdl.extract_info("https://youtube.com/watch?v=%s" % i["id"])
                    continue
                except DownloadError:
                    print(" video is not available! attempting to find Internet Archive pages of it...")
                except Exception as e:
                    print(" unknown error downloading video!\n")
                    print(e)
            # fallback 1: a dedicated Internet Archive item for this video
            if internetarchive.get_item("youtube-%s" % i["id"]).exists:  # download from internetarchive if available
                fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])]
                flist = []
                for fname in range(len(fnames)):
                    # keep only the files that belong to this video ID
                    if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]):
                        flist.append(fnames[fname])
                if len(flist) >= 1:
                    internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True, retries=9999)
                else:
                    print(" video already downloaded!")
                    continue
                if os.path.exists("%s/%s.info.json" % (output, i["id"])):  # will always exist no matter which setting was used to download
                    # prefix the downloaded files with the sanitized title
                    for fname in flist:
                        if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname):
                            os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname)
                else:
                    print("ID file not found!")
            else:
                print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...")
                # fallback 2: the Wayback Machine's fake-URL redirect
                try:  # we could use yt-dlp's extractor, but then we would need to craft a fake wayback machine url,
                      # and we wouldn't even know if it worked. so let's continue using our little "hack"
                    headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"])
                    # Python 2 exposes getheader() on info(); Python 3 on the response
                    if hasattr(headers.info(), "getheader"):
                        contenttype = headers.info().getheader("Content-Type")
                    else:
                        contenttype = headers.getheader("Content-Type")
                    if contenttype == "video/webm":
                        ext = "webm"
                    elif contenttype == "video/mp4":
                        ext = "mp4"
                    else:
                        # unrecognized content type: treat as "not archived"
                        raise HTTPError(url=None, code=None, msg=None, hdrs=None, fp=None)
                    compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext), reporthook)
                    print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext))
                except HTTPError:
                    print(" video not available on the Wayback Machine!")
                except Exception as e:
                    print(" unknown error downloading video!\n")
                    print(e)
            # metadata
            basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"])
            if not os.path.exists(basename + ".info.json"):
                with open(basename + ".info.json", "w", encoding="utf-8") as jsonfile:
                    try:
                        # orjson.dumps returns bytes, hence the decode()
                        jsonfile.write(json.dumps(i).decode("utf-8"))
                    except AttributeError:
                        # stdlib json.dumps returns str
                        jsonfile.write(json.dumps(i))
                    print(" saved %s" % os.path.basename(jsonfile.name))
            if not os.path.exists(basename + ".description"):
                with open(basename + ".description", "w", encoding="utf-8") as descfile:
                    # NOTE(review): raises KeyError if a record lacks
                    # "description" — confirm every database entry has it
                    descfile.write(i["description"])
                    print(" saved %s" % os.path.basename(descfile.name))
180 | 197 |