Mercurial > codedump
comparison channeldownloader.py @ 114:80bd4a99ea00
Update channeldownloader.py
committer: GitHub <noreply@github.com>
| author | Paper <37962225+mrpapersonic@users.noreply.github.com> |
|---|---|
| date | Sat, 21 Jan 2023 15:26:34 -0500 |
| parents | eafe13de3f76 |
| children | eac6dae753ca |
comparison (diff legend: equal · deleted · inserted · replaced)
| 113:a972dc788da0 | 114:80bd4a99ea00 |
|---|---|
| 1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
| 2 # | 2 """ |
| 3 # download deleted vids from old yt channels | 3 Usage: |
| 4 # script by paper | 4 channeldownloader.py <url>... (--database <file>) |
| 5 # it's pretty old and could definitely use some refining | 5 [--output <folder>] |
| 6 [--proxy <proxy>] | |
| 7 channeldownloader.py -h | --help | |
| 6 | 8 |
| 9 Arguments: | |
| 10 <url> YouTube channel URL to download from | |
| 11 | |
| 12 Options: | |
| 13 -h --help Show this screen | |
| 14 -o --output <folder> Output folder, relative to the current directory | |
| 15 [default: .] | |
| 16 -d --database <file> JSON database (https://finnrepo.a2hosted.com/YTPMV_Database) | |
| 16 -p --proxy <proxy> HTTP or HTTPS proxy (SOCKS5 with PySocks) | |
| 17 """ | |
| 7 from __future__ import print_function | 18 from __future__ import print_function |
| 8 import argparse | 19 import docopt |
| 9 import internetarchive | 20 import internetarchive |
| 10 try: | 21 try: |
| 11 import orjson as json | 22 import orjson as json |
| 12 except ImportError: | 23 except ImportError: |
| 13 import json | 24 import json |
| 20 except ImportError: # Python 2 | 31 except ImportError: # Python 2 |
| 21 import urllib as compat_urllib | 32 import urllib as compat_urllib |
| 22 from urllib2 import HTTPError | 33 from urllib2 import HTTPError |
| 23 try: | 34 try: |
| 24 import yt_dlp as youtube_dl | 35 import yt_dlp as youtube_dl |
| 25 from yt_dlp.utils import sanitize_filename | 36 from yt_dlp.utils import sanitize_filename, DownloadError |
| 26 except ImportError: | 37 except ImportError: |
| 27 try: | 38 try: |
| 28 import youtube_dl | 39 import youtube_dl |
| 29 from youtube_dl.utils import sanitize_filename | 40 from youtube_dl.utils import sanitize_filename, DownloadError |
| 30 except ImportError: | 41 except ImportError: |
| 31 print("ERROR: youtube-dl/yt-dlp not installed!") | 42 print("ERROR: youtube-dl/yt-dlp not installed!") |
| 32 exit(1) | 43 exit(1) |
| 33 from io import open # for Python 2 compatibility, in Python 3 this | 44 from io import open # for Python 2 compatibility, in Python 3 this |
| 34 # just maps to the built-in function | 45 # just maps to the built-in function |
| 39 | 50 |
| 40 def warning(self, msg): | 51 def warning(self, msg): |
| 41 pass | 52 pass |
| 42 | 53 |
| 43 def error(self, msg): | 54 def error(self, msg): |
| 55 print(" " + msg) | |
| 44 pass | 56 pass |
| 45 | 57 |
| 46 def ytdl_hook(d): | 58 def ytdl_hook(d): |
| 47 if d["status"] == "finished": | 59 if d["status"] == "finished": |
| 48 print(" downloaded %s: 100% " % (os.path.basename(d["filename"]))) | 60 print(" downloaded %s: 100%% " % (os.path.basename(d["filename"]))) |
| 49 if d["status"] == "downloading": | 61 if d["status"] == "downloading": |
| 50 print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="") | 62 print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="") |
| 51 if d["status"] == "error": | 63 if d["status"] == "error": |
| 52 print(" an error occurred downloading {0}!") | 64 print("\n an error occurred downloading %s!" % (os.path.basename(d["filename"]))) |
| 53 | 65 |
| 54 def load_split_files(path): | 66 def load_split_files(path): |
| 55 if os.path.isdir(path): | 67 if os.path.isdir(path): |
| 56 result = {"videos": []} | 68 result = {"videos": []} |
| 57 for fi in os.listdir(path): | 69 for fi in os.listdir(path): |
| 58 for f in re.findall(r"vids.+?\.json", fi): | 70 for f in re.findall(r"vids[0-9\-]+?\.json", fi): |
| 59 with open(path + "/" + f, "r", encoding="utf-8") as infile: | 71 with open(path + "/" + f, "r", encoding="utf-8") as infile: |
| 60 for i in json.loads(infile.read())["videos"]: | 72 jsonnn = json.loads(infile.read()) |
| 61 result["videos"].append(i) | 73 result["videos"].extend(jsonnn) |
| 62 return result | 74 return result |
| 63 else: | 75 else: |
| 64 return json.loads(open(path, "r", encoding="utf-8").read()) | 76 return json.loads(open(path, "r", encoding="utf-8").read()) |
| 65 | 77 |
| 66 def reporthook(count, block_size, total_size): | 78 def reporthook(count, block_size, total_size): |
| 70 return | 82 return |
| 71 duration = time.time() - start_time | 83 duration = time.time() - start_time |
| 72 percent = int(count * block_size * 100 / total_size) | 84 percent = int(count * block_size * 100 / total_size) |
| 73 print(" downloading %d%% \r" % (percent), end="") | 85 print(" downloading %d%% \r" % (percent), end="") |
| 74 | 86 |
| 75 | 87 args = docopt.docopt(__doc__) |
| 76 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") | |
| 77 parser.add_argument("-c", "--channel", help="channel URL", metavar="<url>", required=True) | |
| 78 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar="<path>", required=True) | |
| 79 parser.add_argument("-o", "--output", help="output directory, defaults to the channel ID", metavar="<output>") | |
| 80 args = parser.parse_args() | |
| 81 | |
| 82 if args.channel[:8] == "https://" or args.channel[:7] == "http://": | |
| 83 channel = args.channel.split("/")[-1] | |
| 84 else: | |
| 85 channel = args.channel | |
| 86 | |
| 87 if args.output: | |
| 88 output = args.output | |
| 89 else: | |
| 90 output = channel | |
| 91 | |
| 92 if not os.path.exists(output): | |
| 93 os.mkdir(output) | |
| 94 | 88 |
| 95 ytdl_opts = { | 89 ytdl_opts = { |
| 96 "outtmpl": output + "/%(title)s-%(id)s.%(ext)s", | |
| 97 "retries": 100, | 90 "retries": 100, |
| 98 "nooverwrites": True, | 91 "nooverwrites": True, |
| 99 "call_home": False, | 92 "call_home": False, |
| 100 "quiet": True, | 93 "quiet": True, |
| 101 "writeinfojson": True, | 94 "writeinfojson": True, |
| 114 "progress_hooks": [ytdl_hook], | 107 "progress_hooks": [ytdl_hook], |
| 115 "logger": MyLogger(), | 108 "logger": MyLogger(), |
| 116 "ignoreerrors": False, | 109 "ignoreerrors": False, |
| 117 } | 110 } |
| 118 | 111 |
| 119 for i in load_split_files(args.database)["videos"]: | 112 if not os.path.exists(args["--output"]): |
| 113 os.mkdir(args["--output"]) | |
| 114 | |
| 115 for i in load_split_files(args["--database"])["videos"]: | |
| 120 uploader = i["uploader_id"] if "uploader_id" in i else None | 116 uploader = i["uploader_id"] if "uploader_id" in i else None |
| 121 if uploader == channel: | 117 for url in args["<url>"]: |
| 122 print("%s:" % i["id"]) | 118 channel = url.split("/")[-1] |
| 123 # :skull: | 119 |
| 124 # todo: put this in a function? | 120 output = "%s/%s" % (args["--output"], channel) |
| 125 if any(x in os.listdir(output) for x in [sanitize_filename(i["title"] + "-" + i["id"] + ".mp4", restricted=True), | 121 if not os.path.exists(output): |
| 126 sanitize_filename(i["title"] + "-" + i["id"] + ".mkv", restricted=True), | 122 os.mkdir(output) |
| 127 sanitize_filename(i["title"] + "-" + i["id"] + ".webm", restricted=True)]): | 123 ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s" |
| 128 print(" video already downloaded!") | 124 |
| 129 continue | 125 |
| 130 # this code is *really* ugly... todo a rewrite? | 126 if uploader == channel: |
| 131 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: | 127 print("%s:" % i["id"]) |
| 132 try: | 128 # :skull: |
| 133 result = ytdl.download(["https://youtube.com/watch?v=%s" % i["id"]]) # TODO: add check for existing downloaded items and don't download them | 129 # todo: put this in a function? |
| 134 continue | 130 if any(x in os.listdir(output) for x in [sanitize_filename("%s-%s.mp4" % (i["title"], i["id"]), restricted=True), |
| 135 except Exception: | 131 sanitize_filename("%s-%s.mkv" % (i["title"], i["id"]), restricted=True), |
| 136 print(" video is not available! attempting to find Internet Archive pages of it...") | 132 sanitize_filename("%s-%s.webm" % (i["title"], i["id"]), restricted=True)]): |
| 137 if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available | |
| 138 fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])] | |
| 139 flist = [] | |
| 140 for fname in range(len(fnames)): | |
| 141 if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]): | |
| 142 flist.append(fnames[fname]) | |
| 143 if len(flist) >= 1: | |
| 144 internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True) | |
| 145 else: | |
| 146 print(" video already downloaded!") | 133 print(" video already downloaded!") |
| 147 continue | 134 continue |
| 148 if os.path.exists(output + "/" + i["id"] + ".info.json"): # will always exist no matter which setting was used to download | 135 # this code is *really* ugly... todo a rewrite? |
| 149 for fname in flist: | 136 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: |
| 150 if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname): | 137 try: |
| 151 os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname) | 138 result = ytdl.extract_info("https://youtube.com/watch?v=%s" % i["id"]) |
| 139 continue | |
| 140 except DownloadError: | |
| 141 print(" video is not available! attempting to find Internet Archive pages of it...") | |
| 142 except Exception as e: | |
| 143 print(" unknown error downloading video!\n") | |
| 144 print(e) | |
| 145 if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available | |
| 146 fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])] | |
| 147 flist = [] | |
| 148 for fname in range(len(fnames)): | |
| 149 if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]): | |
| 150 flist.append(fnames[fname]) | |
| 151 if len(flist) >= 1: | |
| 152 internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True, retries=9999) | |
| 153 else: | |
| 154 print(" video already downloaded!") | |
| 155 continue | |
| 156 if os.path.exists("%s/%s.info.json" % (output, i["id"])): # will always exist no matter which setting was used to download | |
| 157 for fname in flist: | |
| 158 if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname): | |
| 159 os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname) | |
| 160 else: | |
| 161 print("ID file not found!") | |
| 152 else: | 162 else: |
| 153 print("ID file not found!") | 163 print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...") |
| 154 else: | 164 try: # we could use yt-dlp's extractor, but then we would need to craft a fake wayback machine url, |
| 155 print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...") | 165 # and we wouldn't even know if it worked. so let's continue using our little "hack" |
| 156 try: # we could use yt-dlp's extractor, but then we would need to craft a fake wayback machine url, | 166 headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"]) |
| 157 # and we wouldn't even know if it worked. so let's continue using our little "hack" | 167 if hasattr(headers.info(), "getheader"): |
| 158 headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"]) | 168 contenttype = headers.info().getheader("Content-Type") |
| 159 if hasattr(headers.info(), "getheader"): | 169 else: |
| 160 contenttype = headers.info().getheader("Content-Type") | 170 contenttype = headers.getheader("Content-Type") |
| 161 else: | 171 if contenttype == "video/webm": |
| 162 contenttype = headers.getheader("Content-Type") | 172 ext = "webm" |
| 163 if contenttype == "video/webm": | 173 elif contenttype == "video/mp4": |
| 164 ext = "webm" | 174 ext = "mp4" |
| 165 elif contenttype == "video/mp4": | 175 else: |
| 166 ext = "mp4" | 176 raise HTTPError(url=None, code=None, msg=None, hdrs=None, fp=None) |
| 167 else: | 177 compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext), reporthook) |
| 168 raise HTTPError(url=None, code=None, msg=None, hdrs=None, fp=None) | 178 print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext)) |
| 169 compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext), reporthook) | 179 except HTTPError: |
| 170 print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext)) | 180 print(" video not available on the Wayback Machine!") |
| 171 except HTTPError: | 181 except Exception as e: |
| 172 print(" video not available on the Wayback Machine!") | 182 print(" unknown error downloading video!\n") |
| 173 except Exception as e: | 183 print(e) |
| 174 print(" unknown error downloading video!\n") | |
| 175 print(e) | |
| 176 # metadata | 184 # metadata |
| 177 with open("%s/%s-%s.info.json" % (output, sanitize_filename(i["title"], restricted=True), i["id"]), "w", encoding="utf-8") as jsonfile: | 185 basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"]) |
| 178 jsonfile.write(json.dumps(i).decode("utf-8")) | 186 if not os.path.exists(basename + ".info.json"): |
| 179 print(" saved %s" % os.path.basename(jsonfile.name)) | 187 with open(basename + ".info.json", "w", encoding="utf-8") as jsonfile: |
| 188 try: | |
| 189 jsonfile.write(json.dumps(i).decode("utf-8")) | |
| 190 except AttributeError: | |
| 191 jsonfile.write(json.dumps(i)) | |
| 192 print(" saved %s" % os.path.basename(jsonfile.name)) | |
| 193 if not os.path.exists(basename + ".description"): | |
| 194 with open(basename + ".description", "w", encoding="utf-8") as descfile: | |
| 195 descfile.write(i["description"]) | |
| 196 print(" saved %s" % os.path.basename(descfile.name)) | |
| 180 | 197 |
