Mercurial > codedump
comparison channeldownloader.py @ 114:80bd4a99ea00
Update channeldownloader.py
committer: GitHub <noreply@github.com>
author | Paper <37962225+mrpapersonic@users.noreply.github.com> |
---|---|
date | Sat, 21 Jan 2023 15:26:34 -0500 |
parents | eafe13de3f76 |
children | eac6dae753ca |
comparison
equal
deleted
inserted
replaced
113:a972dc788da0 | 114:80bd4a99ea00 |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
2 # | 2 """ |
3 # download deleted vids from old yt channels | 3 Usage: |
4 # script by paper | 4 channeldownloader.py <url>... (--database <file>) |
5 # it's pretty old and could definitely use some refining | 5 [--output <folder>] |
6 [--proxy <proxy>] | |
7 channeldownloader.py -h | --help | |
6 | 8 |
9 Arguments: | |
10 <url> YouTube channel URL to download from | |
11 | |
12 Options: | |
13 -h --help Show this screen | |
14 -o --output <folder> Output folder, relative to the current directory | |
15 [default: .] | |
16 -d --database <file> JSON database file or directory of split vids*.json files | |
17 """ | |
7 from __future__ import print_function | 18 from __future__ import print_function |
8 import argparse | 19 import docopt |
9 import internetarchive | 20 import internetarchive |
10 try: | 21 try: |
11 import orjson as json | 22 import orjson as json |
12 except ImportError: | 23 except ImportError: |
13 import json | 24 import json |
20 except ImportError: # Python 2 | 31 except ImportError: # Python 2 |
21 import urllib as compat_urllib | 32 import urllib as compat_urllib |
22 from urllib2 import HTTPError | 33 from urllib2 import HTTPError |
23 try: | 34 try: |
24 import yt_dlp as youtube_dl | 35 import yt_dlp as youtube_dl |
25 from yt_dlp.utils import sanitize_filename | 36 from yt_dlp.utils import sanitize_filename, DownloadError |
26 except ImportError: | 37 except ImportError: |
27 try: | 38 try: |
28 import youtube_dl | 39 import youtube_dl |
29 from youtube_dl.utils import sanitize_filename | 40 from youtube_dl.utils import sanitize_filename, DownloadError |
30 except ImportError: | 41 except ImportError: |
31 print("ERROR: youtube-dl/yt-dlp not installed!") | 42 print("ERROR: youtube-dl/yt-dlp not installed!") |
32 exit(1) | 43 exit(1) |
33 from io import open # for Python 2 compatibility, in Python 3 this | 44 from io import open # for Python 2 compatibility, in Python 3 this |
34 # just maps to the built-in function | 45 # just maps to the built-in function |
39 | 50 |
40 def warning(self, msg): | 51 def warning(self, msg): |
41 pass | 52 pass |
42 | 53 |
43 def error(self, msg): | 54 def error(self, msg): |
55 print(" " + msg) | |
44 pass | 56 pass |
45 | 57 |
def ytdl_hook(d):
    """youtube-dl progress hook: print per-video download status lines.

    ``d`` is the hook dict youtube-dl passes; ``d["status"]`` selects the
    message. The statuses are mutually exclusive, so an elif chain is
    equivalent to the separate ifs.
    """
    status = d["status"]
    if status == "finished":
        print(" downloaded %s: 100%% " % os.path.basename(d["filename"]))
    elif status == "downloading":
        # carriage return + end="" keeps the progress on one line
        print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="")
    elif status == "error":
        print("\n an error occurred downloading %s!" % os.path.basename(d["filename"]))
53 | 65 |
def load_split_files(path):
    """Load the video database from *path*.

    If *path* is a directory, merge every file whose name matches
    ``vids[0-9-]+.json`` into one ``{"videos": [...]}`` dict; otherwise
    parse *path* itself as a JSON database file.

    Fix: the single-file branch previously did ``json.loads(open(...).read())``
    without ever closing the handle; both branches now use a context manager.
    """
    if os.path.isdir(path):
        result = {"videos": []}
        for entry in os.listdir(path):
            for fname in re.findall(r"vids[0-9\-]+?\.json", entry):
                with open(path + "/" + fname, "r", encoding="utf-8") as infile:
                    # NOTE(review): split files are assumed to hold a JSON
                    # *list* of video records — extend() would add only the
                    # keys if a mapping were loaded; confirm the database
                    # layout matches.
                    result["videos"].extend(json.loads(infile.read()))
        return result
    with open(path, "r", encoding="utf-8") as infile:
        return json.loads(infile.read())
65 | 77 |
66 def reporthook(count, block_size, total_size): | 78 def reporthook(count, block_size, total_size): |
70 return | 82 return |
71 duration = time.time() - start_time | 83 duration = time.time() - start_time |
72 percent = int(count * block_size * 100 / total_size) | 84 percent = int(count * block_size * 100 / total_size) |
73 print(" downloading %d%% \r" % (percent), end="") | 85 print(" downloading %d%% \r" % (percent), end="") |
74 | 86 |
75 | 87 args = docopt.docopt(__doc__) |
76 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") | |
77 parser.add_argument("-c", "--channel", help="channel URL", metavar="<url>", required=True) | |
78 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar="<path>", required=True) | |
79 parser.add_argument("-o", "--output", help="output directory, defaults to the channel ID", metavar="<output>") | |
80 args = parser.parse_args() | |
81 | |
82 if args.channel[:8] == "https://" or args.channel[:7] == "http://": | |
83 channel = args.channel.split("/")[-1] | |
84 else: | |
85 channel = args.channel | |
86 | |
87 if args.output: | |
88 output = args.output | |
89 else: | |
90 output = channel | |
91 | |
92 if not os.path.exists(output): | |
93 os.mkdir(output) | |
94 | 88 |
95 ytdl_opts = { | 89 ytdl_opts = { |
96 "outtmpl": output + "/%(title)s-%(id)s.%(ext)s", | |
97 "retries": 100, | 90 "retries": 100, |
98 "nooverwrites": True, | 91 "nooverwrites": True, |
99 "call_home": False, | 92 "call_home": False, |
100 "quiet": True, | 93 "quiet": True, |
101 "writeinfojson": True, | 94 "writeinfojson": True, |
114 "progress_hooks": [ytdl_hook], | 107 "progress_hooks": [ytdl_hook], |
115 "logger": MyLogger(), | 108 "logger": MyLogger(), |
116 "ignoreerrors": False, | 109 "ignoreerrors": False, |
117 } | 110 } |
118 | 111 |
# Main driver. NOTE(review): reconstructed from a side-by-side diff dump;
# indentation is inferred from the control flow. For every record in the
# database, try yt-dlp first, then an Internet Archive item, then the
# Wayback Machine, and finally write metadata sidecar files.
if not os.path.exists(args["--output"]):
    os.mkdir(args["--output"])

for i in load_split_files(args["--database"])["videos"]:
    # uploader_id is optional in database records
    uploader = i["uploader_id"] if "uploader_id" in i else None
    for url in args["<url>"]:
        # the channel ID is the last path component of the URL
        channel = url.split("/")[-1]

        # per-channel subfolder under --output
        output = "%s/%s" % (args["--output"], channel)
        if not os.path.exists(output):
            os.mkdir(output)
        ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s"

        if uploader == channel:
            print("%s:" % i["id"])
            # skip if any known container of this video already exists
            # todo: put this in a function?
            if any(x in os.listdir(output) for x in [sanitize_filename("%s-%s.mp4" % (i["title"], i["id"]), restricted=True),
                                                     sanitize_filename("%s-%s.mkv" % (i["title"], i["id"]), restricted=True),
                                                     sanitize_filename("%s-%s.webm" % (i["title"], i["id"]), restricted=True)]):
                print(" video already downloaded!")
                continue
            # this code is *really* ugly... todo a rewrite?
            with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
                try:
                    # NOTE(review): `result` is unused; the `continue`
                    # skips the fallback paths on success
                    result = ytdl.extract_info("https://youtube.com/watch?v=%s" % i["id"])
                    continue
                except DownloadError:
                    print(" video is not available! attempting to find Internet Archive pages of it...")
                except Exception as e:
                    print(" unknown error downloading video!\n")
                    print(e)
            # fallback 1: a dedicated Internet Archive item for this video
            if internetarchive.get_item("youtube-%s" % i["id"]).exists:  # download from internetarchive if available
                fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])]
                flist = []
                for fname in range(len(fnames)):
                    # keep only the files that belong to this video ID
                    if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]):
                        flist.append(fnames[fname])
                if len(flist) >= 1:
                    internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True, retries=9999)
                else:
                    print(" video already downloaded!")
                    continue
                if os.path.exists("%s/%s.info.json" % (output, i["id"])):  # will always exist no matter which setting was used to download
                    # prefix the downloaded files with the sanitized title
                    for fname in flist:
                        if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname):
                            os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname)
                else:
                    print("ID file not found!")
            else:
                print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...")
                # fallback 2: the Wayback Machine's fake-URL redirect
                try:  # we could use yt-dlp's extractor, but then we would need to craft a fake wayback machine url,
                      # and we wouldn't even know if it worked. so let's continue using our little "hack"
                    headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"])
                    # Python 2 exposes getheader() on info(); Python 3 on the response
                    if hasattr(headers.info(), "getheader"):
                        contenttype = headers.info().getheader("Content-Type")
                    else:
                        contenttype = headers.getheader("Content-Type")
                    if contenttype == "video/webm":
                        ext = "webm"
                    elif contenttype == "video/mp4":
                        ext = "mp4"
                    else:
                        # unrecognized content type: treat as "not archived"
                        raise HTTPError(url=None, code=None, msg=None, hdrs=None, fp=None)
                    compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext), reporthook)
                    print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext))
                except HTTPError:
                    print(" video not available on the Wayback Machine!")
                except Exception as e:
                    print(" unknown error downloading video!\n")
                    print(e)
            # metadata
            basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"])
            if not os.path.exists(basename + ".info.json"):
                with open(basename + ".info.json", "w", encoding="utf-8") as jsonfile:
                    try:
                        # orjson.dumps returns bytes, hence the decode()
                        jsonfile.write(json.dumps(i).decode("utf-8"))
                    except AttributeError:
                        # stdlib json.dumps returns str
                        jsonfile.write(json.dumps(i))
                    print(" saved %s" % os.path.basename(jsonfile.name))
            if not os.path.exists(basename + ".description"):
                with open(basename + ".description", "w", encoding="utf-8") as descfile:
                    # NOTE(review): raises KeyError if a record lacks
                    # "description" — confirm every database entry has it
                    descfile.write(i["description"])
                    print(" saved %s" % os.path.basename(descfile.name))
180 | 197 |