comparison channeldownloader.py @ 114:80bd4a99ea00

Update channeldownloader.py committer: GitHub <noreply@github.com>
author Paper <37962225+mrpapersonic@users.noreply.github.com>
date Sat, 21 Jan 2023 15:26:34 -0500
parents eafe13de3f76
children eac6dae753ca
comparison
equal deleted inserted replaced
113:a972dc788da0 114:80bd4a99ea00
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 # 2 """
3 # download deleted vids from old yt channels 3 Usage:
4 # script by paper 4 channeldownloader.py <url>... (--database <file>)
5 # it's pretty old and could definitely use some refining 5 [--output <folder>]
6 [--proxy <proxy>]
7 channeldownloader.py -h | --help
6 8
9 Arguments:
10 <url> YouTube channel URL to download from
11
12 Options:
13 -h --help Show this screen
14 -o --output <folder> Output folder, relative to the current directory
15 [default: .]
16 -d --database <file> json database (https://finnrepo.a2hosted.com/YTPMV_Database)
17 """
7 from __future__ import print_function 18 from __future__ import print_function
8 import argparse 19 import docopt
9 import internetarchive 20 import internetarchive
10 try: 21 try:
11 import orjson as json 22 import orjson as json
12 except ImportError: 23 except ImportError:
13 import json 24 import json
20 except ImportError: # Python 2 31 except ImportError: # Python 2
21 import urllib as compat_urllib 32 import urllib as compat_urllib
22 from urllib2 import HTTPError 33 from urllib2 import HTTPError
23 try: 34 try:
24 import yt_dlp as youtube_dl 35 import yt_dlp as youtube_dl
25 from yt_dlp.utils import sanitize_filename 36 from yt_dlp.utils import sanitize_filename, DownloadError
26 except ImportError: 37 except ImportError:
27 try: 38 try:
28 import youtube_dl 39 import youtube_dl
29 from youtube_dl.utils import sanitize_filename 40 from youtube_dl.utils import sanitize_filename, DownloadError
30 except ImportError: 41 except ImportError:
31 print("ERROR: youtube-dl/yt-dlp not installed!") 42 print("ERROR: youtube-dl/yt-dlp not installed!")
32 exit(1) 43 exit(1)
33 from io import open # for Python 2 compatibility, in Python 3 this 44 from io import open # for Python 2 compatibility, in Python 3 this
34 # just maps to the built-in function 45 # just maps to the built-in function
39 50
40 def warning(self, msg): 51 def warning(self, msg):
41 pass 52 pass
42 53
43 def error(self, msg): 54 def error(self, msg):
55 print(" " + msg)
44 pass 56 pass
45 57
46 def ytdl_hook(d): 58 def ytdl_hook(d):
47 if d["status"] == "finished": 59 if d["status"] == "finished":
48 print(" downloaded %s: 100% " % (os.path.basename(d["filename"]))) 60 print(" downloaded %s: 100%% " % (os.path.basename(d["filename"])))
49 if d["status"] == "downloading": 61 if d["status"] == "downloading":
50 print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="") 62 print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="")
51 if d["status"] == "error": 63 if d["status"] == "error":
52 print(" an error occurred downloading {0}!") 64 print("\n an error occurred downloading %s!" % (os.path.basename(d["filename"])))
53 65
54 def load_split_files(path): 66 def load_split_files(path):
55 if os.path.isdir(path): 67 if os.path.isdir(path):
56 result = {"videos": []} 68 result = {"videos": []}
57 for fi in os.listdir(path): 69 for fi in os.listdir(path):
58 for f in re.findall(r"vids.+?\.json", fi): 70 for f in re.findall(r"vids[0-9\-]+?\.json", fi):
59 with open(path + "/" + f, "r", encoding="utf-8") as infile: 71 with open(path + "/" + f, "r", encoding="utf-8") as infile:
60 for i in json.loads(infile.read())["videos"]: 72 jsonnn = json.loads(infile.read())
61 result["videos"].append(i) 73 result["videos"].extend(jsonnn)
62 return result 74 return result
63 else: 75 else:
64 return json.loads(open(path, "r", encoding="utf-8").read()) 76 return json.loads(open(path, "r", encoding="utf-8").read())
65 77
66 def reporthook(count, block_size, total_size): 78 def reporthook(count, block_size, total_size):
70 return 82 return
71 duration = time.time() - start_time 83 duration = time.time() - start_time
72 percent = int(count * block_size * 100 / total_size) 84 percent = int(count * block_size * 100 / total_size)
73 print(" downloading %d%% \r" % (percent), end="") 85 print(" downloading %d%% \r" % (percent), end="")
74 86
75 87 args = docopt.docopt(__doc__)
76 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
77 parser.add_argument("-c", "--channel", help="channel URL", metavar="<url>", required=True)
78 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar="<path>", required=True)
79 parser.add_argument("-o", "--output", help="output directory, defaults to the channel ID", metavar="<output>")
80 args = parser.parse_args()
81
82 if args.channel[:8] == "https://" or args.channel[:7] == "http://":
83 channel = args.channel.split("/")[-1]
84 else:
85 channel = args.channel
86
87 if args.output:
88 output = args.output
89 else:
90 output = channel
91
92 if not os.path.exists(output):
93 os.mkdir(output)
94 88
95 ytdl_opts = { 89 ytdl_opts = {
96 "outtmpl": output + "/%(title)s-%(id)s.%(ext)s",
97 "retries": 100, 90 "retries": 100,
98 "nooverwrites": True, 91 "nooverwrites": True,
99 "call_home": False, 92 "call_home": False,
100 "quiet": True, 93 "quiet": True,
101 "writeinfojson": True, 94 "writeinfojson": True,
114 "progress_hooks": [ytdl_hook], 107 "progress_hooks": [ytdl_hook],
115 "logger": MyLogger(), 108 "logger": MyLogger(),
116 "ignoreerrors": False, 109 "ignoreerrors": False,
117 } 110 }
118 111
119 for i in load_split_files(args.database)["videos"]: 112 if not os.path.exists(args["--output"]):
113 os.mkdir(args["--output"])
114
115 for i in load_split_files(args["--database"])["videos"]:
120 uploader = i["uploader_id"] if "uploader_id" in i else None 116 uploader = i["uploader_id"] if "uploader_id" in i else None
121 if uploader == channel: 117 for url in args["<url>"]:
122 print("%s:" % i["id"]) 118 channel = url.split("/")[-1]
123 # :skull: 119
124 # todo: put this in a function? 120 output = "%s/%s" % (args["--output"], channel)
125 if any(x in os.listdir(output) for x in [sanitize_filename(i["title"] + "-" + i["id"] + ".mp4", restricted=True), 121 if not os.path.exists(output):
126 sanitize_filename(i["title"] + "-" + i["id"] + ".mkv", restricted=True), 122 os.mkdir(output)
127 sanitize_filename(i["title"] + "-" + i["id"] + ".webm", restricted=True)]): 123 ytdl_opts["outtmpl"] = output + "/%(title)s-%(id)s.%(ext)s"
128 print(" video already downloaded!") 124
129 continue 125
130 # this code is *really* ugly... todo a rewrite? 126 if uploader == channel:
131 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: 127 print("%s:" % i["id"])
132 try: 128 # :skull:
133 result = ytdl.download(["https://youtube.com/watch?v=%s" % i["id"]]) # TODO: add check for existing downloaded items and don't download them 129 # todo: put this in a function?
134 continue 130 if any(x in os.listdir(output) for x in [sanitize_filename("%s-%s.mp4" % (i["title"], i["id"]), restricted=True),
135 except Exception: 131 sanitize_filename("%s-%s.mkv" % (i["title"], i["id"]), restricted=True),
136 print(" video is not available! attempting to find Internet Archive pages of it...") 132 sanitize_filename("%s-%s.webm" % (i["title"], i["id"]), restricted=True)]):
137 if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available
138 fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])]
139 flist = []
140 for fname in range(len(fnames)):
141 if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]):
142 flist.append(fnames[fname])
143 if len(flist) >= 1:
144 internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
145 else:
146 print(" video already downloaded!") 133 print(" video already downloaded!")
147 continue 134 continue
148 if os.path.exists(output + "/" + i["id"] + ".info.json"): # will always exist no matter which setting was used to download 135 # this code is *really* ugly... todo a rewrite?
149 for fname in flist: 136 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
150 if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname): 137 try:
151 os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname) 138 result = ytdl.extract_info("https://youtube.com/watch?v=%s" % i["id"])
139 continue
140 except DownloadError:
141 print(" video is not available! attempting to find Internet Archive pages of it...")
142 except Exception as e:
143 print(" unknown error downloading video!\n")
144 print(e)
145 if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available
146 fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])]
147 flist = []
148 for fname in range(len(fnames)):
149 if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]):
150 flist.append(fnames[fname])
151 if len(flist) >= 1:
152 internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True, retries=9999)
153 else:
154 print(" video already downloaded!")
155 continue
156 if os.path.exists("%s/%s.info.json" % (output, i["id"])): # will always exist no matter which setting was used to download
157 for fname in flist:
158 if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname):
159 os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname)
160 else:
161 print("ID file not found!")
152 else: 162 else:
153 print("ID file not found!") 163 print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...")
154 else: 164 try: # we could use yt-dlp's extractor, but then we would need to craft a fake wayback machine url,
155 print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...") 165 # and we wouldn't even know if it worked. so let's continue using our little "hack"
156 try: # we could use yt-dlp's extractor, but then we would need to craft a fake wayback machine url, 166 headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"])
157 # and we wouldn't even know if it worked. so let's continue using our little "hack" 167 if hasattr(headers.info(), "getheader"):
158 headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"]) 168 contenttype = headers.info().getheader("Content-Type")
159 if hasattr(headers.info(), "getheader"): 169 else:
160 contenttype = headers.info().getheader("Content-Type") 170 contenttype = headers.getheader("Content-Type")
161 else: 171 if contenttype == "video/webm":
162 contenttype = headers.getheader("Content-Type") 172 ext = "webm"
163 if contenttype == "video/webm": 173 elif contenttype == "video/mp4":
164 ext = "webm" 174 ext = "mp4"
165 elif contenttype == "video/mp4": 175 else:
166 ext = "mp4" 176 raise HTTPError(url=None, code=None, msg=None, hdrs=None, fp=None)
167 else: 177 compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext), reporthook)
168 raise HTTPError(url=None, code=None, msg=None, hdrs=None, fp=None) 178 print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext))
169 compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext), reporthook) 179 except HTTPError:
170 print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext)) 180 print(" video not available on the Wayback Machine!")
171 except HTTPError: 181 except Exception as e:
172 print(" video not available on the Wayback Machine!") 182 print(" unknown error downloading video!\n")
173 except Exception as e: 183 print(e)
174 print(" unknown error downloading video!\n")
175 print(e)
176 # metadata 184 # metadata
177 with open("%s/%s-%s.info.json" % (output, sanitize_filename(i["title"], restricted=True), i["id"]), "w", encoding="utf-8") as jsonfile: 185 basename = "%s/%s-%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"])
178 jsonfile.write(json.dumps(i).decode("utf-8")) 186 if not os.path.exists(basename + ".info.json"):
179 print(" saved %s" % os.path.basename(jsonfile.name)) 187 with open(basename + ".info.json", "w", encoding="utf-8") as jsonfile:
188 try:
189 jsonfile.write(json.dumps(i).decode("utf-8"))
190 except AttributeError:
191 jsonfile.write(json.dumps(i))
192 print(" saved %s" % os.path.basename(jsonfile.name))
193 if not os.path.exists(basename + ".description"):
194 with open(basename + ".description", "w", encoding="utf-8") as descfile:
195 descfile.write(i["description"])
196 print(" saved %s" % os.path.basename(descfile.name))
180 197