comparison channeldownloader.py @ 61:c615532e6572

Update channeldownloader.py: add option to use the split files instead of the full json

author     Paper <37962225+mrpapersonic@users.noreply.github.com>
committer  GitHub <noreply@github.com>
date       Sun, 02 Jan 2022 07:09:55 -0500
parents    4e7a9c7c0cce
children   9636d5dee08c
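
With this change, -d/--database accepts either the single full JSON database or a
directory holding the split database files: the new load_split_files() globs
vids*.json inside the directory and merges their "videos" lists into one dict.
A sketch of the two invocations (the channel URL and paths here are hypothetical
placeholders, not from the changeset):

    python channeldownloader.py -c https://www.youtube.com/user/example -d YTPMV_Database.json
    python channeldownloader.py -c https://www.youtube.com/user/example -d ./split_database/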
--- a/channeldownloader.py  (60:4e7a9c7c0cce)
+++ b/channeldownloader.py  (61:c615532e6572)
@@ -1,64 +1,26 @@
 import argparse
 import internetarchive # pip install internetarchive
 import json
+import glob
 import os
 import re
 import urllib.request
 import yt_dlp # pip install yt-dlp
 import itertools
 from urllib.error import HTTPError
+from yt_dlp.utils import sanitize_filename
 
 class MyLogger(object):
     def debug(self, msg):
         pass
 
     def warning(self, msg):
         pass
 
     def error(self, msg):
         pass
-
-ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
-                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
-                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
-
-def sanitize_filename(s, restricted=False, is_id=False):
-    # from youtube-dl utils
-    def replace_insane(char):
-        if restricted and char in ACCENT_CHARS:
-            return ACCENT_CHARS[char]
-        if char == '?' or ord(char) < 32 or ord(char) == 127:
-            return ''
-        elif char == '"':
-            return '' if restricted else '\''
-        elif char == ':':
-            return '_-' if restricted else ' -'
-        elif char in '\\/|*<>':
-            return '_'
-        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
-            return '_'
-        if restricted and ord(char) > 127:
-            return '_'
-        return char
-
-    # Handle timestamps
-    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
-    result = ''.join(map(replace_insane, s))
-    if not is_id:
-        while '__' in result:
-            result = result.replace('__', '_')
-        result = result.strip('_')
-        # Common case of "Foreign band name - English song title"
-        if restricted and result.startswith('-_'):
-            result = result[2:]
-        if result.startswith('-'):
-            result = '_' + result[len('-'):]
-        result = result.lstrip('.')
-        if not result:
-            result = '_'
-    return result
 
 def matroska_find(filelist):
     for myfile in filelist:
         if os.path.splitext(myfile)[1] == ".mkv" or os.path.splitext(myfile)[1] == ".webm":
             return True
69 print(" downloaded {0}: 100% ".format(os.path.basename(d["filename"]))) 31 print(" downloaded {0}: 100% ".format(os.path.basename(d["filename"])))
70 if d["status"] == "downloading": 32 if d["status"] == "downloading":
71 print(" downloading {0}: {1}\r".format(os.path.basename(d["filename"]), d["_percent_str"]), end="") 33 print(" downloading {0}: {1}\r".format(os.path.basename(d["filename"]), d["_percent_str"]), end="")
72 if d["status"] == "error": 34 if d["status"] == "error":
73 print(" an error occurred downloading {0}!") 35 print(" an error occurred downloading {0}!")
36
37 def load_split_files(path):
38 if os.path.isdir(path):
39 result = {"videos": []}
40 for f in glob.glob(os.path.join(path, "vids*.json")):
41 with open(f, "r", encoding="utf-8") as infile:
42 for i in json.loads(infile.read())["videos"]:
43 result["videos"].append(i)
44 return result
45 else:
46 return json.loads(open(path, "r", encoding="utf-8"))
74 47
75 48
76 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") 49 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
77 parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True) 50 parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True)
78 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar='<path>', required=True) 51 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar='<path>', required=True)
114 "progress_hooks": [ytdl_hook], 87 "progress_hooks": [ytdl_hook],
115 "logger": MyLogger(), 88 "logger": MyLogger(),
116 "ignoreerrors": False, 89 "ignoreerrors": False,
117 } 90 }
118 91
119 with open(args.database, "r", encoding="utf-8") as f: 92 for i in load_split_files(args.database):
120 data = json.load(f) 93 try:
121 for i in data["videos"]: 94 uploader = i["uploader_id"]
122 try: 95 except Exception:
123 uploader = i["uploader_id"] 96 uploader = "unknown"
124 except Exception: 97 if uploader == channel:
125 uploader = "unknown" 98 print("{0}:".format(i["id"]))
126 finally: 99 isalreadydownloaded = 0
127 if uploader == channel: 100 for file in os.listdir(output):
128 print("{0}:".format(i["id"])) 101 if os.path.splitext(file)[1] == ".json":
129 isalreadydownloaded = 0 102 if file.find("-" + i["id"] + ".info.json") != -1:
130 for file in os.listdir(output): 103 isalreadydownloaded = 1
131 if os.path.splitext(file)[1] == ".json": 104 if isalreadydownloaded == 1: # not sure how to bypass this without having to go out of the for loop, if anyone could tell me how that would be great!
132 if file.find("-" + i["id"] + ".info.json") != -1: 105 print(" video already downloaded!")
133 isalreadydownloaded = 1 106 continue
134 if isalreadydownloaded == 1: # not sure how to bypass this without having to go out of the for loop, if anyone could tell me how that would be great! 107 with yt_dlp.YoutubeDL(ytdl_opts) as ytdl:
135 print(" video already downloaded!") 108 try:
136 continue 109 result = ytdl.download(["https://youtube.com/watch?v={0}".format(i["id"])]) # TODO: add check for existing downloaded items and don't download them
137 with yt_dlp.YoutubeDL(ytdl_opts) as ytdl: 110 continue
138 try: 111 except Exception:
139 result = ytdl.download(["https://youtube.com/watch?v={0}".format(i["id"])]) # TODO: add check for existing downloaded items and don't download them 112 print(" video is not available! attempting to find Internet Archive pages of it...")
113 if internetarchive.get_item("youtube-{0}".format(i["id"])).exists: # download from internetarchive if available
114 fnames = [f.name for f in internetarchive.get_files("youtube-{0}".format(i["id"]))]
115 disallowednames = ["__ia_thumb.jpg", "youtube-{0}_archive.torrent".format(i["id"]), "youtube-{0}_files.xml".format(i["id"]), "youtube-{0}_meta.sqlite".format(i["id"]), "youtube-{0}_meta.xml".format(i["id"])] # list of IA-created files we don't need
116 flist = []
117 for fname in fnames:
118 if matroska_find(fnames):
119 if fname[-4:] == ".mp4":
140 continue 120 continue
141 except Exception: 121 else:
142 print(" video is not available! attempting to find Internet Archive pages of it...") 122 if fname[-7:] == ".ia.mp4":
143 if internetarchive.get_item("youtube-{0}".format(i["id"])).exists: # download from internetarchive if available
144 fnames = [f.name for f in internetarchive.get_files("youtube-{0}".format(i["id"]))]
145 disallowednames = ["__ia_thumb.jpg", "youtube-{0}_archive.torrent".format(i["id"]), "youtube-{0}_files.xml".format(i["id"]), "youtube-{0}_meta.sqlite".format(i["id"]), "youtube-{0}_meta.xml".format(i["id"])] # list of IA-created files we don't need
146 flist = []
147 for fname in fnames:
148 if matroska_find(fnames):
149 if fname[-4:] == ".mp4":
150 continue
151 else:
152 if fname[-7:] == ".ia.mp4":
153 continue
154 if fname.find("/") == -1:
155 if fname not in disallowednames and fname[-21:] != "{0}_thumb.jpg".format(i["id"]) and fname[-15:] != "{0}.ogv".format(i["id"]):
156 flist.append(fname)
157 if len(flist) >= 1:
158 internetarchive.download("youtube-{0}".format(i["id"]), files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
159 else:
160 print(" video already downloaded!")
161 continue 123 continue
162 if os.path.exists(output + "\\" + i["id"] + ".info.json"): # will always exist no matter which setting was used to download 124 if fname.find("/") == -1:
163 for fname in flist: 125 if fname not in disallowednames and fname[-21:] != "{0}_thumb.jpg".format(i["id"]) and fname[-15:] != "{0}.ogv".format(i["id"]):
164 if os.path.exists(output + "\\" + fname) and not os.path.exists(output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname): 126 flist.append(fname)
165 os.rename(output + "\\" + fname, output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname) 127 if len(flist) >= 1:
166 else: 128 internetarchive.download("youtube-{0}".format(i["id"]), files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
167 print("ID file not found!") 129 else:
168 else: # download the vid from waybackmachine (NOTE: only tested with youtube links after polymer, however SHOULD work with links created before then) 130 print(" video already downloaded!")
169 print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...") 131 continue
170 try: 132 if os.path.exists(output + "\\" + i["id"] + ".info.json"): # will always exist no matter which setting was used to download
171 contenttype = urllib.request.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"])).getheader("Content-Type") 133 for fname in flist:
172 if contenttype == "video/webm": 134 if os.path.exists(output + "\\" + fname) and not os.path.exists(output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname):
173 ext = "webm" 135 os.rename(output + "\\" + fname, output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname)
174 else: 136 else:
175 ext = "mp4" 137 print("ID file not found!")
176 urllib.request.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"]), "{3}\\{0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext, output)) 138 else: # download the vid from waybackmachine (NOTE: only tested with youtube links after polymer, however SHOULD work with links created before then)
177 print(" downloaded {0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext)) 139 print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...")
178 except HTTPError: 140 try:
179 print(" video not available on the Wayback Machine!") 141 contenttype = urllib.request.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"])).getheader("Content-Type")
180 except Exception as e: 142 if contenttype == "video/webm":
181 print(" unknown error downloading video!") 143 ext = "webm"
182 print(e) 144 else:
183 # metadata 145 ext = "mp4"
184 with open("{2}\\{0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output), "w") as jsonfile: 146 urllib.request.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"]), "{3}\\{0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext, output))
185 print(json.dumps(i), end="", file=jsonfile) 147 print(" downloaded {0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext))
186 print(" saved {0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output)) 148 except HTTPError:
149 print(" video not available on the Wayback Machine!")
150 except Exception as e:
151 print(" unknown error downloading video!")
152 print(e)
153 # metadata
154 with open("{2}\\{0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output), "w") as jsonfile:
155 print(json.dumps(i), end="", file=jsonfile)
156 print(" saved {0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output))