comparison channeldownloader.py @ 47:00403c09455c

Add channeldownloader.py
committer: GitHub <noreply@github.com>
author Paper <37962225+mrpapersonic@users.noreply.github.com>
date Sat, 31 Jul 2021 01:38:46 -0400
parents
children edbe4aff3b78
import argparse
import internetarchive  # pip install internetarchive
import json
import os
import re
import urllib.request
import youtube_dl  # pip install youtube-dl
import itertools
from urllib.error import HTTPError

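# Logger handed to youtube-dl that discards its debug/warning/error output;
# progress is reported through ytdl_hook instead.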
class MyLogger(object):
    def debug(self, msg):
        pass

    def warning(self, msg):
        pass

    def error(self, msg):
        pass

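# Accent-folding table used by sanitize_filename below (taken from youtube-dl's utils).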
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

def sanitize_filename(s, restricted=False, is_id=False):
    # from youtube-dl utils
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

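# Returns True if the archive.org item contains a .mkv/.webm file, in which
# case its redundant .mp4 derivatives can be skipped during download.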
def matroska_find(filelist):
    for myfile in filelist:
        if os.path.splitext(myfile)[1] in (".mkv", ".webm"):
            return True
    return False

def ytdl_hook(d):
    if d["status"] == "finished":
        print(" downloaded {0}: 100% ".format(os.path.basename(d["filename"])))
    if d["status"] == "downloading":
        print(" downloading {0}: {1}\r".format(os.path.basename(d["filename"]), d["_percent_str"]), end="")
    if d["status"] == "error":
        print(" an error occurred downloading {0}!".format(os.path.basename(d["filename"])))


parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True)
parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar='<path>', required=True)
parser.add_argument("-o", "--output", help="output directory, defaults to the channel ID", metavar='<output>')
args = parser.parse_args()

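# Example invocation (channel ID and file names are illustrative only):
#   python channeldownloader.py -c https://www.youtube.com/channel/UC1a2b3c4d5e6f7g8h9i0jkl \
#       -d YTPMV_Database.json -o downloads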
if args.channel.startswith(("https://", "http://")):
    channel = args.channel.split("/")[-1]
else:
    channel = args.channel

if args.output:
    output = args.output
else:
    output = channel

if not os.path.exists(output):
    os.mkdir(output)

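# youtube-dl options: best available quality, write all metadata alongside the
# video (info json, description, thumbnail, annotations, subtitles), restricted
# filenames, and the quiet logger / progress hook defined above.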
ytdl_opts = {
    "outtmpl": "{0}/%(title)s-%(id)s.%(ext)s".format(output),
    "retries": 100,
    "nooverwrites": True,
    "call_home": False,
    "quiet": True,
    "writeinfojson": True,
    "writedescription": True,
    "writethumbnail": True,
    "writeannotations": True,
    "writesubtitles": True,
    "allsubtitles": True,
    "addmetadata": True,
    "continuedl": True,
    "embedthumbnail": True,
    "format": "bestvideo+bestaudio/best",
    "restrictfilenames": True,
    "no_warnings": True,
    "progress_hooks": [ytdl_hook],
    "logger": MyLogger(),
    "ignoreerrors": False,  # raise on failure so the archive.org / Wayback Machine fallback can run
}

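# For every video in the database that belongs to this channel: skip it if its
# .info.json is already in the output directory, try youtube-dl first, then fall
# back to the video's archive.org item ("youtube-<id>"), and finally to the
# Wayback Machine, writing a minimal .info.json for Wayback downloads.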
with open(args.database, "r", encoding="utf-8") as f:
    data = json.load(f)
    for i in data["videos"]:
        try:
            uploader = i["uploader_id"]
        except Exception:
            uploader = "unknown"
        finally:
            if uploader == channel:
                print("{0}:".format(i["id"]))
                # skip videos whose .info.json is already in the output directory
                if any(file.endswith(".json") and "-{0}.info.json".format(i["id"]) in file for file in os.listdir(output)):
                    print(" video already downloaded!")
                    continue
                with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
                    try:
                        result = ytdl.download(["https://youtube.com/watch?v={0}".format(i["id"])])  # TODO: add check for existing downloaded items and don't download them
                        continue
                    except Exception:
                        print(" video is not available! attempting to find Internet Archive pages of it...")
                if internetarchive.get_item("youtube-{0}".format(i["id"])).exists:  # download from internetarchive if available
                    fnames = [f.name for f in internetarchive.get_files("youtube-{0}".format(i["id"]))]
                    disallowednames = ["__ia_thumb.jpg",
                                       "youtube-{0}_archive.torrent".format(i["id"]),
                                       "youtube-{0}_files.xml".format(i["id"]),
                                       "youtube-{0}_meta.sqlite".format(i["id"]),
                                       "youtube-{0}_meta.xml".format(i["id"])]  # list of IA-created files we don't need
                    flist = []
                    for fname in fnames:
                        if matroska_find(fnames):
                            if fname[-4:] == ".mp4":
                                continue
                        else:
                            if fname[-7:] == ".ia.mp4":
                                continue
                        if fname.find("/") == -1:
                            if fname not in disallowednames and fname[-21:] != "{0}_thumb.jpg".format(i["id"]) and fname[-15:] != "{0}.ogv".format(i["id"]):
                                flist.append(fname)
                    if len(flist) >= 1:
                        internetarchive.download("youtube-{0}".format(i["id"]), files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
                    else:
                        print(" video already downloaded!")
                        continue
                    if os.path.exists(os.path.join(output, i["id"] + ".info.json")):  # will always exist no matter which setting was used to download
                        for fname in flist:
                            if os.path.exists(os.path.join(output, fname)) and not os.path.exists(os.path.join(output, sanitize_filename(i["title"], restricted=True) + "-" + fname)):
                                os.rename(os.path.join(output, fname), os.path.join(output, sanitize_filename(i["title"], restricted=True) + "-" + fname))
                    else:
                        print("ID file not found!")
                else:  # download the vid from the Wayback Machine (NOTE: only tested with youtube links after polymer, however SHOULD work with links created before then)
                    print(" video does not have an Internet Archive page! attempting to download from the Wayback Machine...")
                    try:
                        contenttype = urllib.request.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"])).getheader("Content-Type")
                        if contenttype == "video/webm":
                            ext = "webm"
                        else:
                            ext = "mp4"
                        urllib.request.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"]), os.path.join(output, "{0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext)))
                        print(" downloaded {0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext))
                    except HTTPError:
                        print(" video not available on the Wayback Machine!")
                    except Exception as e:
                        print(" unknown error downloading video!")
                        print(e)
                    # write a minimal .info.json from the database entry
                    meta = {
                        "fulltitle": i["title"],
                        "description": i["description"],
                        "upload_date": i["upload_date"],
                        "uploader": i["uploader"]
                    }
                    metajson = json.dumps(meta)
                    with open(os.path.join(output, "{0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"])), "w") as jsonfile:
                        print(metajson, end="", file=jsonfile)
                    print(" saved {0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"]))