Mercurial > codedump
comparison channeldownloader.py @ 61:c615532e6572
Update channeldownloader.py
add option to use the split files instead of the full json
committer: GitHub <noreply@github.com>
| author | Paper <37962225+mrpapersonic@users.noreply.github.com> |
|---|---|
| date | Sun, 02 Jan 2022 07:09:55 -0500 |
| parents | 4e7a9c7c0cce |
| children | 9636d5dee08c |
Comparison legend: equal / deleted / inserted / replaced
| 60:4e7a9c7c0cce | 61:c615532e6572 |
|---|---|
| 1 import argparse | 1 import argparse |
| 2 import internetarchive # pip install internetarchive | 2 import internetarchive # pip install internetarchive |
| 3 import json | 3 import json |
| 4 import glob | |
| 4 import os | 5 import os |
| 5 import re | 6 import re |
| 6 import urllib.request | 7 import urllib.request |
| 7 import yt_dlp # pip install yt-dlp | 8 import yt_dlp # pip install yt-dlp |
| 8 import itertools | 9 import itertools |
| 9 from urllib.error import HTTPError | 10 from urllib.error import HTTPError |
| 11 from yt_dlp.utils import sanitize_filename | |
| 10 | 12 |
class MyLogger(object):
    """Logger handed to yt-dlp that deliberately discards all output.

    Download progress is reported through the ``progress_hooks`` entry
    (``ytdl_hook``) instead, so debug/warning/error messages from
    yt-dlp itself are silenced.
    """

    def debug(self, msg):
        pass

    def warning(self, msg):
        pass

    def error(self, msg):
        pass
| 20 | |
# Accented-character -> ASCII replacement table, used only when
# sanitize_filename() is called with restricted=True.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

def sanitize_filename(s, restricted=False, is_id=False):
    """Return *s* made safe for use as a filename (from youtube-dl utils).

    Replaces characters that are illegal or awkward in filenames:
    ``?``/control chars are dropped, ``"`` becomes ``'``, ``:`` becomes
    `` -``, and ``\\/|*<>`` become ``_``.  Timestamps like ``1:23:45``
    keep their digits with ``:`` turned into ``_``.

    restricted -- additionally fold accented chars to ASCII and replace
        spaces/shell metacharacters with ``_`` (portable ASCII output).
    is_id -- skip the cosmetic cleanup (underscore collapsing/stripping),
        since the value is an opaque ID rather than a title.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps: 1:23:45 -> 1_23_45 before per-char replacement.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of underscores and trim cosmetic leftovers.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
| 60 | 22 |
| 61 def matroska_find(filelist): | 23 def matroska_find(filelist): |
| 62 for myfile in filelist: | 24 for myfile in filelist: |
| 63 if os.path.splitext(myfile)[1] == ".mkv" or os.path.splitext(myfile)[1] == ".webm": | 25 if os.path.splitext(myfile)[1] == ".mkv" or os.path.splitext(myfile)[1] == ".webm": |
| 64 return True | 26 return True |
| 69 print(" downloaded {0}: 100% ".format(os.path.basename(d["filename"]))) | 31 print(" downloaded {0}: 100% ".format(os.path.basename(d["filename"]))) |
| 70 if d["status"] == "downloading": | 32 if d["status"] == "downloading": |
| 71 print(" downloading {0}: {1}\r".format(os.path.basename(d["filename"]), d["_percent_str"]), end="") | 33 print(" downloading {0}: {1}\r".format(os.path.basename(d["filename"]), d["_percent_str"]), end="") |
| 72 if d["status"] == "error": | 34 if d["status"] == "error": |
| 73 print(" an error occurred downloading {0}!") | 35 print(" an error occurred downloading {0}!") |
| 36 | |
def load_split_files(path):
    """Load the YTPMV video database.

    *path* may be either a directory containing split ``vids*.json``
    files (each holding a ``{"videos": [...]}`` object, merged here into
    one list) or a single JSON database file.  Returns the parsed dict;
    in both cases callers should read its ``"videos"`` list.
    """
    if os.path.isdir(path):
        result = {"videos": []}
        for part in glob.glob(os.path.join(path, "vids*.json")):
            with open(part, "r", encoding="utf-8") as infile:
                result["videos"].extend(json.load(infile)["videos"])
        return result
    # BUG FIX: this was `json.loads(open(path, ...))`, which hands the file
    # OBJECT to json.loads() and raises TypeError (and leaks the handle).
    # json.load() parses the stream, and the with-block closes it.
    with open(path, "r", encoding="utf-8") as infile:
        return json.load(infile)
| 74 | 47 |
| 75 | 48 |
| 76 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") | 49 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") |
| 77 parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True) | 50 parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True) |
| 78 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar='<path>', required=True) | 51 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar='<path>', required=True) |
| 114 "progress_hooks": [ytdl_hook], | 87 "progress_hooks": [ytdl_hook], |
| 115 "logger": MyLogger(), | 88 "logger": MyLogger(), |
| 116 "ignoreerrors": False, | 89 "ignoreerrors": False, |
| 117 } | 90 } |
| 118 | 91 |
# Main loop: walk every record in the database and download the ones that
# belong to the requested channel, falling back to the Internet Archive and
# then the Wayback Machine when yt-dlp fails.
# BUG FIX: rev 61 iterated load_split_files(...) itself; that walks the
# dict's KEYS (just "videos"), so no record ever matched the channel and
# nothing was downloaded.  Iterate the "videos" list, matching the pre-split
# behaviour of `for i in data["videos"]`.
# NOTE(review): output paths are built with "\\" throughout, so this script
# is Windows-only as written — os.path.join would be portable; left as-is.
for i in load_split_files(args.database)["videos"]:
    try:
        uploader = i["uploader_id"]
    except Exception:
        # some records carry no uploader_id at all
        uploader = "unknown"
    if uploader == channel:
        print("{0}:".format(i["id"]))
        # Skip anything whose "<title>-<id>.info.json" is already on disk.
        isalreadydownloaded = 0
        for file in os.listdir(output):
            if os.path.splitext(file)[1] == ".json":
                if file.find("-" + i["id"] + ".info.json") != -1:
                    isalreadydownloaded = 1
        if isalreadydownloaded == 1:  # not sure how to bypass this without having to go out of the for loop, if anyone could tell me how that would be great!
            print(" video already downloaded!")
            continue
        with yt_dlp.YoutubeDL(ytdl_opts) as ytdl:
            try:
                result = ytdl.download(["https://youtube.com/watch?v={0}".format(i["id"])])  # TODO: add check for existing downloaded items and don't download them
                continue
            except Exception:
                print(" video is not available! attempting to find Internet Archive pages of it...")
                if internetarchive.get_item("youtube-{0}".format(i["id"])).exists:  # download from internetarchive if available
                    fnames = [f.name for f in internetarchive.get_files("youtube-{0}".format(i["id"]))]
                    disallowednames = ["__ia_thumb.jpg", "youtube-{0}_archive.torrent".format(i["id"]), "youtube-{0}_files.xml".format(i["id"]), "youtube-{0}_meta.sqlite".format(i["id"]), "youtube-{0}_meta.xml".format(i["id"])]  # list of IA-created files we don't need
                    flist = []
                    for fname in fnames:
                        # if a matroska source exists, prefer it and skip the mp4 derivatives
                        if matroska_find(fnames):
                            if fname[-4:] == ".mp4":
                                continue
                        else:
                            if fname[-7:] == ".ia.mp4":
                                continue
                        if fname.find("/") == -1:
                            if fname not in disallowednames and fname[-21:] != "{0}_thumb.jpg".format(i["id"]) and fname[-15:] != "{0}.ogv".format(i["id"]):
                                flist.append(fname)
                    if len(flist) >= 1:
                        internetarchive.download("youtube-{0}".format(i["id"]), files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
                    else:
                        print(" video already downloaded!")
                        continue
                    # Prefix the downloaded files with the sanitized title so they
                    # match yt-dlp's "<title>-<id>" naming.
                    if os.path.exists(output + "\\" + i["id"] + ".info.json"):  # will always exist no matter which setting was used to download
                        for fname in flist:
                            if os.path.exists(output + "\\" + fname) and not os.path.exists(output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname):
                                os.rename(output + "\\" + fname, output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname)
                    else:
                        print("ID file not found!")
                else:  # download the vid from waybackmachine (NOTE: only tested with youtube links after polymer, however SHOULD work with links created before then)
                    print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...")
                    try:
                        contenttype = urllib.request.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"])).getheader("Content-Type")
                        if contenttype == "video/webm":
                            ext = "webm"
                        else:
                            ext = "mp4"
                        urllib.request.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"]), "{3}\\{0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext, output))
                        print(" downloaded {0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext))
                    except HTTPError:
                        print(" video not available on the Wayback Machine!")
                    except Exception as e:
                        print(" unknown error downloading video!")
                        print(e)
                # metadata: persist the database record next to the video
                with open("{2}\\{0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output), "w") as jsonfile:
                    print(json.dumps(i), end="", file=jsonfile)
                print(" saved {0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output))
