comparison channeldownloader.py @ 67:9636d5dee08c

[channeldownloader.py] Python 2.7 compatibility Also make the code a *lot* more optimized (e.g. removing the unnecessary double for-loop) committer: GitHub <noreply@github.com>
author Paper <37962225+mrpapersonic@users.noreply.github.com>
date Wed, 18 May 2022 18:57:58 -0400
parents c615532e6572
children a43ed076b28f
comparison
equal deleted inserted replaced
66:ff473892908c 67:9636d5dee08c
1 #!/usr/bin/env python3
2 #
3 # download deleted vids from old yt channels
4 # script by paper
5
6 from __future__ import print_function
1 import argparse 7 import argparse
2 import internetarchive # pip install internetarchive 8 import internetarchive
3 import json 9 try:
10 import orjson as json
11 except ImportError:
12 import json
4 import glob 13 import glob
5 import os 14 import os
6 import re 15 import re
7 import urllib.request 16 try:
8 import yt_dlp # pip install yt-dlp 17 import urllib.request as compat_urllib
9 import itertools 18 from urllib.error import HTTPError
10 from urllib.error import HTTPError 19 except ImportError: # Python 2
11 from yt_dlp.utils import sanitize_filename 20 import urllib as compat_urllib
21 from urllib2 import HTTPError
22 try:
23 import yt_dlp as youtube_dl
24 from yt_dlp.utils import sanitize_filename
25 except ImportError:
26 try:
27 import youtube_dl
28 from youtube_dl.utils import sanitize_filename
29 except ImportError:
30 print("ERROR: youtube-dl/yt-dlp not installed!")
31 exit(1)
32 from io import open # for Python 2 compatibility, in Python 3 this
33 # just maps to the built-in function
12 34
class MyLogger(object):
    """Silent logger handed to youtube-dl/yt-dlp: swallows every message
    so download progress is reported only through the progress hook."""

    def debug(self, msg):
        pass

    def warning(self, msg):
        pass

    def error(self, msg):
        pass
22 44
def matroska_find(filelist):
    """Return True if any name in *filelist* uses a Matroska-family
    container extension (.mkv or .webm), else False."""
    return any(os.path.splitext(name)[1] in (".mkv", ".webm") for name in filelist)
28
def ytdl_hook(d):
    """Progress hook for youtube-dl/yt-dlp.

    *d* is the status dict yt-dlp passes to progress hooks; only the
    "status", "filename" and (while downloading) "_percent_str" keys
    are read.  Prints a one-line status to stdout.
    """
    if d["status"] == "finished":
        # "%%" escapes the percent sign: a bare "% " in a %-formatted
        # string raises "unsupported format character ' '".
        print(" downloaded %s: 100%% " % (os.path.basename(d["filename"])))
    if d["status"] == "downloading":
        print(" downloading %s: %s\r" % (os.path.basename(d["filename"]), d["_percent_str"]), end="")
    if d["status"] == "error":
        # The original printed the literal "{0}" placeholder (no .format
        # call); substitute the actual file name instead.
        print(" an error occurred downloading %s!" % os.path.basename(d["filename"]))
36 52
def load_split_files(path):
    """Load the video database from *path*.

    *path* is either a single JSON file or a directory of JSON chunks;
    in the directory case the "videos" lists of every "*.json" file are
    merged.  Returns a dict carrying a "videos" list either way.
    """
    if os.path.isdir(path):
        # NOTE(review): the diff view elides the accumulator/glob setup
        # lines; this assumes a fresh {"videos": []} dict filled from
        # every *.json chunk in the directory -- confirm against the
        # full file.
        result = {"videos": []}
        for chunk in glob.glob(os.path.join(path, "*.json")):
            with open(chunk, "r", encoding="utf-8") as infile:
                result["videos"].extend(json.loads(infile.read())["videos"])
        return result
    # `with` closes the handle; the original leaked an open file object.
    with open(path, "r", encoding="utf-8") as infile:
        return json.loads(infile.read())
47 63
48 64
49 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") 65 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
50 parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True) 66 parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True)
51 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar='<path>', required=True) 67 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar='<path>', required=True)
64 80
65 if not os.path.exists(output): 81 if not os.path.exists(output):
66 os.mkdir(output) 82 os.mkdir(output)
67 83
# Options handed to youtube_dl.YoutubeDL / yt_dlp.YoutubeDL below.
ytdl_opts = {
    # Plain concatenation: the original "%s/%(title)s-%(id)s.%(ext)s" % (output)
    # raises at definition time ("format requires a mapping") because the
    # template itself contains %(name)s mapping keys, so it cannot be built
    # with positional %-formatting.  yt-dlp expands the %(...)s fields later.
    "outtmpl": output + "/%(title)s-%(id)s.%(ext)s",
    "retries": 100,
    "nooverwrites": True,
    "call_home": False,
    "quiet": True,
    "writeinfojson": True,
    # NOTE(review): the diff view elides several ytdl_opts entries between
    # "writeinfojson" and "progress_hooks" (rendered source lines 91-102);
    # retain those entries when applying this fix.
    "progress_hooks": [ytdl_hook],
    "logger": MyLogger(),
    "ignoreerrors": False,
}
91 107
# Walk every record in the database; for videos by the requested channel,
# try youtube-dl/yt-dlp first, then fall back to an Internet Archive item,
# then to the Wayback Machine.
for i in load_split_files(args.database)["videos"]:
    uploader = i["uploader_id"] if "uploader_id" in i else None
    if uploader == channel:
        print("%s:" % i["id"])
        # NOTE(review): this looks in uploader + "/" while downloads are
        # written to `output` -- confirm the two always name the same dir.
        if os.path.exists(uploader + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"):
            print(" video already downloaded!")
            continue
        # this code is *really* ugly... todo a rewrite?
        with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
            try:
                result = ytdl.download(["https://youtube.com/watch?v=%s" % i["id"]])  # TODO: add check for existing downloaded items and don't download them
                continue
            except Exception:
                print(" video is not available! attempting to find Internet Archive pages of it...")
        if internetarchive.get_item("youtube-%s" % i["id"]).exists:  # download from internetarchive if available
            fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])]
            disallowednames = ["__ia_thumb.jpg", "youtube-%s_archive.torrent" % i["id"], "youtube-%s_files.xml" % i["id"], "youtube-%s_meta.sqlite" % i["id"], "youtube-%s_meta.xml" % i["id"]]  # list of IA-created files we don't need
            # Prefer Matroska containers: if ANY mkv/webm exists for this
            # item, skip the plain .mp4 copies; otherwise only skip IA's
            # re-encoded .ia.mp4 derivatives.  (Checking the CURRENT file's
            # own extension here -- as this revision did -- made the .mp4
            # skip unreachable, since a .mkv name never ends in ".mp4".)
            has_matroska = any(os.path.splitext(f)[1] in (".mkv", ".webm") for f in fnames)
            flist = []
            for fname in fnames:
                if has_matroska:
                    if fname[-4:] == ".mp4":
                        continue
                else:
                    if fname[-7:] == ".ia.mp4":
                        continue
                if fname.find("/") == -1:  # skip files inside subdirectories
                    if fname not in disallowednames and fname[-21:] != "%s_thumb.jpg" % i["id"] and fname[-15:] != "%s.ogv" % i["id"]:
                        flist.append(fname)
            if len(flist) >= 1:
                internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
            else:
                print(" video already downloaded!")
                continue
            if os.path.exists(output + "/" + i["id"] + ".info.json"):  # will always exist no matter which setting was used to download
                for fname in flist:
                    if os.path.exists(output + "/" + fname) and not os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname):
                        os.rename(output + "/" + fname, output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + fname)
            else:
                print("ID file not found!")
        else:
            print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...")
            try:
                headers = compat_urllib.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"])
                if hasattr(headers.info(), "getheader"):  # Python 2 httplib message
                    contenttype = headers.info().getheader("Content-Type")
                else:  # Python 3 http.client.HTTPResponse
                    contenttype = headers.getheader("Content-Type")
                ext = "webm" if contenttype == "video/webm" else "mp4"
                compat_urllib.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s" % i["id"], "%s/%s-%s.%s" % (output, sanitize_filename(i["title"], restricted=True), i["id"], ext))
                print(" downloaded %s-%s.%s" % (sanitize_filename(i["title"], restricted=True), i["id"], ext))
            except HTTPError:
                print(" video not available on the Wayback Machine!")
            except Exception as e:
                print(" unknown error downloading video!\n")
                print(e)
            # metadata
            # NOTE(review): the diff view does not show the indentation of
            # this write; it is placed here (Wayback branch only) so it does
            # not clobber the richer .info.json an IA item provides -- confirm
            # against the full file.
            with open("%s/%s-%s.info.json" % (output, sanitize_filename(i["title"], restricted=True), i["id"]), "w", encoding="utf-8") as jsonfile:
                # json.dumps() returns str under the stdlib json module but
                # bytes under orjson (which also rejects the ensure_ascii
                # kwarg), so the original unconditional .decode('utf-8')
                # crashed whenever stdlib json was in use.
                try:
                    serialized = json.dumps(i, ensure_ascii=False)
                except TypeError:  # orjson takes no ensure_ascii argument
                    serialized = json.dumps(i)
                if isinstance(serialized, bytes):
                    serialized = serialized.decode("utf-8")
                jsonfile.write(serialized)
            print(" saved %s" % os.path.basename(jsonfile.name))