comparison kemonopartydownloader.py @ 56:bde647ac9554
Update kemonopartydownloader.py
uses new zip-type Google Drive folder downloading
"because it's easier"
| | |
| --- | --- |
| committer | GitHub <noreply@github.com> |
| author | Paper <37962225+mrpapersonic@users.noreply.github.com> |
| date | Mon, 16 Aug 2021 23:02:23 -0400 |
| parents | 4e5000c9b48f |
| children | d2e0edd4a070 |
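For reference, the approach this changeset switches to asks Google's Takeout endpoint to build a zip export of the whole Drive folder, polls the export job until it succeeds, and then streams the finished archive to disk. The sketch below condenses that flow from the new `download_folder_from_google_drive` in the diff; the endpoint, anonymous key, polling loop, and streamed download come straight from the changeset, while the function name `fetch_drive_folder_zip`, the `out_path` parameter, and the omission of the already-downloaded size check are illustrative simplifications.

```python
import time
import requests


def fetch_drive_folder_zip(folder_id, out_path="export.zip"):
    """Condensed sketch of the zip-export flow used in this changeset."""
    session = requests.Session()
    session.headers = {
        "origin": "https://drive.google.com",
        "content-type": "application/json",
    }
    key = "AIzaSyC1qbk75NzWBvSaDh6KnsjjA9pIrP4lYIE"  # anonymous key, as in the diff

    # Ask Takeout to build a zip export of the folder.
    export = session.post(
        "https://takeout-pa.clients6.google.com/v1/exports?key=" + key,
        data='{"items":[{"id":"%s"}]}' % folder_id,
    ).json()
    job_id = export["exportJob"]["id"]

    # Poll until the export job reports SUCCEEDED and exposes a storagePath.
    archive = None
    while archive is None:
        status = session.get(
            "https://takeout-pa.clients6.google.com/v1/exports/{0}?key={1}".format(job_id, key)
        ).json()
        if status["exportJob"]["status"] == "SUCCEEDED":
            archive = status["exportJob"]["archives"][0]
        else:
            time.sleep(1)

    # Stream the finished zip archive to disk.
    with session.get(archive["storagePath"], stream=True) as r, \
            open(out_path, "wb") as f:
        for chunk in r.iter_content(4096):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    return out_path
```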
comparison
55:4e5000c9b48f | 56:bde647ac9554 |
---|---|
9 import time | 9 import time |
10 import math | 10 import math |
11 import zipfile | 11 import zipfile |
12 import urllib.parse | 12 import urllib.parse |
13 from urllib.error import HTTPError | 13 from urllib.error import HTTPError |
14 | 14 from http.client import BadStatusLine |
15 | 15 |
16 def get_google_drive_subfolder_ids(link): | 16 |
17 gdrive = requests.get(link).text | 17 def download_folder_from_google_drive(link): |
18 drivefiles = re.findall(r"\[\"(.{33}?)\",\[\"(.{33}?)\"\],\"(.+?)\",\"(.+?)\"", gdrive) # format: ["id","name","mimetype" | 18 session = requests.Session() |
19 seen = set() | 19 session.headers = { |
20 unique_ids = [] | 20 'origin': 'https://drive.google.com', |
21 names = [] | 21 'content-type': 'application/json', |
22 for files in drivefiles: | 22 } |
23 if files[3] != "application/vnd.google-apps.folder": | 23 key = "AIzaSyC1qbk75NzWBvSaDh6KnsjjA9pIrP4lYIE" # google anonymous key |
24 continue | 24 takeoutjs = session.post(f"https://takeout-pa.clients6.google.com/v1/exports?key={key}", data='{{"items":[{{"id":"{0}"}}]}}'.format(link.split("?")[0].split("/")[-1])).json() |
25 if files[0] not in seen: | 25 takeoutid = takeoutjs["exportJob"]["id"] |
26 unique_ids.append(files[0]) | 26 storagePath = None |
27 names.append(files[2]) | 27 while storagePath is None: |
28 seen.add(files[0]) | 28 succeededjson = session.get("https://takeout-pa.clients6.google.com/v1/exports/{0}?key={1}".format(takeoutid, key)).json() |
29 return unique_ids, names | 29 if succeededjson["exportJob"]["status"] == "SUCCEEDED": |
30 storagePath = succeededjson["exportJob"]["archives"][0]["storagePath"] | |
31 time.sleep(1) | |
32 size = 0 | |
33 for path, dirs, files in os.walk("./{0}/Drive - {1}".format(output, sanitize(i["title"]))): | |
34 for f in files: | |
35 fp = os.path.join(path, f) | |
36 size += os.path.getsize(fp) | |
37 if size >= int(succeededjson["exportJob"]["archives"][0]["sizeOfContents"]): | |
38 print(" {0} already downloaded!".format(succeededjson["exportJob"]["archives"][0]["fileName"])) | |
39 return | |
40 response = session.get(storagePath, stream=True) | |
41 amountdone = 0 | |
42 with open(succeededjson["exportJob"]["archives"][0]["fileName"], "wb") as f: | |
43 for chunk in response.iter_content(1024): | |
44 if chunk: # filter out keep-alive new chunks | |
45 f.write(chunk) | |
46 amountdone += 1024 | |
47 print(" downloading {0}: ".format(succeededjson["exportJob"]["archives"][0]["fileName"]) + " " + str(round((amountdone / int(succeededjson['exportJob']['archives'][0]['compressedSize'])) * 100, 2)) + "%\r", end="") | |
48 print(" downloaded {0}".format(succeededjson["exportJob"]["archives"][0]["fileName"]) + ": 100.00% ") | |
49 unzip(succeededjson["exportJob"]["archives"][0]["fileName"], "./{0}/Drive - {1}".format(output, sanitize(i["title"]))) | |
30 | 50 |
31 | 51 |
32 def unzip(src_path, dst_dir, pwd=None): | 52 def unzip(src_path, dst_dir, pwd=None): |
33 with zipfile.ZipFile(src_path) as zf: | 53 with zipfile.ZipFile(src_path) as zf: |
34 members = zf.namelist() | 54 members = zf.namelist() |
40 if not os.path.exists(dst_path): | 60 if not os.path.exists(dst_path): |
41 zf.extract(arch_info, dst_dir, pwd) | 61 zf.extract(arch_info, dst_dir, pwd) |
42 | 62 |
43 | 63 |
44 def download_from_dropbox(link): | 64 def download_from_dropbox(link): |
45 responsehead = requests.head(link.split("?")[0] + "?dl=1", allow_redirects=True) | 65 responsehead = req.head(link.split("?")[0] + "?dl=1", allow_redirects=True) |
46 if responsehead.status_code == 404: | 66 if responsehead.status_code == 404: |
47 print(" dropbox link not available!") | 67 print(" dropbox link not available!") |
48 return | 68 return |
49 if not os.path.exists(output + "\\Dropbox - " + sanitize(i["title"])): | 69 if not os.path.exists(output + "/Dropbox - " + sanitize(i["title"])): |
50 os.makedirs(output + "\\Dropbox - " + sanitize(i["title"])) | 70 os.makedirs(output + "/Dropbox - " + sanitize(i["title"])) |
51 filename = output + "\\Dropbox - " + sanitize(i["title"]) + "\\" + sanitize(responsehead.headers["Content-Disposition"].split("'")[-1]) | 71 filename = output + "/Dropbox - " + sanitize(i["title"]) + "/" + sanitize(responsehead.headers["Content-Disposition"].split("'")[-1]) |
52 if os.path.exists(urllib.parse.unquote(os.path.splitext(filename)[0])) and os.path.isdir(urllib.parse.unquote(os.path.splitext(filename)[0])): | 72 if os.path.exists(urllib.parse.unquote(os.path.splitext(filename)[0])) and os.path.isdir(urllib.parse.unquote(os.path.splitext(filename)[0])): |
53 print(" file(s) already downloaded!") | 73 print(" file(s) already downloaded!") |
54 return | 74 return |
55 if os.path.exists(filename): | 75 if os.path.exists(filename): |
56 filesize = os.stat(filename).st_size | 76 filesize = os.stat(filename).st_size |
69 if responsehead.headers["Content-Disposition"].split("'")[-1].endswith(".zip"): | 89 if responsehead.headers["Content-Disposition"].split("'")[-1].endswith(".zip"): |
70 unzip(filename, urllib.parse.unquote(os.path.splitext(filename)[0])) | 90 unzip(filename, urllib.parse.unquote(os.path.splitext(filename)[0])) |
71 os.remove(filename) | 91 os.remove(filename) |
72 | 92 |
73 | 93 |
74 def download_file_from_google_drive(id, dir=""): # https://stackoverflow.com/questions/25010369/wget-curl-large-file-from-google-drive/39225039 ;) | 94 def download_file_from_google_drive(id, dir=""): # https://stackoverflow.com/questions/25010369/wget-curl-large-file-from-google-drive/39225039 |
75 def get_confirm_token(response): | 95 def get_confirm_token(response): |
76 for key, value in response.cookies.items(): | 96 for key, value in response.cookies.items(): |
77 if key.startswith('download_warning'): | 97 if key.startswith('download_warning'): |
78 return value | 98 return value |
79 | 99 |
80 return None | 100 return None |
81 | 101 |
82 def save_response_content(response): | 102 def save_response_content(response): |
83 amountdone = 0 | 103 amountdone = 0 |
84 CHUNK_SIZE = 32768 | 104 CHUNK_SIZE = 4096 |
85 if not os.path.exists(output + "\\Drive - " + sanitize(i["title"])): | 105 if not os.path.exists(output + "/Drive - " + sanitize(i["title"]) + "/" + dir): |
86 os.makedirs(output + "\\Drive - " + sanitize(i["title"])) | 106 os.makedirs(output + "/Drive - " + sanitize(i["title"]) + "/" + dir) |
87 if not os.path.exists(output + "\\Drive - " + sanitize(i["title"]) + "\\" + dir): | 107 destination = output + "/Drive - " + sanitize(i["title"]) + "/" + dir + "/" + sanitize(response.headers["Content-Disposition"].split("'")[-1]) |
88 os.makedirs(output + "\\Drive - " + sanitize(i["title"]) + "\\" + dir) | |
89 destination = output + "\\Drive - " + sanitize(i["title"]) + "\\" + dir + "\\" + sanitize(response.headers["Content-Disposition"].split("'")[-1]) | |
90 if os.path.exists(destination): | 108 if os.path.exists(destination): |
91 filesize = os.stat(destination).st_size | 109 filesize = os.stat(destination).st_size |
92 else: | 110 else: |
93 filesize = 0 | 111 filesize = 0 |
94 | 112 |
99 with open(destination, "wb") as f: | 117 with open(destination, "wb") as f: |
100 for chunk in response.iter_content(CHUNK_SIZE): | 118 for chunk in response.iter_content(CHUNK_SIZE): |
101 if chunk: # filter out keep-alive new chunks | 119 if chunk: # filter out keep-alive new chunks |
102 f.write(chunk) | 120 f.write(chunk) |
103 amountdone += CHUNK_SIZE | 121 amountdone += CHUNK_SIZE |
104 print(" downloading {0}: ".format(os.path.basename(destination)) + " " + str(round(filesize + amountdone / int(response.headers["Content-Range"].partition('/')[-1])) * 100) + "%\r", end="") | 122 print(" downloading {0}: ".format(os.path.basename(destination)) + " " + str(round((amountdone / int(response.headers["Content-Range"].partition('/')[-1])) * 100, 2)) + "%\r", end="") |
105 print(" downloaded {0}".format(os.path.basename(destination)) + ": 100% ") | 123 print(" downloaded {0}".format(os.path.basename(destination)) + ": 100.00% ") |
106 | 124 |
107 URL = "https://docs.google.com/uc?export=download" | 125 URL = "https://docs.google.com/uc?export=download" |
108 | 126 |
109 session = requests.Session() | 127 session = requests.Session() |
110 | 128 |
111 headers = { | 129 headers = { |
112 "Range": "bytes=0-", | 130 "Range": "bytes=0-", |
113 } | 131 } |
114 | 132 |
133 session.proxies = req.proxies | |
134 | |
115 response = session.get(URL, headers=headers, params={'id': id}, stream=True) | 135 response = session.get(URL, headers=headers, params={'id': id}, stream=True) |
116 | 136 |
117 while response.status_code == 403: | 137 while response.status_code == 403: |
118 time.sleep(30) | 138 time.sleep(30) |
119 response = session.get(URL, headers=headers, params={'id': id}, stream=True) | 139 response = session.get(URL, headers=headers, params={'id': id}, stream=True) |
120 | 140 |
141 if response.status_code == 404: | |
142 return # bypass when root folder has no files | |
143 | |
121 token = get_confirm_token(response) | 144 token = get_confirm_token(response) |
122 | 145 |
123 if token: | 146 if token: |
124 params = {'id': id, 'confirm': token} | 147 params = {'id': id, 'confirm': token} |
125 response = session.get(URL, headers=headers, params=params, stream=True) | 148 response = session.get(URL, headers=headers, params=params, stream=True) |
131 return re.sub(r"[\/:*?\"<>|]", "_", filename).strip() | 154 return re.sub(r"[\/:*?\"<>|]", "_", filename).strip() |
132 | 155 |
133 | 156 |
134 def find_urls(s): | 157 def find_urls(s): |
135 urllist = [] | 158 urllist = [] |
136 for findall in re.findall("href=\\\"(https://.+?)\\\"", s): | 159 for findall in re.findall(r"""http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+""", s): |
137 urllist.append(re.sub(r"<[^<]+?>", "", re.sub(r"[^a-zA-Z0-9<>]+$", "", findall))) | 160 urllist.append(findall.split("<")[0].split(">")[-1]) |
138 return urllist | 161 return urllist |
139 | 162 |
140 | 163 |
141 def downloadfile(i, x, count): | 164 def downloadfile(i, x, count): |
142 filename = "{4}\\{0}_{1}p_{2}_{3}".format(i["id"], count, sanitize(i["title"]), os.path.basename(x["path"]), output) | 165 filename = "{4}/{0}_{1}p_{2}_{3}".format(i["id"], count, sanitize(i["title"]), os.path.basename(x["path"]), output) |
166 amountdone = 0 | |
143 if os.path.exists(filename): | 167 if os.path.exists(filename): |
144 filesize = os.stat(filename).st_size | 168 filesize = os.stat(filename).st_size |
145 else: | 169 else: |
146 filesize = 0 | 170 filesize = 0 |
147 serverhead = req.head("https://data.kemono.party" + x['path']) | 171 serverhead = req.head("https://data.kemono.party" + x['path']) |
148 try: | 172 for i in range(500): |
149 serverfilesize = int(serverhead.headers["Content-Length"]) | 173 serverfilesize = int(serverhead.headers["Content-Length"]) |
150 if filesize < serverfilesize: | 174 if filesize < serverfilesize: |
151 with req.get(f"https://data.kemono.party{x['path']}", stream=True, headers={"Range": f"bytes={filesize}-"}) as r: | 175 with req.get(f"https://data.kemono.party{x['path']}", stream=True, headers={"Range": f"bytes={filesize}-"}) as r: |
152 r.raise_for_status() | 176 r.raise_for_status() |
153 with open(filename, "ab") as f: | 177 with open(filename, "ab") as f: |
154 for chunk in r.iter_content(chunk_size=4096): | 178 for chunk in r.iter_content(chunk_size=4096): |
155 f.write(chunk) | 179 f.write(chunk) |
156 print(" image " + str(count) + " successfully downloaded!") | 180 amountdone += len(chunk) |
181 print(" downloading image " + str(count) + ": " + str(round(((filesize + amountdone) / serverfilesize) * 100, 2)) + "%\r", end="") | |
182 print(" downloaded image " + str(count) + ": 100.00% ") | |
157 return | 183 return |
158 else: | 184 else: |
159 print(" image " + str(count) + " already downloaded!") | 185 print(" image " + str(count) + " already downloaded!") |
160 return | 186 return |
161 except Exception as e: | 187 time.sleep(10) |
162 print(" error downloading file!") | 188 print(" download timed out!") |
163 print(e) | 189 return |
164 | 190 |
165 | 191 |
166 def parse_json(i, count): | 192 def parse_json(i, count): |
167 seen = set() | 193 seen = set() |
168 unique_urls = [] | 194 unique_urls = [] |
171 if url.split("/")[-1].split("?")[0] not in seen: | 197 if url.split("/")[-1].split("?")[0] not in seen: |
172 unique_urls.append(url) | 198 unique_urls.append(url) |
173 seen.add(url.split("/")[-1].split("?")[0]) | 199 seen.add(url.split("/")[-1].split("?")[0]) |
174 elif url.startswith("https://drive.google.com/open?id="): | 200 elif url.startswith("https://drive.google.com/open?id="): |
175 if url.split("?id=")[-1] not in seen: | 201 if url.split("?id=")[-1] not in seen: |
176 unique_urls.append(requests.head(url).headers["Location"]) | 202 unique_urls.append(req.head(url).headers["Location"]) |
177 seen.add(url.split("/")[-1].split("?")[0]) | 203 seen.add(url.split("/")[-1].split("?")[0]) |
178 elif url.startswith("https://drive.google.com/file/"): | 204 elif url.startswith("https://drive.google.com/file/"): |
179 if url.split("?")[0].split("/")[-2] not in seen: | 205 if url.split("?")[0].split("/")[-2] not in seen: |
180 unique_urls.append(url) | 206 unique_urls.append(url) |
181 seen.add(url.split("?")[0].split("/")[-2]) | 207 seen.add(url.split("?")[0].split("/")[-2]) |
182 elif url.startswith("https://www.dropbox.com"): | 208 elif url.startswith("https://www.dropbox.com"): |
209 print(" Dropbox link found! attempting to download its files...") | |
183 download_from_dropbox(url) | 210 download_from_dropbox(url) |
184 else: # TODO: add MEGA, or some sort of other file hosting website(s). gdrive and dropbox seem like the most popular ones atm | 211 else: # TODO: add MEGA, or some sort of other file hosting website(s). gdrive and dropbox seem like the most popular ones atm |
185 pass | 212 pass |
186 for url in unique_urls: | 213 for url in unique_urls: |
187 if url.startswith("https://drive.google.com/drive/folders/"): | 214 if url.startswith("https://drive.google.com/drive/folders/"): |
188 # Google Drive folder downloading | 215 # Google Drive folder downloading |
189 print(" Google Drive link found! attempting to download its files...") | 216 print(" Google Drive link found! attempting to download its files...") |
190 unique_ids = [url.split("/")[-1].split("?")[0]] | 217 download_folder_from_google_drive(url) |
191 drive_ids_to_download = [unique_ids[0]] | |
192 drive_id_names = { | |
193 unique_ids[0]: ".", | |
194 } | |
195 while len(unique_ids) > 1: | |
196 for myid in unique_ids: | |
197 unique_ids, names = get_google_drive_subfolder_ids("https://drive.google.com/drive/folders/" + myid) | |
198 for xd in range(len(unique_ids)): | |
199 drive_ids_to_download.append(unique_ids[xd]) | |
200 drive_id_names[unique_ids[xd]] = names[xd] | |
201 for ids in drive_ids_to_download: | |
202 gdrive = requests.get("https://drive.google.com/drive/folders/" + ids).text | |
203 driveids = re.findall(r'jsdata=" M2rjcd;_;\d (?:.+?);(.+?);', gdrive) | |
204 for driveid in driveids: | |
205 if not driveid.startswith("driveweb|"): | |
206 download_file_from_google_drive(driveid, dir=drive_id_names[ids]) | |
207 elif url.startswith("https://drive.google.com/file/"): | 218 elif url.startswith("https://drive.google.com/file/"): |
208 print(" Google Drive link found! attempting to download its files...") | 219 print(" Google Drive link found! attempting to download its files...") |
209 download_file_from_google_drive(url.split("?")[0].split("/")[-2]) | 220 download_file_from_google_drive(url.split("?")[0].split("/")[-2]) |
210 for x in i["attachments"]: | 221 for x in i["attachments"]: |
211 count += 1 | 222 count += 1 |
212 while not os.path.exists("{4}\\{0}_{1}p_{2}_{3}".format(int(i["id"]) - 1, count, sanitize(i["title"]), os.path.basename(x["path"]), output)): | 223 while not os.path.exists("{4}/{0}_{1}p_{2}_{3}".format(int(i["id"]) - 1, count, sanitize(i["title"]), os.path.basename(x["path"]), output)): |
213 try: | 224 try: |
214 downloadfile(i, x, count) | 225 downloadfile(i, x, count) |
215 break | 226 break |
216 except HTTPError: | 227 except HTTPError: |
217 time.sleep(10) | 228 while 1: |
218 downloadfile(i, x, count) | 229 time.sleep(10) |
230 downloadfile(i, x, count) | |
231 except BadStatusLine: # DDoS-GUARD | |
232 while 1: | |
233 time.sleep(10) | |
234 downloadfile(i, x, count) | |
219 except Exception as e: | 235 except Exception as e: |
220 print(e) | 236 print(e) |
221 time.sleep(10) | 237 time.sleep(10) |
222 | 238 |
223 | 239 |
228 if len(data) < 25: | 244 if len(data) < 25: |
229 return math.ceil((amount + 1) / 25) | 245 return math.ceil((amount + 1) / 25) |
230 amount += 25 | 246 amount += 25 |
231 | 247 |
232 | 248 |
233 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") | 249 parser = argparse.ArgumentParser(description="Downloads files from kemono.party") |
234 parser.add_argument("-u", "--url", help="user URL", metavar='<url>', required=True) | 250 parser.add_argument("-u", "--url", help="user URL", metavar='<url>', required=True) |
235 parser.add_argument("-c", "--cookies", help="", metavar='<cookies>', required=True) # required because of DDoS-GUARD | 251 parser.add_argument("-c", "--cookies", help="", metavar='<cookies>', required=True) # required because of DDoS-GUARD |
236 parser.add_argument("-p", "--proxy", help="proxy\n supported types: http, https, socks5 (requires pysocks)", metavar='<proxy>') # SOCKS proxy support is through PySocks - pip install pysocks | 252 parser.add_argument("-p", "--proxy", help="proxy\n supported types: http, https, socks5 (requires pysocks)", metavar='<proxy>') # SOCKS proxy support is through PySocks - pip install pysocks |
237 parser.add_argument("-o", "--output", help="output folder, defaults to user ID", metavar='<output>') | 253 parser.add_argument("-o", "--output", help="output folder, defaults to user ID", metavar='<output>') |
238 parser.add_argument("--test-download-services", dest="testdownloadservices", nargs="+", help="test download services\nsupported: gdrive, dropbox", metavar="<service>") | 254 parser.add_argument("--test-download-services", dest="testdownloadservices", action="store_true", help="test download services") |
239 args = parser.parse_args() | 255 args = parser.parse_args() |
240 | 256 |
241 req = requests.Session() | 257 req = requests.Session() |
242 | 258 |
243 if args.proxy: | 259 if args.proxy: |
284 if not os.path.isdir(output): | 300 if not os.path.isdir(output): |
285 if os.path.exists(output): | 301 if os.path.exists(output): |
286 os.remove(output) | 302 os.remove(output) |
287 os.makedirs(output) | 303 os.makedirs(output) |
288 | 304 |
289 if args.testdownloadservices: | |
290 i = { | |
291 "title": "Test" | |
292 } | |
293 if "gdrive" in args.testdownloadservices: | |
294 unique_ids = ["1sMVOcUesv4Ua_KJ-eQ_CMS_5KkrZGFdF"] | |
295 drive_ids_to_download = [unique_ids[0].split("?")[0].split("/")[-1]] | |
296 while len(unique_ids) > 0: | |
297 for i in unique_ids: | |
298 unique_ids = get_google_drive_subfolder_ids("https://drive.google.com/drive/folders/" + i) | |
299 for ids in unique_ids: | |
300 drive_ids_to_download.append(ids) | |
301 print(drive_ids_to_download) | |
302 if "dropbox" in args.testdownloadservices: | |
303 download_from_dropbox("https://www.dropbox.com/s/yg405bpznyobo3u/test.txt?dl=0") # File | |
304 download_from_dropbox("https://www.dropbox.com/sh/ne3c7bxtkt5tg4s/AABYPNGfHoil4HO_btudw0wPa?dl=0") # Folder | |
305 exit() | |
306 | |
307 try: | 305 try: |
308 post | 306 post |
309 pages = 1 | 307 pages = 1 |
310 except Exception: | 308 except Exception: |
311 pages = get_amount_of_posts(service, user) | 309 pages = get_amount_of_posts(service, user) |
315 userdata = req.get("https://kemono.party/api/{0}/user/{1}/post/{2}".format(service, user, post)).json() | 313 userdata = req.get("https://kemono.party/api/{0}/user/{1}/post/{2}".format(service, user, post)).json() |
316 except Exception: | 314 except Exception: |
317 userdata = req.get("https://kemono.party/api/{0}/user/{1}?o={2}".format(service, user, (page * 25))).json() | 315 userdata = req.get("https://kemono.party/api/{0}/user/{1}?o={2}".format(service, user, (page * 25))).json() |
318 for i in userdata: | 316 for i in userdata: |
319 print(i["id"]) | 317 print(i["id"]) |
320 post = i["id"] | |
321 count = 0 | 318 count = 0 |
322 parse_json(i, count) | 319 parse_json(i, count) |