comparison kemonopartydownloader.py @ 56:bde647ac9554

Update kemonopartydownloader.py: use new zip-type Google Drive folder downloading "because it's easier"
committer GitHub <noreply@github.com>
author Paper <37962225+mrpapersonic@users.noreply.github.com>
date Mon, 16 Aug 2021 23:02:23 -0400
parents 4e5000c9b48f
children d2e0edd4a070
comparing 55:4e5000c9b48f with 56:bde647ac9554
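
The patch replaces the old approach of scraping a Drive folder's HTML for file and subfolder IDs (get_google_drive_subfolder_ids plus one download_file_from_google_drive call per file) with a single Google Takeout export: create an export job for the folder, poll it until it reports SUCCEEDED, stream the resulting zip, and hand it to unzip(). Stripped of the script's globals (output, i, sanitize), the flow looks roughly like the sketch below; the takeout-pa endpoint is undocumented and the anonymous key is the one hard-coded in the patch, so treat this as a sketch of the idea rather than a supported API.

    import json
    import time

    import requests

    ANON_KEY = "AIzaSyC1qbk75NzWBvSaDh6KnsjjA9pIrP4lYIE"  # anonymous key hard-coded in the patch


    def export_drive_folder(folder_id, out_path):
        # Sketch of the Takeout flow used by download_folder_from_google_drive().
        session = requests.Session()
        session.headers.update({
            "origin": "https://drive.google.com",
            "content-type": "application/json",
        })
        # 1. ask Takeout to build a zip archive of the folder
        job = session.post(
            "https://takeout-pa.clients6.google.com/v1/exports?key=" + ANON_KEY,
            data=json.dumps({"items": [{"id": folder_id}]}),
        ).json()["exportJob"]
        # 2. poll until the archive is ready
        while True:
            status = session.get(
                "https://takeout-pa.clients6.google.com/v1/exports/{0}?key={1}".format(job["id"], ANON_KEY)
            ).json()["exportJob"]
            if status["status"] == "SUCCEEDED":
                break
            time.sleep(1)
        # 3. stream the finished archive to disk (the patch then unzips it)
        archive = status["archives"][0]
        with session.get(archive["storagePath"], stream=True) as r, open(out_path, "wb") as f:
            for chunk in r.iter_content(64 * 1024):
                f.write(chunk)

The trade-off the commit message alludes to: one zip per folder is much simpler than crawling subfolders, but the whole folder is re-exported even when only part of it changed, which is why the patch compares the on-disk size against sizeOfContents before downloading again.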
--- kemonopartydownloader.py  (55:4e5000c9b48f)
+++ kemonopartydownloader.py  (56:bde647ac9554)
@@ -9,26 +9,46 @@
 import time
 import math
 import zipfile
 import urllib.parse
 from urllib.error import HTTPError
+from http.client import BadStatusLine
 
 
-def get_google_drive_subfolder_ids(link):
-    gdrive = requests.get(link).text
-    drivefiles = re.findall(r"\[\"(.{33}?)\",\[\"(.{33}?)\"\],\"(.+?)\",\"(.+?)\"", gdrive)  # format: ["id","name","mimetype"
-    seen = set()
-    unique_ids = []
-    names = []
-    for files in drivefiles:
-        if files[3] != "application/vnd.google-apps.folder":
-            continue
-        if files[0] not in seen:
-            unique_ids.append(files[0])
-            names.append(files[2])
-            seen.add(files[0])
-    return unique_ids, names
+def download_folder_from_google_drive(link):
+    session = requests.Session()
+    session.headers = {
+        'origin': 'https://drive.google.com',
+        'content-type': 'application/json',
+    }
+    key = "AIzaSyC1qbk75NzWBvSaDh6KnsjjA9pIrP4lYIE"  # google anonymous key
+    takeoutjs = session.post(f"https://takeout-pa.clients6.google.com/v1/exports?key={key}", data='{{"items":[{{"id":"{0}"}}]}}'.format(link.split("?")[0].split("/")[-1])).json()
+    takeoutid = takeoutjs["exportJob"]["id"]
+    storagePath = None
+    while storagePath is None:
+        succeededjson = session.get("https://takeout-pa.clients6.google.com/v1/exports/{0}?key={1}".format(takeoutid, key)).json()
+        if succeededjson["exportJob"]["status"] == "SUCCEEDED":
+            storagePath = succeededjson["exportJob"]["archives"][0]["storagePath"]
+        time.sleep(1)
+    size = 0
+    for path, dirs, files in os.walk("./{0}/Drive - {1}".format(output, sanitize(i["title"]))):
+        for f in files:
+            fp = os.path.join(path, f)
+            size += os.path.getsize(fp)
+    if size >= int(succeededjson["exportJob"]["archives"][0]["sizeOfContents"]):
+        print(" {0} already downloaded!".format(succeededjson["exportJob"]["archives"][0]["fileName"]))
+        return
+    response = session.get(storagePath, stream=True)
+    amountdone = 0
+    with open(succeededjson["exportJob"]["archives"][0]["fileName"], "wb") as f:
+        for chunk in response.iter_content(1024):
+            if chunk:  # filter out keep-alive new chunks
+                f.write(chunk)
+                amountdone += 1024
+                print(" downloading {0}: ".format(succeededjson["exportJob"]["archives"][0]["fileName"]) + " " + str(round((amountdone / int(succeededjson['exportJob']['archives'][0]['compressedSize'])) * 100, 2)) + "%\r", end="")
+    print(" downloaded {0}".format(succeededjson["exportJob"]["archives"][0]["fileName"]) + ": 100.00% ")
+    unzip(succeededjson["exportJob"]["archives"][0]["fileName"], "./{0}/Drive - {1}".format(output, sanitize(i["title"])))
 
 
 def unzip(src_path, dst_dir, pwd=None):
     with zipfile.ZipFile(src_path) as zf:
         members = zf.namelist()
@@ -40,17 +60,17 @@
         if not os.path.exists(dst_path):
             zf.extract(arch_info, dst_dir, pwd)
 
 
 def download_from_dropbox(link):
-    responsehead = requests.head(link.split("?")[0] + "?dl=1", allow_redirects=True)
+    responsehead = req.head(link.split("?")[0] + "?dl=1", allow_redirects=True)
     if responsehead.status_code == 404:
         print(" dropbox link not available!")
         return
-    if not os.path.exists(output + "\\Dropbox - " + sanitize(i["title"])):
-        os.makedirs(output + "\\Dropbox - " + sanitize(i["title"]))
-    filename = output + "\\Dropbox - " + sanitize(i["title"]) + "\\" + sanitize(responsehead.headers["Content-Disposition"].split("'")[-1])
+    if not os.path.exists(output + "/Dropbox - " + sanitize(i["title"])):
+        os.makedirs(output + "/Dropbox - " + sanitize(i["title"]))
+    filename = output + "/Dropbox - " + sanitize(i["title"]) + "/" + sanitize(responsehead.headers["Content-Disposition"].split("'")[-1])
     if os.path.exists(urllib.parse.unquote(os.path.splitext(filename)[0])) and os.path.isdir(urllib.parse.unquote(os.path.splitext(filename)[0])):
         print(" file(s) already downloaded!")
         return
     if os.path.exists(filename):
         filesize = os.stat(filename).st_size
@@ -69,26 +89,24 @@
     if responsehead.headers["Content-Disposition"].split("'")[-1].endswith(".zip"):
         unzip(filename, urllib.parse.unquote(os.path.splitext(filename)[0]))
         os.remove(filename)
 
 
-def download_file_from_google_drive(id, dir=""):  # https://stackoverflow.com/questions/25010369/wget-curl-large-file-from-google-drive/39225039 ;)
+def download_file_from_google_drive(id, dir=""):  # https://stackoverflow.com/questions/25010369/wget-curl-large-file-from-google-drive/39225039
     def get_confirm_token(response):
         for key, value in response.cookies.items():
             if key.startswith('download_warning'):
                 return value
 
         return None
 
     def save_response_content(response):
         amountdone = 0
-        CHUNK_SIZE = 32768
-        if not os.path.exists(output + "\\Drive - " + sanitize(i["title"])):
-            os.makedirs(output + "\\Drive - " + sanitize(i["title"]))
-        if not os.path.exists(output + "\\Drive - " + sanitize(i["title"]) + "\\" + dir):
-            os.makedirs(output + "\\Drive - " + sanitize(i["title"]) + "\\" + dir)
-        destination = output + "\\Drive - " + sanitize(i["title"]) + "\\" + dir + "\\" + sanitize(response.headers["Content-Disposition"].split("'")[-1])
+        CHUNK_SIZE = 4096
+        if not os.path.exists(output + "/Drive - " + sanitize(i["title"]) + "/" + dir):
+            os.makedirs(output + "/Drive - " + sanitize(i["title"]) + "/" + dir)
+        destination = output + "/Drive - " + sanitize(i["title"]) + "/" + dir + "/" + sanitize(response.headers["Content-Disposition"].split("'")[-1])
         if os.path.exists(destination):
             filesize = os.stat(destination).st_size
         else:
             filesize = 0
 
@@ -99,27 +117,32 @@
         with open(destination, "wb") as f:
             for chunk in response.iter_content(CHUNK_SIZE):
                 if chunk:  # filter out keep-alive new chunks
                     f.write(chunk)
                     amountdone += CHUNK_SIZE
-                    print(" downloading {0}: ".format(os.path.basename(destination)) + " " + str(round(filesize + amountdone / int(response.headers["Content-Range"].partition('/')[-1])) * 100) + "%\r", end="")
-        print(" downloaded {0}".format(os.path.basename(destination)) + ": 100% ")
+                    print(" downloading {0}: ".format(os.path.basename(destination)) + " " + str(round((amountdone / int(response.headers["Content-Range"].partition('/')[-1])) * 100, 2)) + "%\r", end="")
+        print(" downloaded {0}".format(os.path.basename(destination)) + ": 100.00% ")
 
     URL = "https://docs.google.com/uc?export=download"
 
     session = requests.Session()
 
     headers = {
         "Range": "bytes=0-",
     }
 
+    session.proxies = req.proxies
+
     response = session.get(URL, headers=headers, params={'id': id}, stream=True)
 
     while response.status_code == 403:
         time.sleep(30)
         response = session.get(URL, headers=headers, params={'id': id}, stream=True)
 
+    if response.status_code == 404:
+        return  # bypass when root folder has no files
+
     token = get_confirm_token(response)
 
     if token:
         params = {'id': id, 'confirm': token}
         response = session.get(URL, headers=headers, params=params, stream=True)
@@ -131,38 +154,41 @@
     return re.sub(r"[\/:*?\"<>|]", "_", filename).strip()
 
 
 def find_urls(s):
     urllist = []
-    for findall in re.findall("href=\\\"(https://.+?)\\\"", s):
-        urllist.append(re.sub(r"<[^<]+?>", "", re.sub(r"[^a-zA-Z0-9<>]+$", "", findall)))
+    for findall in re.findall(r"""http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+""", s):
+        urllist.append(findall.split("<")[0].split(">")[-1])
     return urllist
 
 
 def downloadfile(i, x, count):
-    filename = "{4}\\{0}_{1}p_{2}_{3}".format(i["id"], count, sanitize(i["title"]), os.path.basename(x["path"]), output)
+    filename = "{4}/{0}_{1}p_{2}_{3}".format(i["id"], count, sanitize(i["title"]), os.path.basename(x["path"]), output)
+    amountdone = 0
     if os.path.exists(filename):
         filesize = os.stat(filename).st_size
     else:
         filesize = 0
     serverhead = req.head("https://data.kemono.party" + x['path'])
-    try:
+    for i in range(500):
         serverfilesize = int(serverhead.headers["Content-Length"])
         if filesize < serverfilesize:
             with req.get(f"https://data.kemono.party{x['path']}", stream=True, headers={"Range": f"bytes={filesize}-"}) as r:
                 r.raise_for_status()
                 with open(filename, "ab") as f:
                     for chunk in r.iter_content(chunk_size=4096):
                         f.write(chunk)
-            print(" image " + str(count) + " successfully downloaded!")
+                        amountdone += len(chunk)
+                        print(" downloading image " + str(count) + ": " + str(round(((filesize + amountdone) / serverfilesize) * 100, 2)) + "%\r", end="")
+            print(" downloaded image " + str(count) + ": 100.00% ")
             return
         else:
             print(" image " + str(count) + " already downloaded!")
             return
-    except Exception as e:
-        print(" error downloading file!")
-        print(e)
+        time.sleep(10)
+    print(" download timed out!")
+    return
 
 
 def parse_json(i, count):
     seen = set()
     unique_urls = []
@@ -171,53 +197,43 @@
             if url.split("/")[-1].split("?")[0] not in seen:
                 unique_urls.append(url)
                 seen.add(url.split("/")[-1].split("?")[0])
         elif url.startswith("https://drive.google.com/open?id="):
             if url.split("?id=")[-1] not in seen:
-                unique_urls.append(requests.head(url).headers["Location"])
+                unique_urls.append(req.head(url).headers["Location"])
                 seen.add(url.split("/")[-1].split("?")[0])
         elif url.startswith("https://drive.google.com/file/"):
             if url.split("?")[0].split("/")[-2] not in seen:
                 unique_urls.append(url)
                 seen.add(url.split("?")[0].split("/")[-2])
         elif url.startswith("https://www.dropbox.com"):
+            print(" Dropbox link found! attempting to download its files...")
             download_from_dropbox(url)
         else:  # TODO: add MEGA, or some sort of other file hosting website(s). gdrive and dropbox seem like the most popular ones atm
             pass
     for url in unique_urls:
         if url.startswith("https://drive.google.com/drive/folders/"):
             # Google Drive folder downloading
             print(" Google Drive link found! attempting to download its files...")
-            unique_ids = [url.split("/")[-1].split("?")[0]]
-            drive_ids_to_download = [unique_ids[0]]
-            drive_id_names = {
-                unique_ids[0]: ".",
-            }
-            while len(unique_ids) > 1:
-                for myid in unique_ids:
-                    unique_ids, names = get_google_drive_subfolder_ids("https://drive.google.com/drive/folders/" + myid)
-                    for xd in range(len(unique_ids)):
-                        drive_ids_to_download.append(unique_ids[xd])
-                        drive_id_names[unique_ids[xd]] = names[xd]
-            for ids in drive_ids_to_download:
-                gdrive = requests.get("https://drive.google.com/drive/folders/" + ids).text
-                driveids = re.findall(r'jsdata=" M2rjcd;_;\d (?:.+?);(.+?);', gdrive)
-                for driveid in driveids:
-                    if not driveid.startswith("driveweb|"):
-                        download_file_from_google_drive(driveid, dir=drive_id_names[ids])
+            download_folder_from_google_drive(url)
         elif url.startswith("https://drive.google.com/file/"):
             print(" Google Drive link found! attempting to download its files...")
             download_file_from_google_drive(url.split("?")[0].split("/")[-2])
     for x in i["attachments"]:
         count += 1
-        while not os.path.exists("{4}\\{0}_{1}p_{2}_{3}".format(int(i["id"]) - 1, count, sanitize(i["title"]), os.path.basename(x["path"]), output)):
+        while not os.path.exists("{4}/{0}_{1}p_{2}_{3}".format(int(i["id"]) - 1, count, sanitize(i["title"]), os.path.basename(x["path"]), output)):
             try:
                 downloadfile(i, x, count)
                 break
             except HTTPError:
-                time.sleep(10)
-                downloadfile(i, x, count)
+                while 1:
+                    time.sleep(10)
+                    downloadfile(i, x, count)
+            except BadStatusLine:  # DDoS-GUARD
+                while 1:
+                    time.sleep(10)
+                    downloadfile(i, x, count)
             except Exception as e:
                 print(e)
                 time.sleep(10)
 
 
@@ -228,16 +244,16 @@
         if len(data) < 25:
             return math.ceil((amount + 1) / 25)
         amount += 25
 
 
-parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
+parser = argparse.ArgumentParser(description="Downloads files from kemono.party")
 parser.add_argument("-u", "--url", help="user URL", metavar='<url>', required=True)
 parser.add_argument("-c", "--cookies", help="", metavar='<cookies>', required=True)  # required because of DDoS-GUARD
 parser.add_argument("-p", "--proxy", help="proxy\n supported types: http, https, socks5 (requires pysocks)", metavar='<proxy>')  # SOCKS proxy support is through PySocks - pip install pysocks
 parser.add_argument("-o", "--output", help="output folder, defaults to user ID", metavar='<output>')
-parser.add_argument("--test-download-services", dest="testdownloadservices", nargs="+", help="test download services\nsupported: gdrive, dropbox", metavar="<service>")
+parser.add_argument("--test-download-services", dest="testdownloadservices", action="store_true", help="test download services")
 args = parser.parse_args()
 
 req = requests.Session()
 
 if args.proxy:
@@ -284,28 +300,10 @@
 if not os.path.isdir(output):
     if os.path.exists(output):
         os.remove(output)
     os.makedirs(output)
 
-if args.testdownloadservices:
-    i = {
-        "title": "Test"
-    }
-    if "gdrive" in args.testdownloadservices:
-        unique_ids = ["1sMVOcUesv4Ua_KJ-eQ_CMS_5KkrZGFdF"]
-        drive_ids_to_download = [unique_ids[0].split("?")[0].split("/")[-1]]
-        while len(unique_ids) > 0:
-            for i in unique_ids:
-                unique_ids = get_google_drive_subfolder_ids("https://drive.google.com/drive/folders/" + i)
-                for ids in unique_ids:
-                    drive_ids_to_download.append(ids)
-        print(drive_ids_to_download)
-    if "dropbox" in args.testdownloadservices:
-        download_from_dropbox("https://www.dropbox.com/s/yg405bpznyobo3u/test.txt?dl=0")  # File
-        download_from_dropbox("https://www.dropbox.com/sh/ne3c7bxtkt5tg4s/AABYPNGfHoil4HO_btudw0wPa?dl=0")  # Folder
-    exit()
-
 try:
     post
     pages = 1
 except Exception:
     pages = get_amount_of_posts(service, user)
@@ -315,8 +313,7 @@
         userdata = req.get("https://kemono.party/api/{0}/user/{1}/post/{2}".format(service, user, post)).json()
     except Exception:
         userdata = req.get("https://kemono.party/api/{0}/user/{1}?o={2}".format(service, user, (page * 25))).json()
     for i in userdata:
         print(i["id"])
-        post = i["id"]
         count = 0
         parse_json(i, count)
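
For reference, the reworked downloadfile() keeps relying on the same resume pattern the script already used: HEAD the file to learn its full size, then request only the missing byte range and append it to the partial file, now with a progress readout and a retry loop. Reduced to its core, and assuming the server honors Range requests as data.kemono.party does here, the pattern looks roughly like this (hypothetical url and filename arguments, no retries or progress printing):

    import os

    import requests


    def resume_download(url, filename):
        # Bytes already on disk from a previous, possibly interrupted, run.
        have = os.path.getsize(filename) if os.path.exists(filename) else 0
        total = int(requests.head(url).headers["Content-Length"])
        if have >= total:
            return  # nothing left to fetch
        # Ask the server for the missing tail only and append it to the partial file.
        with requests.get(url, stream=True, headers={"Range": "bytes={0}-".format(have)}) as r:
            r.raise_for_status()
            with open(filename, "ab") as f:
                for chunk in r.iter_content(chunk_size=4096):
                    f.write(chunk)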