Mercurial > codedump
comparison kemonopartydownloader.py @ 54:5a5d47a795c6
Update kemonopartydownloader.py
how to easily make your code thrice as long
committer: GitHub <noreply@github.com>
| author | Paper <37962225+mrpapersonic@users.noreply.github.com> |
|---|---|
| date | Fri, 06 Aug 2021 02:50:53 -0400 |
| parents | ae64a0c8831b |
| children | 4e5000c9b48f |
comparison
equal
deleted
inserted
replaced
| 53:ae64a0c8831b | 54:5a5d47a795c6 |
|---|---|
| 1 # example args.url: https://kemono.party/fanbox/user/5375435/post/2511461 | 1 # example args.url: https://kemono.party/fanbox/user/5375435/post/2511461 |
| 2 # created by Paper in 2021 | |
| 3 # please do not share without crediting me! | |
| 2 import argparse | 4 import argparse |
| 3 import http.cookiejar | 5 import http.cookiejar |
| 4 import os | 6 import os |
| 5 import re | 7 import re |
| 6 import requests # pip install requests | 8 import requests # pip install requests |
| 7 import time | 9 import time |
| 10 import math | |
| 11 import zipfile | |
| 12 import urllib.parse | |
| 8 from urllib.error import HTTPError | 13 from urllib.error import HTTPError |
| 9 | 14 |
| 10 | 15 |
def get_google_drive_subfolder_ids(link):
    """Scrape a public Google Drive folder page and return the IDs of its
    immediate subfolders, deduplicated, in page order."""
    page = requests.get(link).text
    # Folder entries are embedded in the page JSON as
    # ["<33-char id>",["<parent>"],"<name>","<mimetype>", ...
    entries = re.findall(r"\[\"(.{33}?)\",\[\"(.{33}?)\"\],\"(.+?)\",\"(.+?)\"", page)
    seen = set()
    folder_ids = []
    for entry_id, _parent, _name, mimetype in entries:
        if mimetype == "application/vnd.google-apps.folder" and entry_id not in seen:
            seen.add(entry_id)
            folder_ids.append(entry_id)
    return folder_ids
| 28 | |
| 29 | |
def unzip(src_path, dst_dir, pwd=None):
    """Extract every member of the archive at src_path into dst_dir.

    Members whose destination path already exists are skipped, so an
    interrupted extraction can be resumed.  pwd is an optional archive
    password.
    """
    with zipfile.ZipFile(src_path) as archive:
        for info in archive.infolist():
            local_name = info.filename.replace('/', os.path.sep)
            target = os.path.normpath(os.path.join(dst_dir, local_name))
            if os.path.exists(target):
                continue
            archive.extract(info, dst_dir, pwd)
| 40 | |
| 41 | |
def download_from_dropbox(link):
    """Download a shared Dropbox link into "<output>\\Dropbox - <title>".

    Dropbox serves folder links as a single .zip, which is unzipped and
    deleted after the download completes.  A partially downloaded file is
    resumed with an HTTP Range request.

    Depends on module globals: output (destination root), i (current post
    dict, read for its "title"), req (requests session).
    """
    # "?dl=1" forces a direct download instead of the HTML preview page.
    responsehead = requests.head(link.split("?")[0] + "?dl=1", allow_redirects=True)
    if responsehead.status_code == 404:
        print(" dropbox link not available!")
        return
    if not os.path.exists(output + "\\Dropbox - " + sanitize(i["title"])):
        os.makedirs(output + "\\Dropbox - " + sanitize(i["title"]))
    filename = output + "\\Dropbox - " + sanitize(i["title"]) + "\\" + sanitize(responsehead.headers["Content-Disposition"].split("'")[-1])
    # A directory named after the (decoded) file stem means a previous zip
    # was already downloaded and extracted.
    if os.path.exists(urllib.parse.unquote(os.path.splitext(filename)[0])) and os.path.isdir(urllib.parse.unquote(os.path.splitext(filename)[0])):
        print(" file(s) already downloaded!")
        return
    if os.path.exists(filename):
        filesize = os.stat(filename).st_size
    else:
        filesize = 0
    serverfilesize = int(responsehead.headers["Content-Length"])
    if filesize < serverfilesize:
        # Resume from the current local size and append the missing bytes.
        with req.get(link.split("?")[0] + "?dl=1", stream=True, headers={"Range": f"bytes={filesize}-"}) as r:
            r.raise_for_status()
            with open(filename, "ab") as f:
                for chunk in r.iter_content(chunk_size=4096):
                    f.write(chunk)
                    # BUGFIX: count the bytes actually received (the final
                    # chunk is usually shorter than 4096), so the progress
                    # figure cannot overshoot 100%.
                    filesize += len(chunk)
                    print(" file {0} downloading: ".format(urllib.parse.unquote(responsehead.headers["Content-Disposition"].split("'")[-1])) + str(round((filesize / serverfilesize) * 100)) + "%\r", end="")
        print(" {0} successfully downloaded!".format(urllib.parse.unquote(responsehead.headers["Content-Disposition"].split("'")[-1])))
    if responsehead.headers["Content-Disposition"].split("'")[-1].endswith(".zip"):
        unzip(filename, urllib.parse.unquote(os.path.splitext(filename)[0]))
        os.remove(filename)
| 70 | |
| 71 | |
def download_file_from_google_drive(id):  # https://stackoverflow.com/questions/25010369/wget-curl-large-file-from-google-drive/39225039 ;)
    """Download one Google Drive file by ID into "<output>\\Drive - <title>".

    Handles Google's "can't scan for viruses" confirmation token for large
    files and waits out rate limiting (HTTP 403) in 30 second steps.

    Depends on module globals: output (destination root) and i (current
    post dict, read for its "title").
    """
    def get_confirm_token(response):
        # Drive sets a download_warning cookie when the file is too large
        # for virus scanning and needs an explicit confirm parameter.
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value

        return None

    def save_response_content(response):
        amountdone = 0
        CHUNK_SIZE = 32768
        if not os.path.exists(output + "\\Drive - " + sanitize(i["title"])):
            os.makedirs(output + "\\Drive - " + sanitize(i["title"]))
        destination = output + "\\Drive - " + sanitize(i["title"]) + "\\" + sanitize(response.headers["Content-Disposition"].split("'")[-1])
        if os.path.exists(destination):
            print(" " + os.path.basename(destination) + " already downloaded!")
            return

        # Total size comes from the Content-Range header ("bytes 0-x/total").
        total = int(response.headers["Content-Range"].partition('/')[-1])
        with open(destination, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    # BUGFIX: count bytes actually written instead of the
                    # nominal chunk size, and parenthesize so the ratio
                    # (not the byte count) is multiplied by 100.
                    amountdone += len(chunk)
                    print(" downloading {0}: ".format(os.path.basename(destination)) + " " + str(round(amountdone / total * 100)) + "%\r", end="")
        print(" downloaded {0}".format(os.path.basename(destination)) + ": 100% ")

    URL = "https://docs.google.com/uc?export=download"

    session = requests.Session()

    headers = {
        "Range": "bytes=0-",
    }

    response = session.get(URL, headers=headers, params={'id': id}, stream=True)

    # BUGFIX: status_code is an int; the original compared it against the
    # string "403", so rate-limited downloads were never retried.
    while response.status_code == 403:
        time.sleep(30)
        response = session.get(URL, headers=headers, params={'id': id}, stream=True)

    token = get_confirm_token(response)

    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, headers=headers, params=params, stream=True)

    save_response_content(response)
| 124 | |
| 125 | |
def sanitize(filename):
    """Replace characters Windows forbids in filenames with "_" and trim
    surrounding whitespace.

    The script joins this value with "\\" when building paths, so a stray
    backslash in a post title would otherwise create unintended
    subdirectories.
    """
    # BUGFIX: the original class r"[\/:*?\"<>|]" escaped the forward slash
    # but never matched a literal backslash; add it.
    return re.sub(r"[\\/:*?\"<>|]", "_", filename).strip()
| 128 | |
| 129 | |
def find_urls(s):
    """Return every https:// URL appearing in an href="..." attribute of s,
    with trailing punctuation and embedded HTML tags stripped."""
    hrefs = re.findall(r"href=\"(https://.+?)\"", s)
    return [
        re.sub(r"<[^<]+?>", "", re.sub(r"[^a-zA-Z0-9<>]+$", "", href))
        for href in hrefs
    ]
| 13 | 135 |
| 14 | 136 |
def downloadfile(i, x, count):
    """Download attachment x of post i to
    "<output>\\<post id>_<count>p_<title>_<basename>".

    A partial local file is resumed with an HTTP Range request starting at
    its current size; a file already at the server's size is skipped.
    Errors are printed and swallowed (best-effort download).

    Uses module globals: req (requests session) and output (destination
    directory).
    """
    filename = "{4}\\{0}_{1}p_{2}_{3}".format(i["id"], count, sanitize(i["title"]), os.path.basename(x["path"]), output)
    if os.path.exists(filename):
        filesize = os.stat(filename).st_size
    else:
        filesize = 0
    # HEAD first: the Content-Length tells us whether (and from where) to
    # resume without transferring the body.
    serverhead = req.head("https://data.kemono.party" + x['path'])
    try:
        serverfilesize = int(serverhead.headers["Content-Length"])
        if filesize < serverfilesize:
            # Resume from the current local size and append the rest.
            with req.get(f"https://data.kemono.party{x['path']}", stream=True, headers={"Range": f"bytes={filesize}-"}) as r:
                r.raise_for_status()
                with open(filename, "ab") as f:
                    for chunk in r.iter_content(chunk_size=4096):
                        f.write(chunk)
            print(" image " + str(count) + " successfully downloaded!")
            return
        else:
            print(" image " + str(count) + " already downloaded!")
            return
    except Exception as e:
        # NOTE(review): deliberately broad best-effort handling — a missing
        # Content-Length header or a network error reports and moves on.
        print(" error downloading file!")
        print(e)
| 160 | |
| 161 | |
def parse_json(i, count):
    """Process one post dict i: download external file-host links found in
    its HTML content (Google Drive, Dropbox), then all of its attachments.

    count is the attachment counter to start from.  Uses module globals
    output and req via the helpers it calls.
    """
    seen = set()
    unique_urls = []
    for url in find_urls(i["content"]):
        if url.startswith("https://drive.google.com/drive/folders"):
            if url.split("/")[-1].split("?")[0] not in seen:
                unique_urls.append(url)
                seen.add(url.split("/")[-1].split("?")[0])
        elif url.startswith("https://drive.google.com/open?id="):
            if url.split("?id=")[-1] not in seen:
                unique_urls.append(requests.head(url).headers["Location"])
                # BUGFIX: record the same key the membership test used
                # (the "?id=" value); the original stored a fragment of the
                # URL instead, so duplicate links slipped through.
                seen.add(url.split("?id=")[-1])
        elif url.startswith("https://drive.google.com/file/"):
            if url.split("?")[0].split("/")[-2] not in seen:
                unique_urls.append(url)
                seen.add(url.split("?")[0].split("/")[-2])
        elif url.startswith("https://www.dropbox.com"):
            download_from_dropbox(url)
        else:  # TODO: add MEGA, or some sort of other file hosting website(s). gdrive and dropbox seem like the most popular ones atm
            pass
    for url in unique_urls:
        if url.startswith("https://drive.google.com/drive/folders/"):
            # Google Drive folder downloading
            print(" Google Drive link found! attempting to download its files...")
            root_id = url.split("?")[0].split("/")[-1]
            # BUGFIX: breadth-first walk of the folder tree.  The original
            # reassigned the list it was iterating (losing sibling
            # subtrees) and never queued the root folder itself, so the
            # root's own files were skipped.
            drive_ids_to_download = []
            pending = [root_id]
            while pending:
                folder_id = pending.pop(0)
                drive_ids_to_download.append(folder_id)
                pending.extend(get_google_drive_subfolder_ids("https://drive.google.com/drive/folders/" + folder_id))
            for ids in drive_ids_to_download:
                gdrive = requests.get("https://drive.google.com/drive/folders/" + ids).text
                driveids = re.findall(r'jsdata=" M2rjcd;_;\d (?:.+?);(.+?);', gdrive)
                for driveid in driveids:
                    if not driveid.startswith("driveweb|"):
                        download_file_from_google_drive(driveid)
        elif url.startswith("https://drive.google.com/file/"):
            download_file_from_google_drive(url.split("?")[0].split("/")[-2])
    for x in i["attachments"]:
        count += 1
        # NOTE(review): this guard builds the filename from id - 1, not
        # i["id"] — looks like an off-by-one carried over from the original
        # script; confirm intent before changing.
        while not os.path.exists("{4}\\{0}_{1}p_{2}_{3}".format(int(i["id"]) - 1, count, sanitize(i["title"]), os.path.basename(x["path"]), output)):
            try:
                downloadfile(i, x, count)
                break
            except HTTPError:
                # transient server error: wait, then retry once
                time.sleep(10)
                downloadfile(i, x, count)
            except Exception as e:
                print(e)
                time.sleep(10)
| 214 | |
| 215 | |
def get_amount_of_posts(s, u):
    """Probe the kemono.party API 25 posts at a time and return the number
    of pages service s / user u has.  Uses the module-global session req."""
    offset = 0
    while True:
        posts = req.get("https://kemono.party/api/{0}/user/{1}?o={2}".format(s, u, offset)).json()
        if len(posts) < 25:
            return math.ceil((offset + 1) / 25)
        offset += 25
| 32 | 223 |
| 33 | 224 |
# Command-line interface.  --cookies is mandatory because kemono.party sits
# behind DDoS-GUARD, which only passes requests carrying browser cookies.
parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
parser.add_argument("-u", "--url", help="user URL", metavar='<url>', required=True)
parser.add_argument("-c", "--cookies", help="", metavar='<cookies>', required=True)  # required because of DDoS-GUARD
parser.add_argument("-p", "--proxy", help="proxy\n supported types: http, https, socks5 (requires pysocks)", metavar='<proxy>')  # SOCKS proxy support is through PySocks - pip install pysocks
parser.add_argument("-o", "--output", help="output folder, defaults to user ID", metavar='<output>')
parser.add_argument("--test-download-services", dest="testdownloadservices", nargs="+", help="test download services\nsupported: gdrive, dropbox", metavar="<service>")
args = parser.parse_args()

# One shared session so cookies and the proxy apply to every request.
req = requests.Session()
| 42 | 234 |
| 43 if args.proxy: | 235 if args.proxy: |
| 61 cj.load(ignore_expires=True) | 253 cj.load(ignore_expires=True) |
| 62 req.cookies = cj | 254 req.cookies = cj |
| 63 | 255 |
# Users sometimes paste a bare numeric user ID; the script needs the full
# page URL to infer the service, so reject plain integers early.
try:
    int(args.url)
    print("do not input user IDs in --url! use a link instead")
    exit()
except Exception:
    # not an integer — treat it as a URL
    pass
| 70 | 262 |
# Derive service/user (and post id, for single-post URLs) from the URL path,
# e.g. https://kemono.party/fanbox/user/5375435/post/2511461
if args.url.split("/")[-2] == "post":
    service = args.url.split("/")[-5]
    user = args.url.split("/")[-3]
    post = args.url.split("/")[-1]
elif args.url.split("/")[-2] == "user":
    service = args.url.split("/")[-3]
    user = args.url.split("/")[-1]

# Output folder defaults to the numeric user ID.
if not args.output:
    output = user
else:
    output = args.output

if not os.path.isdir(output):
    # A regular file with the same name would block directory creation.
    if os.path.exists(output):
        os.remove(output)
    os.makedirs(output)
| 90 | 280 |
# --test-download-services: exercise the external-host downloaders against
# known-public test links, then exit without touching kemono.party.
if args.testdownloadservices:
    # The download helpers read the current post from the module-level
    # name "i"; give them a stand-in title.
    i = {
        "title": "Test"
    }
    if "gdrive" in args.testdownloadservices:
        # BUGFIX: the original reused "i" as the loop variable, clobbering
        # the post dict above (breaking the dropbox test that follows),
        # and reassigned the list while iterating it, dropping sibling
        # subtrees from the folder walk.
        root_id = "1sMVOcUesv4Ua_KJ-eQ_CMS_5KkrZGFdF"
        drive_ids_to_download = []
        pending = [root_id]
        while pending:
            folder_id = pending.pop(0)
            drive_ids_to_download.append(folder_id)
            pending.extend(get_google_drive_subfolder_ids("https://drive.google.com/drive/folders/" + folder_id))
        print(drive_ids_to_download)
    if "dropbox" in args.testdownloadservices:
        download_from_dropbox("https://www.dropbox.com/s/yg405bpznyobo3u/test.txt?dl=0")  # File
        download_from_dropbox("https://www.dropbox.com/sh/ne3c7bxtkt5tg4s/AABYPNGfHoil4HO_btudw0wPa?dl=0")  # Folder
    exit()
| 298 | |
# "post" is only bound when the user passed a single-post URL; creator URLs
# page through the API 25 posts at a time.
try:
    post
    singlepost = True
    pages = 1
except Exception:
    singlepost = False
    pages = get_amount_of_posts(service, user)
for page in range(pages):
    if singlepost:
        userdata = req.get("https://kemono.party/api/{0}/user/{1}/post/{2}".format(service, user, post)).json()
    else:
        # BUGFIX: the original re-probed "post" here, but the loop below
        # binds post = i["id"], so every page after the first was fetched
        # from the single-post endpoint instead of the next page of posts.
        userdata = req.get("https://kemono.party/api/{0}/user/{1}?o={2}".format(service, user, (page * 25))).json()
    for i in userdata:
        print(i["id"])
        post = i["id"]
        count = 0
        parse_json(i, count)
