changeset 56:bde647ac9554
Update kemonopartydownloader.py
uses new zip type google drive folder downloading
"because it's easier"
committer: GitHub <noreply@github.com>
author   | Paper <37962225+mrpapersonic@users.noreply.github.com>
date     | Mon, 16 Aug 2021 23:02:23 -0400
parents  | 4e5000c9b48f
children | 861f4a9992ae
files    | kemonopartydownloader.py
diffstat | 1 files changed, 74 insertions(+), 77 deletions(-)
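This commit replaces the old subfolder-scraping approach with Google's Takeout export endpoint: the new download_folder_from_google_drive() in the diff below creates an export job for the folder, polls it until the job reports SUCCEEDED, then streams the resulting zip archive and unpacks it with unzip(). A minimal standalone sketch of that flow follows; the function name fetch_drive_folder_as_zip and the folder_id, out_path, api_key and poll_interval parameters are illustrative names, not part of the script.

    import time
    import requests

    def fetch_drive_folder_as_zip(folder_id, out_path, api_key, poll_interval=1):
        # hypothetical helper mirroring the flow of download_folder_from_google_drive
        session = requests.Session()
        session.headers = {
            "origin": "https://drive.google.com",
            "content-type": "application/json",
        }
        # 1. ask the Takeout endpoint to build a zip export of the folder
        job_id = session.post(
            "https://takeout-pa.clients6.google.com/v1/exports?key=" + api_key,
            data='{"items":[{"id":"%s"}]}' % folder_id,
        ).json()["exportJob"]["id"]
        # 2. poll the export job until it reports SUCCEEDED
        while True:
            job = session.get(
                "https://takeout-pa.clients6.google.com/v1/exports/{0}?key={1}".format(job_id, api_key)
            ).json()["exportJob"]
            if job["status"] == "SUCCEEDED":
                break
            time.sleep(poll_interval)
        # 3. stream the finished archive (a zip of the folder) to disk
        archive = job["archives"][0]
        with session.get(archive["storagePath"], stream=True) as r, open(out_path, "wb") as f:
            for chunk in r.iter_content(4096):
                if chunk:
                    f.write(chunk)

The endpoint, request body and response fields are the same ones the script itself uses below; the sketch only separates the job-creation, polling, and download steps for readability.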
line diff
--- a/kemonopartydownloader.py	Fri Aug 06 03:46:02 2021 -0400
+++ b/kemonopartydownloader.py	Mon Aug 16 23:02:23 2021 -0400
@@ -11,22 +11,42 @@
 import zipfile
 import urllib.parse
 from urllib.error import HTTPError
+from http.client import BadStatusLine
 
 
-def get_google_drive_subfolder_ids(link):
-    gdrive = requests.get(link).text
-    drivefiles = re.findall(r"\[\"(.{33}?)\",\[\"(.{33}?)\"\],\"(.+?)\",\"(.+?)\"", gdrive)  # format: ["id","name","mimetype"
-    seen = set()
-    unique_ids = []
-    names = []
-    for files in drivefiles:
-        if files[3] != "application/vnd.google-apps.folder":
-            continue
-        if files[0] not in seen:
-            unique_ids.append(files[0])
-            names.append(files[2])
-            seen.add(files[0])
-    return unique_ids, names
+def download_folder_from_google_drive(link):
+    session = requests.Session()
+    session.headers = {
+        'origin': 'https://drive.google.com',
+        'content-type': 'application/json',
+    }
+    key = "AIzaSyC1qbk75NzWBvSaDh6KnsjjA9pIrP4lYIE"  # google anonymous key
+    takeoutjs = session.post(f"https://takeout-pa.clients6.google.com/v1/exports?key={key}", data='{{"items":[{{"id":"{0}"}}]}}'.format(link.split("?")[0].split("/")[-1])).json()
+    takeoutid = takeoutjs["exportJob"]["id"]
+    storagePath = None
+    while storagePath is None:
+        succeededjson = session.get("https://takeout-pa.clients6.google.com/v1/exports/{0}?key={1}".format(takeoutid, key)).json()
+        if succeededjson["exportJob"]["status"] == "SUCCEEDED":
+            storagePath = succeededjson["exportJob"]["archives"][0]["storagePath"]
+        time.sleep(1)
+    size = 0
+    for path, dirs, files in os.walk("./{0}/Drive - {1}".format(output, sanitize(i["title"]))):
+        for f in files:
+            fp = os.path.join(path, f)
+            size += os.path.getsize(fp)
+    if size >= int(succeededjson["exportJob"]["archives"][0]["sizeOfContents"]):
+        print(" {0} already downloaded!".format(succeededjson["exportJob"]["archives"][0]["fileName"]))
+        return
+    response = session.get(storagePath, stream=True)
+    amountdone = 0
+    with open(succeededjson["exportJob"]["archives"][0]["fileName"], "wb") as f:
+        for chunk in response.iter_content(1024):
+            if chunk:  # filter out keep-alive new chunks
+                f.write(chunk)
+                amountdone += 1024
+                print(" downloading {0}: ".format(succeededjson["exportJob"]["archives"][0]["fileName"]) + " " + str(round((amountdone / int(succeededjson['exportJob']['archives'][0]['compressedSize'])) * 100, 2)) + "%\r", end="")
+    print(" downloaded {0}".format(succeededjson["exportJob"]["archives"][0]["fileName"]) + ": 100.00% ")
+    unzip(succeededjson["exportJob"]["archives"][0]["fileName"], "./{0}/Drive - {1}".format(output, sanitize(i["title"])))
 
 
 def unzip(src_path, dst_dir, pwd=None):
@@ -42,13 +62,13 @@
 
 
 def download_from_dropbox(link):
-    responsehead = requests.head(link.split("?")[0] + "?dl=1", allow_redirects=True)
+    responsehead = req.head(link.split("?")[0] + "?dl=1", allow_redirects=True)
     if responsehead.status_code == 404:
         print(" dropbox link not available!")
         return
-    if not os.path.exists(output + "\\Dropbox - " + sanitize(i["title"])):
-        os.makedirs(output + "\\Dropbox - " + sanitize(i["title"]))
-    filename = output + "\\Dropbox - " + sanitize(i["title"]) + "\\" + sanitize(responsehead.headers["Content-Disposition"].split("'")[-1])
+    if not os.path.exists(output + "/Dropbox - " + sanitize(i["title"])):
+        os.makedirs(output + "/Dropbox - " + sanitize(i["title"]))
+    filename = output + "/Dropbox - " + sanitize(i["title"]) + "/" + sanitize(responsehead.headers["Content-Disposition"].split("'")[-1])
     if os.path.exists(urllib.parse.unquote(os.path.splitext(filename)[0])) and os.path.isdir(urllib.parse.unquote(os.path.splitext(filename)[0])):
         print(" file(s) already downloaded!")
         return
@@ -71,7 +91,7 @@
     os.remove(filename)
 
 
-def download_file_from_google_drive(id, dir=""):  # https://stackoverflow.com/questions/25010369/wget-curl-large-file-from-google-drive/39225039 ;)
+def download_file_from_google_drive(id, dir=""):  # https://stackoverflow.com/questions/25010369/wget-curl-large-file-from-google-drive/39225039
     def get_confirm_token(response):
         for key, value in response.cookies.items():
             if key.startswith('download_warning'):
@@ -81,12 +101,10 @@
 
     def save_response_content(response):
         amountdone = 0
-        CHUNK_SIZE = 32768
-        if not os.path.exists(output + "\\Drive - " + sanitize(i["title"])):
-            os.makedirs(output + "\\Drive - " + sanitize(i["title"]))
-        if not os.path.exists(output + "\\Drive - " + sanitize(i["title"]) + "\\" + dir):
-            os.makedirs(output + "\\Drive - " + sanitize(i["title"]) + "\\" + dir)
-        destination = output + "\\Drive - " + sanitize(i["title"]) + "\\" + dir + "\\" + sanitize(response.headers["Content-Disposition"].split("'")[-1])
+        CHUNK_SIZE = 4096
+        if not os.path.exists(output + "/Drive - " + sanitize(i["title"]) + "/" + dir):
+            os.makedirs(output + "/Drive - " + sanitize(i["title"]) + "/" + dir)
+        destination = output + "/Drive - " + sanitize(i["title"]) + "/" + dir + "/" + sanitize(response.headers["Content-Disposition"].split("'")[-1])
         if os.path.exists(destination):
             filesize = os.stat(destination).st_size
         else:
@@ -101,8 +119,8 @@
                 if chunk:  # filter out keep-alive new chunks
                     f.write(chunk)
                     amountdone += CHUNK_SIZE
-                    print(" downloading {0}: ".format(os.path.basename(destination)) + " " + str(round(filesize + amountdone / int(response.headers["Content-Range"].partition('/')[-1])) * 100) + "%\r", end="")
-        print(" downloaded {0}".format(os.path.basename(destination)) + ": 100% ")
+                    print(" downloading {0}: ".format(os.path.basename(destination)) + " " + str(round((amountdone / int(response.headers["Content-Range"].partition('/')[-1])) * 100, 2)) + "%\r", end="")
+        print(" downloaded {0}".format(os.path.basename(destination)) + ": 100.00% ")
 
     URL = "https://docs.google.com/uc?export=download"
 
@@ -112,12 +130,17 @@
     headers = {
         "Range": "bytes=0-",
     }
 
+    session.proxies = req.proxies
+
     response = session.get(URL, headers=headers, params={'id': id}, stream=True)
     while response.status_code == 403:
         time.sleep(30)
         response = session.get(URL, headers=headers, params={'id': id}, stream=True)
 
+    if response.status_code == 404:
+        return  # bypass when root folder has no files
+
     token = get_confirm_token(response)
 
     if token:
@@ -133,19 +156,20 @@
 
 def find_urls(s):
     urllist = []
-    for findall in re.findall("href=\\\"(https://.+?)\\\"", s):
-        urllist.append(re.sub(r"<[^<]+?>", "", re.sub(r"[^a-zA-Z0-9<>]+$", "", findall)))
+    for findall in re.findall(r"""http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+""", s):
+        urllist.append(findall.split("<")[0].split(">")[-1])
     return urllist
 
 
 def downloadfile(i, x, count):
-    filename = "{4}\\{0}_{1}p_{2}_{3}".format(i["id"], count, sanitize(i["title"]), os.path.basename(x["path"]), output)
+    filename = "{4}/{0}_{1}p_{2}_{3}".format(i["id"], count, sanitize(i["title"]), os.path.basename(x["path"]), output)
+    amountdone = 0
     if os.path.exists(filename):
         filesize = os.stat(filename).st_size
     else:
         filesize = 0
     serverhead = req.head("https://data.kemono.party" + x['path'])
-    try:
+    for i in range(500):
         serverfilesize = int(serverhead.headers["Content-Length"])
         if filesize < serverfilesize:
             with req.get(f"https://data.kemono.party{x['path']}", stream=True, headers={"Range": f"bytes={filesize}-"}) as r:
@@ -153,14 +177,16 @@
                 with open(filename, "ab") as f:
                     for chunk in r.iter_content(chunk_size=4096):
                         f.write(chunk)
-            print(" image " + str(count) + " successfully downloaded!")
+                        amountdone += len(chunk)
+                        print(" downloading image " + str(count) + ": " + str(round(((filesize + amountdone) / serverfilesize) * 100, 2)) + "%\r", end="")
+            print(" downloaded image " + str(count) + ": 100.00% ")
             return
         else:
             print(" image " + str(count) + " already downloaded!")
             return
-    except Exception as e:
-        print(" error downloading file!")
-        print(e)
+        time.sleep(10)
+    print(" download timed out!")
+    return
 
 
 def parse_json(i, count):
@@ -173,13 +199,14 @@
                 seen.add(url.split("/")[-1].split("?")[0])
         elif url.startswith("https://drive.google.com/open?id="):
             if url.split("?id=")[-1] not in seen:
-                unique_urls.append(requests.head(url).headers["Location"])
+                unique_urls.append(req.head(url).headers["Location"])
                 seen.add(url.split("/")[-1].split("?")[0])
         elif url.startswith("https://drive.google.com/file/"):
             if url.split("?")[0].split("/")[-2] not in seen:
                 unique_urls.append(url)
                 seen.add(url.split("?")[0].split("/")[-2])
         elif url.startswith("https://www.dropbox.com"):
+            print(" Dropbox link found! attempting to download its files...")
             download_from_dropbox(url)
         else:  # TODO: add MEGA, or some sort of other file hosting website(s). gdrive and dropbox seem like the most popular ones atm
             pass
@@ -187,35 +214,24 @@
         if url.startswith("https://drive.google.com/drive/folders/"):  # Google Drive folder downloading
             print(" Google Drive link found! attempting to download its files...")
-            unique_ids = [url.split("/")[-1].split("?")[0]]
-            drive_ids_to_download = [unique_ids[0]]
-            drive_id_names = {
-                unique_ids[0]: ".",
-            }
-            while len(unique_ids) > 1:
-                for myid in unique_ids:
-                    unique_ids, names = get_google_drive_subfolder_ids("https://drive.google.com/drive/folders/" + myid)
-                    for xd in range(len(unique_ids)):
-                        drive_ids_to_download.append(unique_ids[xd])
-                        drive_id_names[unique_ids[xd]] = names[xd]
-            for ids in drive_ids_to_download:
-                gdrive = requests.get("https://drive.google.com/drive/folders/" + ids).text
-                driveids = re.findall(r'jsdata=" M2rjcd;_;\d (?:.+?);(.+?);', gdrive)
-                for driveid in driveids:
-                    if not driveid.startswith("driveweb|"):
-                        download_file_from_google_drive(driveid, dir=drive_id_names[ids])
+            download_folder_from_google_drive(url)
         elif url.startswith("https://drive.google.com/file/"):
             print(" Google Drive link found! attempting to download its files...")
             download_file_from_google_drive(url.split("?")[0].split("/")[-2])
 
     for x in i["attachments"]:
         count += 1
-        while not os.path.exists("{4}\\{0}_{1}p_{2}_{3}".format(int(i["id"]) - 1, count, sanitize(i["title"]), os.path.basename(x["path"]), output)):
+        while not os.path.exists("{4}/{0}_{1}p_{2}_{3}".format(int(i["id"]) - 1, count, sanitize(i["title"]), os.path.basename(x["path"]), output)):
             try:
                 downloadfile(i, x, count)
                 break
            except HTTPError:
-                time.sleep(10)
-                downloadfile(i, x, count)
+                while 1:
+                    time.sleep(10)
+                    downloadfile(i, x, count)
+            except BadStatusLine:  # DDoS-GUARD
+                while 1:
+                    time.sleep(10)
+                    downloadfile(i, x, count)
             except Exception as e:
                 print(e)
                 time.sleep(10)
@@ -230,12 +246,12 @@
         amount += 25
 
 
-parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
+parser = argparse.ArgumentParser(description="Downloads files from kemono.party")
 parser.add_argument("-u", "--url", help="user URL", metavar='<url>', required=True)
 parser.add_argument("-c", "--cookies", help="", metavar='<cookies>', required=True)  # required because of DDoS-GUARD
 parser.add_argument("-p", "--proxy", help="proxy\n supported types: http, https, socks5 (requires pysocks)", metavar='<proxy>')  # SOCKS proxy support is through PySocks - pip install pysocks
 parser.add_argument("-o", "--output", help="output folder, defaults to user ID", metavar='<output>')
-parser.add_argument("--test-download-services", dest="testdownloadservices", nargs="+", help="test download services\nsupported: gdrive, dropbox", metavar="<service>")
+parser.add_argument("--test-download-services", dest="testdownloadservices", action="store_true", help="test download services")
 args = parser.parse_args()
 
 req = requests.Session()
@@ -286,24 +302,6 @@
         os.remove(output)
     os.makedirs(output)
 
-if args.testdownloadservices:
-    i = {
-        "title": "Test"
-    }
-    if "gdrive" in args.testdownloadservices:
-        unique_ids = ["1sMVOcUesv4Ua_KJ-eQ_CMS_5KkrZGFdF"]
-        drive_ids_to_download = [unique_ids[0].split("?")[0].split("/")[-1]]
-        while len(unique_ids) > 0:
-            for i in unique_ids:
-                unique_ids = get_google_drive_subfolder_ids("https://drive.google.com/drive/folders/" + i)
-                for ids in unique_ids:
-                    drive_ids_to_download.append(ids)
-        print(drive_ids_to_download)
-    if "dropbox" in args.testdownloadservices:
-        download_from_dropbox("https://www.dropbox.com/s/yg405bpznyobo3u/test.txt?dl=0")  # File
-        download_from_dropbox("https://www.dropbox.com/sh/ne3c7bxtkt5tg4s/AABYPNGfHoil4HO_btudw0wPa?dl=0")  # Folder
-    exit()
-
 try:
     post
     pages = 1
@@ -317,6 +315,5 @@
     userdata = req.get("https://kemono.party/api/{0}/user/{1}?o={2}".format(service, user, (page * 25))).json()
     for i in userdata:
         print(i["id"])
-        post = i["id"]
         count = 0
         parse_json(i, count)