comparison: channeldownloader.py @ 7:571c5525fccb
Use regex instead of weirdness to filter archive.org names
| author | Paper <37962225+mrpapersonic@users.noreply.github.com> |
|---|---|
| committer | GitHub <noreply@github.com> |
| date | Wed, 18 May 2022 23:24:03 -0400 |
| parents | 5d93490e60e2 |
| children | 990fcd424f93 |
| 6:5d93490e60e2 | 7:571c5525fccb |
|---|---|
| 1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
| 2 # | 2 # |
| 3 # download deleted vids from old yt channels | 3 # download deleted vids from old yt channels |
| 4 # script by paper | 4 # script by paper |
| 5 # it's pretty old and could definitely use some refining | |
| 5 | 6 |
| 6 from __future__ import print_function | 7 from __future__ import print_function |
| 7 import argparse | 8 import argparse |
| 8 import internetarchive | 9 import internetarchive |
| 9 try: | 10 try: |
| 10 import orjson as json | 11 import orjson as json |
| 11 except ImportError: | 12 except ImportError: |
| 12 import json | 13 import json |
| 13 import glob | |
| 14 import os | 14 import os |
| 15 import re | 15 import re |
| 16 import time | 16 import time |
| 17 try: | 17 try: |
| 18 import urllib.request as compat_urllib | 18 import urllib.request as compat_urllib |
| 52 print(" an error occurred downloading {0}!") | 52 print(" an error occurred downloading {0}!") |
| 53 | 53 |
| 54 def load_split_files(path): | 54 def load_split_files(path): |
| 55 if os.path.isdir(path): | 55 if os.path.isdir(path): |
| 56 result = {"videos": []} | 56 result = {"videos": []} |
| 57 for f in glob.glob(os.path.join(path, "vids*.json")): | 57 for fi in os.listdir(path): |
| 58 with open(f, "r", encoding="utf-8") as infile: | 58 for f in re.findall(r"vids.+?\.json", fi): |
| 59 for i in json.loads(infile.read())["videos"]: | 59 with open(path + "/" + f, "r", encoding="utf-8") as infile: |
| 60 result["videos"].append(i) | 60 for i in json.loads(infile.read())["videos"]: |
| 61 result["videos"].append(i) | |
| 61 return result | 62 return result |
| 62 else: | 63 else: |
| 63 return json.loads(open(path, "r", encoding="utf-8").read()) | 64 return json.loads(open(path, "r", encoding="utf-8").read()) |
| 64 | 65 |
| 65 def reporthook(count, block_size, total_size): | 66 def reporthook(count, block_size, total_size): |
| 66 global start_time | 67 global start_time |
| 67 if count == 0: | 68 if count == 0: |
| 68 start_time = time.time() | 69 start_time = time.time() |
| 69 return | 70 return |
| 70 duration = time.time() - start_time | 71 duration = time.time() - start_time |
| 71 progress_size = int(count * block_size) | |
| 72 speed = int(progress_size / (1024 * duration)) | |
| 73 percent = int(count * block_size * 100 / total_size) | 72 percent = int(count * block_size * 100 / total_size) |
| 74 print(" downloading %d%% \r" % (percent), end="") | 73 print(" downloading %d%% \r" % (percent), end="") |
| 75 | 74 |
| 76 | 75 |
| 77 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") | 76 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") |
| 119 | 118 |
| 120 for i in load_split_files(args.database)["videos"]: | 119 for i in load_split_files(args.database)["videos"]: |
| 121 uploader = i["uploader_id"] if "uploader_id" in i else None | 120 uploader = i["uploader_id"] if "uploader_id" in i else None |
| 122 if uploader == channel: | 121 if uploader == channel: |
| 123 print("%s:" % i["id"]) | 122 print("%s:" % i["id"]) |
| 124 if os.path.exists(uploader + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"): | 123 if os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"): |
| 125 print(" video already downloaded!") | 124 print(" video already downloaded!") |
| 126 continue | 125 continue |
| 127 # this code is *really* ugly... todo a rewrite? | 126 # this code is *really* ugly... todo a rewrite? |
| 128 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: | 127 with youtube_dl.YoutubeDL(ytdl_opts) as ytdl: |
| 129 try: | 128 try: |
| 131 continue | 130 continue |
| 132 except Exception: | 131 except Exception: |
| 133 print(" video is not available! attempting to find Internet Archive pages of it...") | 132 print(" video is not available! attempting to find Internet Archive pages of it...") |
| 134 if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available | 133 if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available |
| 135 fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])] | 134 fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])] |
| 136 disallowednames = ["__ia_thumb.jpg", "youtube-%s_archive.torrent" % i["id"], "youtube-%s_files.xml" % i["id"], "youtube-%s_meta.sqlite" % i["id"], "youtube-%s_meta.xml" % i["id"]] # list of IA-created files we don't need | |
| 137 flist = [] | 135 flist = [] |
| 138 for fname in fnames: | 136 for fname in range(len(fnames)): |
| 139 if os.path.splitext(fname)[1] in [".mkv", ".webm"]: | 137 if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]): |
| 140 if fname[-4:] == ".mp4": | 138 flist.append(fnames[fname]) |
| 141 continue | |
| 142 else: | |
| 143 if fname[-7:] == ".ia.mp4": | |
| 144 continue | |
| 145 if fname.find("/") == -1: | |
| 146 if fname not in disallowednames and fname[-21:] != "%s_thumb.jpg" % i["id"] and fname[-15:] != "%s.ogv" % i["id"]: | |
| 147 flist.append(fname) | |
| 148 if len(flist) >= 1: | 139 if len(flist) >= 1: |
| 149 internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True) | 140 internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True) |
| 150 else: | 141 else: |
| 151 print(" video already downloaded!") | 142 print(" video already downloaded!") |
| 152 continue | 143 continue |
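For readers following the first hunk: `load_split_files` now filters directory entries with `re.findall` instead of `glob.glob`, which lets the `glob` import go. A minimal standalone sketch of the revised logic, assuming the same `vids*.json` naming scheme the old glob targeted:

```python
import json
import os
import re

def load_split_files(path):
    """Merge the "videos" arrays of every vids*.json file under path."""
    if not os.path.isdir(path):
        with open(path, "r", encoding="utf-8") as infile:
            return json.load(infile)
    result = {"videos": []}
    for name in os.listdir(path):
        # re.findall() returns the matched span or nothing, so the
        # inner loop body runs at most once per directory entry
        for match in re.findall(r"vids.+?\.json", name):
            with open(os.path.join(path, match), "r", encoding="utf-8") as infile:
                result["videos"].extend(json.load(infile)["videos"])
    return result
```

Note the pattern is unanchored, so a stray entry like `oldvids1.json.bak` would still yield a match; `re.fullmatch(r"vids.+\.json", name)` would be stricter, if that ever matters for the files in this repo.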
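The `reporthook` hunk drops the unused speed and progress-size calculations and keeps only the percentage display. For context, this callback has the `(count, block_size, total_size)` shape that `urlretrieve` (imported as `compat_urllib` earlier in the script) expects as its third argument; a minimal Python 3 usage sketch with a hypothetical URL:

```python
import time
import urllib.request

start_time = 0.0

def reporthook(count, block_size, total_size):
    """Progress callback in the shape urllib.request.urlretrieve expects."""
    global start_time
    if count == 0:
        start_time = time.time()  # elapsed time stays available if needed later
        return
    percent = int(count * block_size * 100 / total_size)
    print("    downloading %d%%\r" % percent, end="")

# hypothetical URL and filename, purely for illustration
urllib.request.urlretrieve("https://example.com/file.bin", "file.bin", reporthook)
```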
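The change the commit message refers to is the last hunk: the hand-maintained blacklist of Internet Archive sidecar files (`__ia_thumb.jpg`, `*_archive.torrent`, `*_files.xml`, and so on) is replaced by a single whitelist regex keyed on the video ID. A self-contained sketch of that filter; the ID and filenames below are invented for illustration, and `re.escape` is a small hardening not present in the committed code:

```python
import re

video_id = "dQw4w9WgXcQ"  # hypothetical video ID

# invented examples of what internetarchive.get_files() might list
fnames = [
    "Some Title-dQw4w9WgXcQ.mp4",
    "Some Title-dQw4w9WgXcQ.info.json",
    "__ia_thumb.jpg",
    "youtube-dQw4w9WgXcQ_files.xml",
    "youtube-dQw4w9WgXcQ_archive.torrent",
]

# the new revision's whitelist: an optional "title-" prefix, the video
# ID, a dot, then one of the extensions worth downloading
pattern = r"(?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description)" % re.escape(video_id)

flist = [f for f in fnames if re.search(pattern, f)]
print(flist)  # only the .mp4 and .info.json entries survive
```

Whitelisting by extension also fails closed: a newly introduced IA-generated file is excluded automatically, where the old approach needed every unwanted name enumerated by hand.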
