codedump: comparison channeldownloader.py @ 69:63e6bc911606
Use regex instead of weirdness to filter archive.org names
author     Paper <37962225+mrpapersonic@users.noreply.github.com>
committer  GitHub <noreply@github.com>
date       Wed, 18 May 2022 23:24:03 -0400
parents    a43ed076b28f
children   eafe13de3f76
--- channeldownloader.py  68:a43ed076b28f
+++ channeldownloader.py  69:63e6bc911606
@@ -1,18 +1,18 @@
 #!/usr/bin/env python3
 #
 # download deleted vids from old yt channels
 # script by paper
+# it's pretty old and could definitely use some refining
 
 from __future__ import print_function
 import argparse
 import internetarchive
 try:
     import orjson as json
 except ImportError:
     import json
-import glob
 import os
 import re
 import time
 try:
     import urllib.request as compat_urllib
@@ -52,13 +52,14 @@
         print(" an error occurred downloading {0}!")
 
 def load_split_files(path):
     if os.path.isdir(path):
         result = {"videos": []}
-        for f in glob.glob(os.path.join(path, "vids*.json")):
-            with open(f, "r", encoding="utf-8") as infile:
-                for i in json.loads(infile.read())["videos"]:
-                    result["videos"].append(i)
+        for fi in os.listdir(path):
+            for f in re.findall(r"vids.+?\.json", fi):
+                with open(path + "/" + f, "r", encoding="utf-8") as infile:
+                    for i in json.loads(infile.read())["videos"]:
+                        result["videos"].append(i)
         return result
     else:
         return json.loads(open(path, "r", encoding="utf-8").read())
 
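The new load_split_files() keeps any directory entry containing a "vids…json" substring. Two details are worth noting: re.findall() is unanchored, and it returns the matched substring rather than the whole filename, so the subsequent open() only succeeds when the match happens to cover the entire name. A standalone illustration (the sample filenames here are made up):

    import re

    names = ["vids1.json", "vids_part2.json", "old_vids_backup.json", "vids.txt"]
    for fi in names:
        # unanchored search; the match, not the filename, is what gets opened
        print(fi, re.findall(r"vids.+?\.json", fi))
    # vids1.json ['vids1.json']
    # vids_part2.json ['vids_part2.json']
    # old_vids_backup.json ['vids_backup.json']   <- match != filename
    # vids.txt []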
@@ -65,13 +66,11 @@
 def reporthook(count, block_size, total_size):
     global start_time
     if count == 0:
         start_time = time.time()
         return
     duration = time.time() - start_time
-    progress_size = int(count * block_size)
-    speed = int(progress_size / (1024 * duration))
     percent = int(count * block_size * 100 / total_size)
     print(" downloading %d%% \r" % (percent), end="")
 
 
 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
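reporthook() follows the (count, block_size, total_size) callback convention that urllib's urlretrieve uses, which lines up with the compat_urllib import at the top of the file. A minimal sketch of how such a hook is driven (the URL and filename are placeholders, not taken from the script):

    try:
        import urllib.request as compat_urllib  # Python 3
    except ImportError:
        import urllib as compat_urllib  # assumed Python 2 fallback

    # urlretrieve calls the hook after every block it writes
    compat_urllib.urlretrieve("https://example.com/file.bin", "file.bin",
                              reporthook)

With progress_size and speed removed, the hook now only prints a percentage; duration is still computed but no longer used.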
@@ -119,11 +118,11 @@
 
 for i in load_split_files(args.database)["videos"]:
     uploader = i["uploader_id"] if "uploader_id" in i else None
     if uploader == channel:
         print("%s:" % i["id"])
-        if os.path.exists(uploader + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"):
+        if os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"):
             print(" video already downloaded!")
             continue
         # this code is *really* ugly... todo a rewrite?
         with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
             try:
@@ -131,22 +130,14 @@
                 continue
             except Exception:
                 print(" video is not available! attempting to find Internet Archive pages of it...")
                 if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available
                     fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])]
-                    disallowednames = ["__ia_thumb.jpg", "youtube-%s_archive.torrent" % i["id"], "youtube-%s_files.xml" % i["id"], "youtube-%s_meta.sqlite" % i["id"], "youtube-%s_meta.xml" % i["id"]] # list of IA-created files we don't need
                     flist = []
-                    for fname in fnames:
-                        if os.path.splitext(fname)[1] in [".mkv", ".webm"]:
-                            if fname[-4:] == ".mp4":
-                                continue
-                        else:
-                            if fname[-7:] == ".ia.mp4":
-                                continue
-                        if fname.find("/") == -1:
-                            if fname not in disallowednames and fname[-21:] != "%s_thumb.jpg" % i["id"] and fname[-15:] != "%s.ogv" % i["id"]:
-                                flist.append(fname)
+                    for fname in range(len(fnames)):
+                        if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]):
+                            flist.append(fnames[fname])
                     if len(flist) >= 1:
                         internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
                     else:
                         print(" video already downloaded!")
                         continue
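The rewritten filter replaces the hard-coded disallowednames list and the suffix checks with a single pattern keyed on the video ID: a filename survives if the ID, optionally preceded by a "title-" prefix, is followed by one of the wanted extensions. A quick demonstration against hypothetical archive.org-style filenames (re.escape is added here for safety; the script interpolates the raw ID):

    import re

    video_id = "dQw4w9WgXcQ"  # hypothetical video ID
    fnames = [
        "Some Title-dQw4w9WgXcQ.mp4",     # kept: title-prefixed video
        "dQw4w9WgXcQ.info.json",          # kept: metadata sidecar
        "__ia_thumb.jpg",                 # dropped: IA bookkeeping file
        "youtube-dQw4w9WgXcQ_files.xml",  # dropped: ID not followed by "."
        "dQw4w9WgXcQ.ogv",                # dropped: extension not in the list
    ]
    pattern = r"((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % re.escape(video_id)
    print([f for f in fnames if re.search(pattern, f)])
    # ['Some Title-dQw4w9WgXcQ.mp4', 'dQw4w9WgXcQ.info.json']

One behavioural difference from the old version: the fname.find("/") == -1 check is gone, so files inside subdirectories of the item can now match as well.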