comparison channeldownloader.py @ 69:63e6bc911606

Use regex instead of weirdness to filter archive.org names

author      Paper <37962225+mrpapersonic@users.noreply.github.com>
committer   GitHub <noreply@github.com>
date        Wed, 18 May 2022 23:24:03 -0400
parents     a43ed076b28f
children    eafe13de3f76
--- 68:a43ed076b28f
+++ 69:63e6bc911606
@@ -1,18 +1,18 @@
 #!/usr/bin/env python3
 #
 # download deleted vids from old yt channels
 # script by paper
+# it's pretty old and could definitely use some refining
 
 from __future__ import print_function
 import argparse
 import internetarchive
 try:
     import orjson as json
 except ImportError:
     import json
-import glob
 import os
 import re
 import time
 try:
     import urllib.request as compat_urllib
@@ -52,26 +52,25 @@
     print(" an error occurred downloading {0}!")
 
 def load_split_files(path):
     if os.path.isdir(path):
         result = {"videos": []}
-        for f in glob.glob(os.path.join(path, "vids*.json")):
-            with open(f, "r", encoding="utf-8") as infile:
-                for i in json.loads(infile.read())["videos"]:
-                    result["videos"].append(i)
+        for fi in os.listdir(path):
+            for f in re.findall(r"vids.+?\.json", fi):
+                with open(path + "/" + f, "r", encoding="utf-8") as infile:
+                    for i in json.loads(infile.read())["videos"]:
+                        result["videos"].append(i)
         return result
     else:
         return json.loads(open(path, "r", encoding="utf-8").read())
 
 def reporthook(count, block_size, total_size):
     global start_time
     if count == 0:
         start_time = time.time()
         return
     duration = time.time() - start_time
-    progress_size = int(count * block_size)
-    speed = int(progress_size / (1024 * duration))
     percent = int(count * block_size * 100 / total_size)
     print(" downloading %d%% \r" % (percent), end="")
 
 
 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
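
A note on the new scan in load_split_files: re.findall matches anywhere in the
name and returns the matched substring rather than the whole directory entry,
which behaves slightly differently from the old glob("vids*.json"). A minimal
standalone sketch (the filenames below are made up for illustration, not from
the changeset):

    import re

    for name in ["vids.json", "vids1.json", "vids_0001.json", "backup-vids2.json.old"]:
        print(name, "->", re.findall(r"vids.+?\.json", name))

    # vids.json             -> []                   (".+?" needs at least one char,
    #                                                so bare "vids.json" is skipped)
    # vids1.json            -> ['vids1.json']
    # vids_0001.json        -> ['vids_0001.json']
    # backup-vids2.json.old -> ['vids2.json']       (substring match: open() would
    #                                                then look for "vids2.json")

Because the match, not the directory entry, is what gets opened, a stricter
anchor such as re.fullmatch would avoid the substring case; whether that
matters depends on what the split files are actually named.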
@@ -119,11 +118,11 @@
 
 for i in load_split_files(args.database)["videos"]:
     uploader = i["uploader_id"] if "uploader_id" in i else None
     if uploader == channel:
         print("%s:" % i["id"])
-        if os.path.exists(uploader + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"):
+        if os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"):
             print(" video already downloaded!")
             continue
         # this code is *really* ugly... todo a rewrite?
         with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
             try:
@@ -131,22 +130,14 @@
                 continue
             except Exception:
                 print(" video is not available! attempting to find Internet Archive pages of it...")
                 if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available
                     fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])]
-                    disallowednames = ["__ia_thumb.jpg", "youtube-%s_archive.torrent" % i["id"], "youtube-%s_files.xml" % i["id"], "youtube-%s_meta.sqlite" % i["id"], "youtube-%s_meta.xml" % i["id"]] # list of IA-created files we don't need
                     flist = []
-                    for fname in fnames:
-                        if os.path.splitext(fname)[1] in [".mkv", ".webm"]:
-                            if fname[-4:] == ".mp4":
-                                continue
-                        else:
-                            if fname[-7:] == ".ia.mp4":
-                                continue
-                        if fname.find("/") == -1:
-                            if fname not in disallowednames and fname[-21:] != "%s_thumb.jpg" % i["id"] and fname[-15:] != "%s.ogv" % i["id"]:
-                                flist.append(fname)
+                    for fname in range(len(fnames)):
+                        if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]):
+                            flist.append(fnames[fname])
                     if len(flist) >= 1:
                         internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
                     else:
                         print(" video already downloaded!")
                         continue
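
For reference, the single re.search on the new side replaces the old
hard-coded disallowednames list: a file is kept only if its name contains the
video id, optionally preceded by a "<title>-" prefix, followed by one of the
expected extensions, so IA-generated artifacts never match and no longer need
to be enumerated. A minimal standalone sketch with a hypothetical video id
(not taken from the changeset):

    import re

    video_id = "a1b2c3d4e5f"  # hypothetical 11-character YouTube id
    pattern = r"((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % video_id

    names = [
        "Cool Video-a1b2c3d4e5f.mp4",        # kept: the video itself
        "Cool Video-a1b2c3d4e5f.info.json",  # kept: youtube-dl metadata
        "a1b2c3d4e5f.description",           # kept: bare id, no title prefix
        "__ia_thumb.jpg",                    # dropped: id absent from the name
        "youtube-a1b2c3d4e5f_files.xml",     # dropped: "_" after the id, not "."
    ]
    print([n for n in names if re.search(pattern, n)])

The id is interpolated into the pattern without re.escape; that happens to be
safe here because YouTube ids only use [A-Za-z0-9_-], none of which are regex
metacharacters outside a character class.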