Mercurial > codedump
changeset 69:63e6bc911606
Use regex instead of weirdness to filter archive.org names
committer: GitHub <noreply@github.com>
author: Paper <37962225+mrpapersonic@users.noreply.github.com>
date: Wed, 18 May 2022 23:24:03 -0400
parents: a43ed076b28f
children: eafe13de3f76
files: channeldownloader.py
diffstat: 1 files changed, 10 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/channeldownloader.py Wed May 18 20:05:47 2022 -0400 +++ b/channeldownloader.py Wed May 18 23:24:03 2022 -0400 @@ -2,6 +2,7 @@ # # download deleted vids from old yt channels # script by paper +# it's pretty old and could definitely use some refining from __future__ import print_function import argparse @@ -10,7 +11,6 @@ import orjson as json except ImportError: import json -import glob import os import re import time @@ -54,10 +54,11 @@ def load_split_files(path): if os.path.isdir(path): result = {"videos": []} - for f in glob.glob(os.path.join(path, "vids*.json")): - with open(f, "r", encoding="utf-8") as infile: - for i in json.loads(infile.read())["videos"]: - result["videos"].append(i) + for fi in os.listdir(path): + for f in re.findall(r"vids.+?\.json", fi): + with open(path + "/" + f, "r", encoding="utf-8") as infile: + for i in json.loads(infile.read())["videos"]: + result["videos"].append(i) return result else: return json.loads(open(path, "r", encoding="utf-8").read()) @@ -68,8 +69,6 @@ start_time = time.time() return duration = time.time() - start_time - progress_size = int(count * block_size) - speed = int(progress_size / (1024 * duration)) percent = int(count * block_size * 100 / total_size) print(" downloading %d%% \r" % (percent), end="") @@ -121,7 +120,7 @@ uploader = i["uploader_id"] if "uploader_id" in i else None if uploader == channel: print("%s:" % i["id"]) - if os.path.exists(uploader + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"): + if os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"): print(" video already downloaded!") continue # this code is *really* ugly... todo a rewrite? @@ -133,18 +132,10 @@ print(" video is not available! \nattempting to find Internet Archive pages of it...") if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])] - disallowednames = ["__ia_thumb.jpg", "youtube-%s_archive.torrent" % i["id"], "youtube-%s_files.xml" % i["id"], "youtube-%s_meta.sqlite" % i["id"], "youtube-%s_meta.xml" % i["id"]] # list of IA-created files we don't need flist = [] - for fname in fnames: - if os.path.splitext(fname)[1] in [".mkv", ".webm"]: - if fname[-4:] == ".mp4": - continue - else: - if fname[-7:] == ".ia.mp4": - continue - if fname.find("/") == -1: - if fname not in disallowednames and fname[-21:] != "%s_thumb.jpg" % i["id"] and fname[-15:] != "%s.ogv" % i["id"]: - flist.append(fname) + for fname in range(len(fnames)): + if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]): + flist.append(fnames[fname]) if len(flist) >= 1: internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True) else: