comparison channeldownloader.py @ 69:63e6bc911606

Use regex instead of weirdness to filter archive.org names

author      Paper <37962225+mrpapersonic@users.noreply.github.com>
committer   GitHub <noreply@github.com>
date        Wed, 18 May 2022 23:24:03 -0400
parents     a43ed076b28f
children    eafe13de3f76
--- 68:a43ed076b28f
+++ 69:63e6bc911606
@@ -1,18 +1,18 @@
 #!/usr/bin/env python3
 #
 # download deleted vids from old yt channels
 # script by paper
+# it's pretty old and could definitely use some refining
 
 from __future__ import print_function
 import argparse
 import internetarchive
 try:
     import orjson as json
 except ImportError:
     import json
-import glob
 import os
 import re
 import time
 try:
     import urllib.request as compat_urllib
@@ -52,26 +52,25 @@
     print(" an error occurred downloading {0}!")
 
 def load_split_files(path):
     if os.path.isdir(path):
         result = {"videos": []}
-        for f in glob.glob(os.path.join(path, "vids*.json")):
-            with open(f, "r", encoding="utf-8") as infile:
-                for i in json.loads(infile.read())["videos"]:
-                    result["videos"].append(i)
+        for fi in os.listdir(path):
+            for f in re.findall(r"vids.+?\.json", fi):
+                with open(path + "/" + f, "r", encoding="utf-8") as infile:
+                    for i in json.loads(infile.read())["videos"]:
+                        result["videos"].append(i)
         return result
     else:
         return json.loads(open(path, "r", encoding="utf-8").read())
 
 def reporthook(count, block_size, total_size):
     global start_time
     if count == 0:
         start_time = time.time()
         return
     duration = time.time() - start_time
-    progress_size = int(count * block_size)
-    speed = int(progress_size / (1024 * duration))
     percent = int(count * block_size * 100 / total_size)
     print(" downloading %d%% \r" % (percent), end="")
 
 
 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
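
A note on the new scan in load_split_files: re.findall matches anywhere in the
name and returns the matched substring rather than the whole directory entry,
which behaves slightly differently from the old glob("vids*.json"). A minimal
standalone sketch (the filenames below are made up for illustration, not from
the changeset):

    import re

    for name in ["vids.json", "vids1.json", "vids_0001.json", "backup-vids2.json.old"]:
        print(name, "->", re.findall(r"vids.+?\.json", name))

    # vids.json             -> []                   (".+?" needs at least one char,
    #                                                so bare "vids.json" is skipped)
    # vids1.json            -> ['vids1.json']
    # vids_0001.json        -> ['vids_0001.json']
    # backup-vids2.json.old -> ['vids2.json']       (substring match: open() would
    #                                                then look for "vids2.json")

Because the match, not the directory entry, is what gets opened, a stricter
anchor such as re.fullmatch would avoid the substring case; whether that
matters depends on what the split files are actually named.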
@@ -119,11 +118,11 @@
 
 for i in load_split_files(args.database)["videos"]:
     uploader = i["uploader_id"] if "uploader_id" in i else None
     if uploader == channel:
         print("%s:" % i["id"])
-        if os.path.exists(uploader + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"):
+        if os.path.exists(output + "/" + sanitize_filename(i["title"], restricted=True) + "-" + i["id"] + ".info.json"):
             print(" video already downloaded!")
             continue
         # this code is *really* ugly... todo a rewrite?
         with youtube_dl.YoutubeDL(ytdl_opts) as ytdl:
             try:
@@ -131,22 +130,14 @@
                 continue
             except Exception:
                 print(" video is not available! attempting to find Internet Archive pages of it...")
                 if internetarchive.get_item("youtube-%s" % i["id"]).exists: # download from internetarchive if available
                     fnames = [f.name for f in internetarchive.get_files("youtube-%s" % i["id"])]
-                    disallowednames = ["__ia_thumb.jpg", "youtube-%s_archive.torrent" % i["id"], "youtube-%s_files.xml" % i["id"], "youtube-%s_meta.sqlite" % i["id"], "youtube-%s_meta.xml" % i["id"]] # list of IA-created files we don't need
                     flist = []
-                    for fname in fnames:
-                        if os.path.splitext(fname)[1] in [".mkv", ".webm"]:
-                            if fname[-4:] == ".mp4":
-                                continue
-                        else:
-                            if fname[-7:] == ".ia.mp4":
-                                continue
-                        if fname.find("/") == -1:
-                            if fname not in disallowednames and fname[-21:] != "%s_thumb.jpg" % i["id"] and fname[-15:] != "%s.ogv" % i["id"]:
-                                flist.append(fname)
+                    for fname in range(len(fnames)):
+                        if re.search("((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % (i["id"]), fnames[fname]):
+                            flist.append(fnames[fname])
                     if len(flist) >= 1:
                         internetarchive.download("youtube-%s" % i["id"], files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
                     else:
                         print(" video already downloaded!")
                         continue
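
For reference, the single re.search on the new side replaces the old
hard-coded disallowednames list: a file is kept only if its name contains the
video id, optionally preceded by a "<title>-" prefix, followed by one of the
expected extensions, so IA-generated artifacts never match and no longer need
to be enumerated. A minimal standalone sketch with a hypothetical video id
(not taken from the changeset):

    import re

    video_id = "a1b2c3d4e5f"  # hypothetical 11-character YouTube id
    pattern = r"((?:.+?-)?%s\.(?:mp4|jpg|webp|mkv|webm|info\.json|description))" % video_id

    names = [
        "Cool Video-a1b2c3d4e5f.mp4",        # kept: the video itself
        "Cool Video-a1b2c3d4e5f.info.json",  # kept: youtube-dl metadata
        "a1b2c3d4e5f.description",           # kept: bare id, no title prefix
        "__ia_thumb.jpg",                    # dropped: id absent from the name
        "youtube-a1b2c3d4e5f_files.xml",     # dropped: "_" after the id, not "."
    ]
    print([n for n in names if re.search(pattern, n)])

The id is interpolated into the pattern without re.escape; that happens to be
safe here because YouTube ids only use [A-Za-z0-9_-], none of which are regex
metacharacters outside a character class.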