comparison channeldownloader.py @ 61:c615532e6572

Update channeldownloader.py: add option to use the split files instead of the full json

author     Paper <37962225+mrpapersonic@users.noreply.github.com>
committer  GitHub <noreply@github.com>
date       Sun, 02 Jan 2022 07:09:55 -0500
parents    4e7a9c7c0cce
children   9636d5dee08c
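
With this change, -d/--database accepts either the single full JSON database or a
directory holding the split database files: the new load_split_files() globs
vids*.json inside the directory and merges their "videos" lists into one dict.
A sketch of the two invocations (the channel URL and paths here are hypothetical
placeholders, not from the changeset):

    python channeldownloader.py -c https://www.youtube.com/user/example -d YTPMV_Database.json
    python channeldownloader.py -c https://www.youtube.com/user/example -d ./split_database/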
--- a/channeldownloader.py  (60:4e7a9c7c0cce)
+++ b/channeldownloader.py  (61:c615532e6572)
@@ -1,64 +1,26 @@
 import argparse
 import internetarchive # pip install internetarchive
 import json
+import glob
 import os
 import re
 import urllib.request
 import yt_dlp # pip install yt-dlp
 import itertools
 from urllib.error import HTTPError
+from yt_dlp.utils import sanitize_filename
 
 class MyLogger(object):
     def debug(self, msg):
         pass
 
     def warning(self, msg):
         pass
 
     def error(self, msg):
         pass
-
-ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
-                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
-                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
-
-def sanitize_filename(s, restricted=False, is_id=False):
-    # from youtube-dl utils
-    def replace_insane(char):
-        if restricted and char in ACCENT_CHARS:
-            return ACCENT_CHARS[char]
-        if char == '?' or ord(char) < 32 or ord(char) == 127:
-            return ''
-        elif char == '"':
-            return '' if restricted else '\''
-        elif char == ':':
-            return '_-' if restricted else ' -'
-        elif char in '\\/|*<>':
-            return '_'
-        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
-            return '_'
-        if restricted and ord(char) > 127:
-            return '_'
-        return char
-
-    # Handle timestamps
-    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
-    result = ''.join(map(replace_insane, s))
-    if not is_id:
-        while '__' in result:
-            result = result.replace('__', '_')
-        result = result.strip('_')
-        # Common case of "Foreign band name - English song title"
-        if restricted and result.startswith('-_'):
-            result = result[2:]
-        if result.startswith('-'):
-            result = '_' + result[len('-'):]
-        result = result.lstrip('.')
-        if not result:
-            result = '_'
-    return result
 
 def matroska_find(filelist):
     for myfile in filelist:
         if os.path.splitext(myfile)[1] == ".mkv" or os.path.splitext(myfile)[1] == ".webm":
             return True
69 print(" downloaded {0}: 100% ".format(os.path.basename(d["filename"]))) 31 print(" downloaded {0}: 100% ".format(os.path.basename(d["filename"])))
70 if d["status"] == "downloading": 32 if d["status"] == "downloading":
71 print(" downloading {0}: {1}\r".format(os.path.basename(d["filename"]), d["_percent_str"]), end="") 33 print(" downloading {0}: {1}\r".format(os.path.basename(d["filename"]), d["_percent_str"]), end="")
72 if d["status"] == "error": 34 if d["status"] == "error":
73 print(" an error occurred downloading {0}!") 35 print(" an error occurred downloading {0}!")
36
37 def load_split_files(path):
38 if os.path.isdir(path):
39 result = {"videos": []}
40 for f in glob.glob(os.path.join(path, "vids*.json")):
41 with open(f, "r", encoding="utf-8") as infile:
42 for i in json.loads(infile.read())["videos"]:
43 result["videos"].append(i)
44 return result
45 else:
46 return json.loads(open(path, "r", encoding="utf-8"))
74 47
75 48
76 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators") 49 parser = argparse.ArgumentParser(description="Downloads (deleted) videos from YTPMV creators")
77 parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True) 50 parser.add_argument("-c", "--channel", help="channel URL", metavar='<url>', required=True)
78 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar='<path>', required=True) 51 parser.add_argument("-d", "--database", help="json database (https://finnrepo.a2hosted.com/YTPMV_Database)", metavar='<path>', required=True)
114 "progress_hooks": [ytdl_hook], 87 "progress_hooks": [ytdl_hook],
115 "logger": MyLogger(), 88 "logger": MyLogger(),
116 "ignoreerrors": False, 89 "ignoreerrors": False,
117 } 90 }
118 91
119 with open(args.database, "r", encoding="utf-8") as f: 92 for i in load_split_files(args.database):
120 data = json.load(f) 93 try:
121 for i in data["videos"]: 94 uploader = i["uploader_id"]
122 try: 95 except Exception:
123 uploader = i["uploader_id"] 96 uploader = "unknown"
124 except Exception: 97 if uploader == channel:
125 uploader = "unknown" 98 print("{0}:".format(i["id"]))
126 finally: 99 isalreadydownloaded = 0
127 if uploader == channel: 100 for file in os.listdir(output):
128 print("{0}:".format(i["id"])) 101 if os.path.splitext(file)[1] == ".json":
129 isalreadydownloaded = 0 102 if file.find("-" + i["id"] + ".info.json") != -1:
130 for file in os.listdir(output): 103 isalreadydownloaded = 1
131 if os.path.splitext(file)[1] == ".json": 104 if isalreadydownloaded == 1: # not sure how to bypass this without having to go out of the for loop, if anyone could tell me how that would be great!
132 if file.find("-" + i["id"] + ".info.json") != -1: 105 print(" video already downloaded!")
133 isalreadydownloaded = 1 106 continue
134 if isalreadydownloaded == 1: # not sure how to bypass this without having to go out of the for loop, if anyone could tell me how that would be great! 107 with yt_dlp.YoutubeDL(ytdl_opts) as ytdl:
135 print(" video already downloaded!") 108 try:
136 continue 109 result = ytdl.download(["https://youtube.com/watch?v={0}".format(i["id"])]) # TODO: add check for existing downloaded items and don't download them
137 with yt_dlp.YoutubeDL(ytdl_opts) as ytdl: 110 continue
138 try: 111 except Exception:
139 result = ytdl.download(["https://youtube.com/watch?v={0}".format(i["id"])]) # TODO: add check for existing downloaded items and don't download them 112 print(" video is not available! attempting to find Internet Archive pages of it...")
113 if internetarchive.get_item("youtube-{0}".format(i["id"])).exists: # download from internetarchive if available
114 fnames = [f.name for f in internetarchive.get_files("youtube-{0}".format(i["id"]))]
115 disallowednames = ["__ia_thumb.jpg", "youtube-{0}_archive.torrent".format(i["id"]), "youtube-{0}_files.xml".format(i["id"]), "youtube-{0}_meta.sqlite".format(i["id"]), "youtube-{0}_meta.xml".format(i["id"])] # list of IA-created files we don't need
116 flist = []
117 for fname in fnames:
118 if matroska_find(fnames):
119 if fname[-4:] == ".mp4":
140 continue 120 continue
141 except Exception: 121 else:
142 print(" video is not available! attempting to find Internet Archive pages of it...") 122 if fname[-7:] == ".ia.mp4":
143 if internetarchive.get_item("youtube-{0}".format(i["id"])).exists: # download from internetarchive if available
144 fnames = [f.name for f in internetarchive.get_files("youtube-{0}".format(i["id"]))]
145 disallowednames = ["__ia_thumb.jpg", "youtube-{0}_archive.torrent".format(i["id"]), "youtube-{0}_files.xml".format(i["id"]), "youtube-{0}_meta.sqlite".format(i["id"]), "youtube-{0}_meta.xml".format(i["id"])] # list of IA-created files we don't need
146 flist = []
147 for fname in fnames:
148 if matroska_find(fnames):
149 if fname[-4:] == ".mp4":
150 continue
151 else:
152 if fname[-7:] == ".ia.mp4":
153 continue
154 if fname.find("/") == -1:
155 if fname not in disallowednames and fname[-21:] != "{0}_thumb.jpg".format(i["id"]) and fname[-15:] != "{0}.ogv".format(i["id"]):
156 flist.append(fname)
157 if len(flist) >= 1:
158 internetarchive.download("youtube-{0}".format(i["id"]), files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
159 else:
160 print(" video already downloaded!")
161 continue 123 continue
162 if os.path.exists(output + "\\" + i["id"] + ".info.json"): # will always exist no matter which setting was used to download 124 if fname.find("/") == -1:
163 for fname in flist: 125 if fname not in disallowednames and fname[-21:] != "{0}_thumb.jpg".format(i["id"]) and fname[-15:] != "{0}.ogv".format(i["id"]):
164 if os.path.exists(output + "\\" + fname) and not os.path.exists(output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname): 126 flist.append(fname)
165 os.rename(output + "\\" + fname, output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname) 127 if len(flist) >= 1:
166 else: 128 internetarchive.download("youtube-{0}".format(i["id"]), files=flist, verbose=True, destdir=output, no_directory=True, ignore_existing=True)
167 print("ID file not found!") 129 else:
168 else: # download the vid from waybackmachine (NOTE: only tested with youtube links after polymer, however SHOULD work with links created before then) 130 print(" video already downloaded!")
169 print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...") 131 continue
170 try: 132 if os.path.exists(output + "\\" + i["id"] + ".info.json"): # will always exist no matter which setting was used to download
171 contenttype = urllib.request.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"])).getheader("Content-Type") 133 for fname in flist:
172 if contenttype == "video/webm": 134 if os.path.exists(output + "\\" + fname) and not os.path.exists(output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname):
173 ext = "webm" 135 os.rename(output + "\\" + fname, output + "\\" + sanitize_filename(i["title"], restricted=True) + "-" + fname)
174 else: 136 else:
175 ext = "mp4" 137 print("ID file not found!")
176 urllib.request.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"]), "{3}\\{0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext, output)) 138 else: # download the vid from waybackmachine (NOTE: only tested with youtube links after polymer, however SHOULD work with links created before then)
177 print(" downloaded {0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext)) 139 print(" video does not have a Internet Archive page! attempting to download from the Wayback Machine...")
178 except HTTPError: 140 try:
179 print(" video not available on the Wayback Machine!") 141 contenttype = urllib.request.urlopen("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"])).getheader("Content-Type")
180 except Exception as e: 142 if contenttype == "video/webm":
181 print(" unknown error downloading video!") 143 ext = "webm"
182 print(e) 144 else:
183 # metadata 145 ext = "mp4"
184 with open("{2}\\{0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output), "w") as jsonfile: 146 urllib.request.urlretrieve("https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{0}".format(i["id"]), "{3}\\{0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext, output))
185 print(json.dumps(i), end="", file=jsonfile) 147 print(" downloaded {0}-{1}.{2}".format(sanitize_filename(i["title"], restricted=True), i["id"], ext))
186 print(" saved {0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output)) 148 except HTTPError:
149 print(" video not available on the Wayback Machine!")
150 except Exception as e:
151 print(" unknown error downloading video!")
152 print(e)
153 # metadata
154 with open("{2}\\{0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output), "w") as jsonfile:
155 print(json.dumps(i), end="", file=jsonfile)
156 print(" saved {0}-{1}.info.json".format(sanitize_filename(i["title"], restricted=True), i["id"], output))