view dep/anitomy/anitomy/parser.cpp @ 65:26721c28bf22

*: avoid usage of (to|from)StdString. In Qt5 (and probably Qt6 as well), these functions are only available (or even usable) if Qt and Minori were built with the *same standard headers*, which may not be the case in some circumstances. Hence, we'll use our own conversion functions, which we probably should use anyway.
author Paper <mrpapersonic@gmail.com>
date Sun, 01 Oct 2023 23:26:35 -0400
parents 5c0397762b53
children a0aa8c8c4307
line wrap: on
line source

/*
** Copyright (c) 2014-2017, Eren Okka
**
** This Source Code Form is subject to the terms of the Mozilla Public
** License, v. 2.0. If a copy of the MPL was not distributed with this
** file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

#include <algorithm>

#include "keyword.h"
#include "parser.h"
#include "string.h"

namespace anitomy {

// Sets up a parser that operates on the given element container, parsing
// options and token list.
Parser::Parser(Elements& elements, const Options& options, token_container_t& tokens)
	: elements_(elements),
	  options_(options),
	  tokens_(tokens) {
}

// Runs every search pass in a fixed order and then validates the collected
// elements. The episode number, release group and episode title passes are
// each controlled by the corresponding option.
//
// Returns true if an anime title element is present once parsing is done.
bool Parser::Parse() {
	// Passes that always run first
	SearchForKeywords();
	SearchForIsolatedNumbers();

	if (options_.parse_episode_number) {
		SearchForEpisodeNumber();
	}

	SearchForAnimeTitle();

	// Only look for a release group if none has been stored so far
	const bool release_group_wanted = options_.parse_release_group && elements_.empty(kElementReleaseGroup);
	if (release_group_wanted) {
		SearchForReleaseGroup();
	}

	// An episode title is only searched for when an episode number exists
	const bool episode_title_wanted = options_.parse_episode_title && !elements_.empty(kElementEpisodeNumber);
	if (episode_title_wanted) {
		SearchForEpisodeTitle();
	}

	ValidateElements();

	const bool has_anime_title = !elements_.empty(kElementAnimeTitle);
	return has_anime_title;
}

////////////////////////////////////////////////////////////////////////////////

void Parser::SearchForKeywords() {
	for (auto it = tokens_.begin(); it != tokens_.end(); ++it) {
		auto& token = *it;

		if (token.category != kUnknown)
			continue;

		auto word = token.content;
		TrimString(word, L" -");

		if (word.empty())
			continue;
		// Don't bother if the word is a number that cannot be CRC
		if (word.size() != 8 && IsNumericString(word))
			continue;

		// Performs better than making a case-insensitive Find
		auto keyword = keyword_manager.Normalize(word);
		ElementCategory category = kElementUnknown;
		KeywordOptions options;

		if (keyword_manager.Find(keyword, category, options)) {
			if (!options_.parse_release_group && category == kElementReleaseGroup)
				continue;
			if (!IsElementCategorySearchable(category) || !options.searchable)
				continue;
			if (IsElementCategorySingular(category) && !elements_.empty(category))
				continue;
			if (category == kElementAnimeSeasonPrefix) {
				CheckAnimeSeasonKeyword(it);
				continue;
			} else if (category == kElementEpisodePrefix) {
				if (options.valid)
					CheckExtentKeyword(kElementEpisodeNumber, it);
				continue;
			} else if (category == kElementReleaseVersion) {
				word = word.substr(1); // number without "v"
			} else if (category == kElementVolumePrefix) {
				CheckExtentKeyword(kElementVolumeNumber, it);
				continue;
			}
		} else {
			if (elements_.empty(kElementFileChecksum) && IsCrc32(word)) {
				category = kElementFileChecksum;
			} else if (elements_.empty(kElementVideoResolution) && IsResolution(word)) {
				category = kElementVideoResolution;
			}
		}

		if (category != kElementUnknown) {
			elements_.insert(category, word);
			if (options.identifiable)
				token.category = kIdentifier;
		}
	}
}

////////////////////////////////////////////////////////////////////////////////

void Parser::SearchForEpisodeNumber() {
	// List all unknown tokens that contain a number
	std::vector<size_t> tokens;
	for (size_t i = 0; i < tokens_.size(); ++i) {
		auto& token = tokens_.at(i);
		if (token.category == kUnknown)
			if (FindNumberInString(token.content) != token.content.npos)
				tokens.push_back(i);
	}
	if (tokens.empty())
		return;

	found_episode_keywords_ = !elements_.empty(kElementEpisodeNumber);

	// If a token matches a known episode pattern, it has to be the episode number
	if (SearchForEpisodePatterns(tokens))
		return;

	if (!elements_.empty(kElementEpisodeNumber))
		return; // We have previously found an episode number via keywords

	// From now on, we're only interested in numeric tokens
	auto not_numeric_string = [&](size_t index) -> bool { return !IsNumericString(tokens_.at(index).content); };
	tokens.erase(std::remove_if(tokens.begin(), tokens.end(), not_numeric_string), tokens.end());

	if (tokens.empty())
		return;

	// e.g. "01 (176)", "29 (04)"
	if (SearchForEquivalentNumbers(tokens))
		return;

	// e.g. " - 08"
	if (SearchForSeparatedNumbers(tokens))
		return;

	// e.g. "[12]", "(2006)"
	if (SearchForIsolatedNumbers(tokens))
		return;

	// Consider using the last number as a last resort
	SearchForLastNumber(tokens);
}

////////////////////////////////////////////////////////////////////////////////

// Determines the token range that makes up the anime title and builds the
// kElementAnimeTitle element from it. Does nothing if no suitable starting
// token can be found.
void Parser::SearchForAnimeTitle() {
	// Preferred start: the first unknown token that is not enclosed
	auto title_begin = FindToken(tokens_.begin(), tokens_.end(), kFlagNotEnclosed | kFlagUnknown);

	// Without such a token the title has to be inside an enclosed group
	const bool enclosed_title = (title_begin == tokens_.end());

	if (enclosed_title) {
		// Use the first unknown token of the second enclosed group, on the
		// assumption that the first group is the release group
		bool skipped_previous_group = false;
		title_begin = tokens_.begin();
		do {
			title_begin = FindToken(title_begin, tokens_.end(), kFlagUnknown);
			if (title_begin == tokens_.end())
				break;
			// Groups composed of non-Latin characters are ignored; a mostly
			// Latin one is accepted once a previous group has been skipped
			if (IsMostlyLatinString(title_begin->content) && skipped_previous_group)
				break;
			// Advance to the first unknown token of the next group
			title_begin = FindToken(title_begin, tokens_.end(), kFlagBracket);
			title_begin = FindToken(title_begin, tokens_.end(), kFlagUnknown);
			skipped_previous_group = true;
		} while (title_begin != tokens_.end());
	}

	if (title_begin == tokens_.end())
		return;

	// The title runs until an identifier, or also until a bracket when the
	// title is enclosed
	const auto stop_flags = kFlagIdentifier | (enclosed_title ? kFlagBracket : kFlagNone);
	auto title_end = FindToken(title_begin, tokens_.end(), stop_flags);

	if (!enclosed_title) {
		// If the interval contains an open bracket without its matching pair,
		// pull the upper endpoint back to that bracket
		bool unmatched_bracket = false;
		auto bracket_position = title_end;
		for (auto scan = title_begin; scan != title_end; ++scan) {
			if (scan->category != kBracket)
				continue;
			bracket_position = scan;
			unmatched_bracket = !unmatched_bracket;
		}
		if (unmatched_bracket)
			title_end = bracket_position;

		// If the interval ends with an enclosed group (e.g. "Anime Title
		// [Fansub]"), pull the upper endpoint back to the beginning of the
		// group. Parentheses are ignored so that certain groups (e.g. "(TV)")
		// stay intact.
		auto tail = FindPreviousToken(tokens_, title_end, kFlagNotDelimiter);
		while (CheckTokenCategory(tail, kBracket) && tail->content.front() != ')') {
			tail = FindPreviousToken(tokens_, tail, kFlagBracket);
			if (tail == tokens_.end())
				continue; // re-evaluates the loop condition with the end iterator
			title_end = tail;
			tail = FindPreviousToken(tokens_, title_end, kFlagNotDelimiter);
		}
	}

	BuildElement(kElementAnimeTitle, false, title_begin, title_end);
}

void Parser::SearchForReleaseGroup() {
	auto token_begin = tokens_.begin();
	auto token_end = tokens_.begin();

	do {
		// Find the first enclosed unknown token
		token_begin = FindToken(token_end, tokens_.end(), kFlagEnclosed | kFlagUnknown);
		if (token_begin == tokens_.end())
			return;

		// Continue until a bracket or identifier is found
		token_end = FindToken(token_begin, tokens_.end(), kFlagBracket | kFlagIdentifier);
		if (token_end == tokens_.end() || token_end->category != kBracket)
			continue;

		// Ignore if it's not the first non-delimiter token in group
		auto previous_token = FindPreviousToken(tokens_, token_begin, kFlagNotDelimiter);
		if (previous_token != tokens_.end() && previous_token->category != kBracket) {
			continue;
		}

		// Build release group
		BuildElement(kElementReleaseGroup, true, token_begin, token_end);
		return;
	} while (token_begin != tokens_.end());
}

// Builds the episode title from the first run of non-enclosed unknown tokens
// that is more than just a dash.
void Parser::SearchForEpisodeTitle() {
	auto title_begin = tokens_.begin();
	auto title_end = tokens_.begin();

	for (;;) {
		// Next non-enclosed unknown token, starting where the last attempt
		// ended
		title_begin = FindToken(title_end, tokens_.end(), kFlagNotEnclosed | kFlagUnknown);
		if (title_begin == tokens_.end())
			return; // no candidates left

		// The candidate extends up to the next bracket or identifier
		title_end = FindToken(title_begin, tokens_.end(), kFlagBracket | kFlagIdentifier);

		// A short run that begins with a dash is not a title
		const bool only_a_dash =
			std::distance(title_begin, title_end) <= 2 && IsDashCharacter(title_begin->content);
		if (!only_a_dash) {
			BuildElement(kElementEpisodeTitle, false, title_begin, title_end);
			return;
		}
	}
}

////////////////////////////////////////////////////////////////////////////////

// Looks at every unknown, purely numeric, isolated token and stores it as the
// anime year or the video resolution when its value fits and that element has
// not been set yet. Tokens used this way are marked as identifiers.
void Parser::SearchForIsolatedNumbers() {
	for (auto token = tokens_.begin(); token != tokens_.end(); ++token) {
		const bool is_candidate =
			token->category == kUnknown && IsNumericString(token->content) && IsTokenIsolated(token);
		if (!is_candidate)
			continue;

		const auto value = StringToInt(token->content);

		// Anime year
		const bool in_year_range = value >= kAnimeYearMin && value <= kAnimeYearMax;
		if (in_year_range && elements_.empty(kElementAnimeYear)) {
			elements_.insert(kElementAnimeYear, token->content);
			token->category = kIdentifier;
			continue;
		}

		// Video resolution: when isolated, these numbers are more likely to
		// be the video resolution than the episode number. Some fansub groups
		// use them without the "p" suffix.
		const bool common_resolution = value == 480 || value == 720 || value == 1080;
		if (common_resolution && elements_.empty(kElementVideoResolution)) {
			elements_.insert(kElementVideoResolution, token->content);
			token->category = kIdentifier;
		}
	}
}

////////////////////////////////////////////////////////////////////////////////

// Resolves conflicts between anime type elements and the episode title.
//
// For every anime type element whose value occurs inside the episode title:
//   - if both have the same length, the episode title is considered invalid
//     and is removed;
//   - otherwise the anime type element is removed, provided its normalized
//     value is a registered anime type keyword.
//
// Does nothing unless both an anime type and an episode title are present.
void Parser::ValidateElements() {
	if (elements_.empty(kElementAnimeType) || elements_.empty(kElementEpisodeTitle))
		return;

	// Work on a copy so that the comparisons below do not depend on the
	// element container's current contents
	const auto episode_title = elements_.get(kElementEpisodeTitle);

	// Erasing the episode title by category while iterating could shift or
	// invalidate `it` (depending on how the element container stores its
	// entries), so the removal is only recorded here and carried out after
	// the loop.
	bool episode_title_invalid = false;

	for (auto it = elements_.begin(); it != elements_.end();) {
		if (it->first == kElementAnimeType && IsInString(episode_title, it->second)) {
			if (episode_title.size() == it->second.size()) {
				episode_title_invalid = true; // title is nothing but the anime type
			} else {
				const auto keyword = keyword_manager.Normalize(it->second);
				if (keyword_manager.Find(kElementAnimeType, keyword)) {
					it = elements_.erase(it); // invalid anime type
					continue;
				}
			}
		}
		++it;
	}

	if (episode_title_invalid)
		elements_.erase(kElementEpisodeTitle); // invalid episode title
}

} // namespace anitomy