// view dep/anitomy/anitomy/parser_helper.cpp @ 186:6ef31dbb90ca
//
// anime: no unnecessary conversion to floating point
// author Paper <mrpapersonic@gmail.com>
// date Wed, 06 Dec 2023 11:47:59 -0500
// parents 5c0397762b53
// children a0aa8c8c4307
// line wrap: on
// line source

/*
** Copyright (c) 2014-2017, Eren Okka
**
** This Source Code Form is subject to the terms of the Mozilla Public
** License, v. 2.0. If a copy of the MPL was not distributed with this
** file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

#include <algorithm>
#include <regex>

#include "keyword.h"
#include "parser.h"
#include "string.h"

namespace anitomy {

// Dash-like characters: the ASCII hyphen-minus followed by U+2010..U+2015
// (hyphen, non-breaking hyphen, figure dash, en dash, em dash, horizontal bar).
// Used by Parser::IsDashCharacter below.
const string_t kDashes = L"-\u2010\u2011\u2012\u2013\u2014\u2015";
// The same characters as kDashes preceded by a space; passed to TrimString in
// Parser::BuildElement as the set of characters to trim.
const string_t kDashesWithSpace = L" -\u2010\u2011\u2012\u2013\u2014\u2015";

size_t Parser::FindNumberInString(const string_t& str) {
	auto it = std::find_if(str.begin(), str.end(), IsNumericChar);
	return it == str.end() ? str.npos : (it - str.begin());
}

// Maps an ordinal word ("1st", "First", ... "9th", "Ninth") to its digit as a
// string. The lookup is an exact match against the table keys; any other
// input yields an empty string.
string_t Parser::GetNumberFromOrdinal(const string_t& word) {
	static const std::map<string_t, string_t> kOrdinalToDigit{
		{L"1st", L"1"}, {L"First", L"1"},
		{L"2nd", L"2"}, {L"Second", L"2"},
		{L"3rd", L"3"}, {L"Third", L"3"},
		{L"4th", L"4"}, {L"Fourth", L"4"},
		{L"5th", L"5"}, {L"Fifth", L"5"},
		{L"6th", L"6"}, {L"Sixth", L"6"},
		{L"7th", L"7"}, {L"Seventh", L"7"},
		{L"8th", L"8"}, {L"Eighth", L"8"},
		{L"9th", L"9"}, {L"Ninth", L"9"},
	};

	const auto match = kOrdinalToDigit.find(word);
	if (match == kOrdinalToDigit.end())
		return string_t();

	return match->second;
}

// A CRC32 checksum candidate is exactly eight characters long and passes
// IsHexadecimalString.
bool Parser::IsCrc32(const string_t& str) {
	if (str.size() != 8)
		return false;

	return IsHexadecimalString(str);
}

// Returns true when `str` consists of exactly one character and that
// character appears in kDashes.
bool Parser::IsDashCharacter(const string_t& str) {
	if (str.size() != 1)
		return false;

	return kDashes.find(str.front()) != kDashes.npos;
}

bool Parser::IsResolution(const string_t& str) {
	// Using a regex such as "\\d{3,4}(p|(x\\d{3,4}))$" would be more elegant,
	// but it's much slower (e.g. 2.4ms -> 24.9ms).

	const size_t min_width_size = 3;
	const size_t min_height_size = 3;

	// *###x###*
	if (str.size() >= min_width_size + 1 + min_height_size) {
		size_t pos = str.find_first_of(L"xX\u00D7"); // multiplication sign
		if (pos != str.npos && pos >= min_width_size && pos <= str.size() - (min_height_size + 1)) {
			for (size_t i = 0; i < str.size(); i++)
				if (i != pos && !IsNumericChar(str.at(i)))
					return false;
			return true;
		}

		// *###p
	} else if (str.size() >= min_height_size + 1) {
		if (str.back() == L'p' || str.back() == L'P') {
			for (size_t i = 0; i < str.size() - 1; i++)
				if (!IsNumericChar(str.at(i)))
					return false;
			return true;
		}
	}

	return false;
}

////////////////////////////////////////////////////////////////////////////////

bool Parser::CheckAnimeSeasonKeyword(const token_iterator_t token) {
	auto set_anime_season = [&](token_iterator_t first, token_iterator_t second, const string_t& content) {
		elements_.insert(kElementAnimeSeason, content);
		first->category = kIdentifier;
		second->category = kIdentifier;
	};

	auto previous_token = FindPreviousToken(tokens_, token, kFlagNotDelimiter);
	if (previous_token != tokens_.end()) {
		auto number = GetNumberFromOrdinal(previous_token->content);
		if (!number.empty()) {
			set_anime_season(previous_token, token, number);
			return true;
		}
	}

	auto next_token = FindNextToken(tokens_, token, kFlagNotDelimiter);
	if (next_token != tokens_.end() && IsNumericString(next_token->content)) {
		set_anime_season(token, next_token, next_token->content);
		return true;
	}

	return false;
}

// Handles an episode/volume keyword at `token` by examining the token
// returned by FindNextToken (kFlagNotDelimiter).
//
// That token must have category kUnknown and its content must start with a
// numeric character (FindNumberInString == 0). For kElementEpisodeNumber and
// kElementVolumeNumber the matching pattern matcher is tried first; if it
// fails, the whole content is stored through SetEpisodeNumber /
// SetVolumeNumber. `token` is then marked kIdentifier and true is returned.
// Any other `category`, or a following token that does not qualify, yields
// false.
bool Parser::CheckExtentKeyword(ElementCategory category, const token_iterator_t token) {
	auto following = FindNextToken(tokens_, token, kFlagNotDelimiter);

	if (!CheckTokenCategory(following, kUnknown))
		return false;

	// The number has to be the very first character of the following token.
	if (FindNumberInString(following->content) != 0)
		return false;

	if (category == kElementEpisodeNumber) {
		if (!MatchEpisodePatterns(following->content, *following))
			SetEpisodeNumber(following->content, *following, false);
	} else if (category == kElementVolumeNumber) {
		if (!MatchVolumePatterns(following->content, *following))
			SetVolumeNumber(following->content, *following, false);
	} else {
		return false;
	}

	token->category = kIdentifier;
	return true;
}

////////////////////////////////////////////////////////////////////////////////

// Returns true for the element categories enumerated below and false for
// every other category.
bool Parser::IsElementCategorySearchable(ElementCategory category) {
	switch (category) {
		case kElementAnimeSeasonPrefix:
		case kElementAnimeType:
		case kElementAudioTerm:
		case kElementDeviceCompatibility:
		case kElementEpisodePrefix:
		case kElementFileChecksum:
		case kElementLanguage:
		case kElementOther:
		case kElementReleaseGroup:
		case kElementReleaseInformation:
		case kElementReleaseVersion:
		case kElementSource:
		case kElementSubtitles:
		case kElementVideoResolution:
		case kElementVideoTerm:
		case kElementVolumePrefix:
			return true;
		default:
			return false;
	}
}

// Returns false for the element categories enumerated below and true for
// every other category.
bool Parser::IsElementCategorySingular(ElementCategory category) {
	switch (category) {
		case kElementAnimeSeason:
		case kElementAnimeType:
		case kElementAudioTerm:
		case kElementDeviceCompatibility:
		case kElementEpisodeNumber:
		case kElementLanguage:
		case kElementOther:
		case kElementReleaseInformation:
		case kElementSource:
		case kElementVideoTerm:
			return false;
		default:
			return true;
	}
}

////////////////////////////////////////////////////////////////////////////////

// Concatenates the tokens in [token_begin, token_end) into a single string
// and, if the result is not empty, inserts it into elements_ under
// `category`.
//
// Per token category:
//   - kUnknown:   content is appended and the token is marked kIdentifier;
//   - kBracket:   content is appended;
//   - kDelimiter: with `keep_delimiters` the first character of the content
//                 is appended as is; otherwise a delimiter at `token_begin`
//                 is dropped, ',' and '&' are kept, and any other delimiter
//                 becomes a space;
//   - any other category is skipped.
// When `keep_delimiters` is false the result is passed through TrimString
// with kDashesWithSpace before insertion.
void Parser::BuildElement(ElementCategory category, bool keep_delimiters, const token_iterator_t token_begin,
						  const token_iterator_t token_end) const {
	string_t text;

	for (auto it = token_begin; it != token_end; ++it) {
		const auto kind = it->category;

		if (kind == kUnknown) {
			text += it->content;
			it->category = kIdentifier;
		} else if (kind == kBracket) {
			text += it->content;
		} else if (kind == kDelimiter) {
			const auto symbol = it->content.front();
			if (keep_delimiters) {
				text.push_back(symbol);
				continue;
			}

			// NOTE(review): `it != token_end` is always true inside this loop, so
			// only a delimiter at token_begin is actually skipped here. Presumably
			// the intent was to skip a trailing delimiter as well - confirm.
			const bool not_at_edge = it != token_begin && it != token_end;
			if (not_at_edge) {
				const bool preserved = symbol == L',' || symbol == L'&';
				text.push_back(preserved ? symbol : L' ');
			}
		}
	}

	if (!keep_delimiters)
		TrimString(text, kDashesWithSpace.c_str());

	if (text.empty())
		return;

	elements_.insert(category, text);
}

////////////////////////////////////////////////////////////////////////////////

// Returns true when `token` is not tokens_.end() and its category equals
// `category`.
bool Parser::CheckTokenCategory(const token_iterator_t token, TokenCategory category) const {
	if (token == tokens_.end())
		return false;

	return token->category == category;
}

bool Parser::IsTokenIsolated(const token_iterator_t token) const {
	auto previous_token = FindPreviousToken(tokens_, token, kFlagNotDelimiter);
	if (!CheckTokenCategory(previous_token, kBracket))
		return false;

	auto next_token = FindNextToken(tokens_, token, kFlagNotDelimiter);
	if (!CheckTokenCategory(next_token, kBracket))
		return false;

	return true;
}

} // namespace anitomy