Mercurial > minori
view dep/anitomy/anitomy/parser_number.cpp @ 155:d2bbb5773616
dep/animia: add quartz backend for windows
author | Paper <mrpapersonic@gmail.com> |
---|---|
date | Wed, 15 Nov 2023 15:24:39 -0500 |
parents | 5c0397762b53 |
children | a0aa8c8c4307 |
line wrap: on
line source
/* ** Copyright (c) 2014-2017, Eren Okka ** ** This Source Code Form is subject to the terms of the Mozilla Public ** License, v. 2.0. If a copy of the MPL was not distributed with this ** file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #include <algorithm> #include <regex> #include "element.h" #include "keyword.h" #include "parser.h" #include "string.h" namespace anitomy { bool Parser::IsValidEpisodeNumber(const string_t& number) { return StringToInt(number) <= kEpisodeNumberMax; } bool Parser::SetEpisodeNumber(const string_t& number, Token& token, bool validate) { if (validate && !IsValidEpisodeNumber(number)) return false; token.category = kIdentifier; auto category = kElementEpisodeNumber; // Handle equivalent numbers if (found_episode_keywords_) { for (auto& element : elements_) { if (element.first != kElementEpisodeNumber) continue; // The larger number gets to be the alternative one const int comparison = StringToInt(number) - StringToInt(element.second); if (comparison > 0) { category = kElementEpisodeNumberAlt; } else if (comparison < 0) { element.first = kElementEpisodeNumberAlt; } else { return false; // No need to add the same number twice } break; } } elements_.insert(category, number); return true; } bool Parser::SetAlternativeEpisodeNumber(const string_t& number, Token& token) { elements_.insert(kElementEpisodeNumberAlt, number); token.category = kIdentifier; return true; } //////////////////////////////////////////////////////////////////////////////// bool Parser::IsValidVolumeNumber(const string_t& number) { return StringToInt(number) <= kVolumeNumberMax; } bool Parser::SetVolumeNumber(const string_t& number, Token& token, bool validate) { if (validate) if (!IsValidVolumeNumber(number)) return false; elements_.insert(kElementVolumeNumber, number); token.category = kIdentifier; return true; } //////////////////////////////////////////////////////////////////////////////// bool Parser::NumberComesAfterPrefix(ElementCategory category, Token& token) { size_t number_begin = FindNumberInString(token.content); auto prefix = keyword_manager.Normalize(token.content.substr(0, number_begin)); if (keyword_manager.Find(category, prefix)) { auto number = token.content.substr(number_begin, token.content.length() - number_begin); switch (category) { case kElementEpisodePrefix: if (!MatchEpisodePatterns(number, token)) SetEpisodeNumber(number, token, false); return true; case kElementVolumePrefix: if (!MatchVolumePatterns(number, token)) SetVolumeNumber(number, token, false); return true; default: break; } } return false; } bool Parser::NumberComesBeforeAnotherNumber(const token_iterator_t token) { auto separator_token = FindNextToken(tokens_, token, kFlagNotDelimiter); if (separator_token != tokens_.end()) { static const std::vector<std::pair<string_t, bool>> separators{ {L"&", true }, {L"of", false}, }; for (const auto& separator : separators) { if (IsStringEqualTo(separator_token->content, separator.first)) { auto other_token = FindNextToken(tokens_, separator_token, kFlagNotDelimiter); if (other_token != tokens_.end() && IsNumericString(other_token->content)) { SetEpisodeNumber(token->content, *token, false); if (separator.second) SetEpisodeNumber(other_token->content, *other_token, false); separator_token->category = kIdentifier; other_token->category = kIdentifier; return true; } } } } return false; } bool Parser::SearchForEpisodePatterns(std::vector<size_t>& tokens) { for (const auto& token_index : tokens) { auto token = tokens_.begin() + token_index; bool numeric_front = IsNumericChar(token->content.front()); if (!numeric_front) { // e.g. "EP.1", "Vol.1" if (NumberComesAfterPrefix(kElementEpisodePrefix, *token)) return true; if (NumberComesAfterPrefix(kElementVolumePrefix, *token)) continue; } else { // e.g. "8 & 10", "01 of 24" if (NumberComesBeforeAnotherNumber(token)) return true; } // Look for other patterns if (MatchEpisodePatterns(token->content, *token)) return true; } return false; } //////////////////////////////////////////////////////////////////////////////// using regex_t = std::basic_regex<char_t>; using regex_match_results_t = std::match_results<string_t::const_iterator>; bool Parser::MatchSingleEpisodePattern(const string_t& word, Token& token) { static const regex_t pattern(L"(\\d{1,4})[vV](\\d)"); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { SetEpisodeNumber(match_results[1].str(), token, false); elements_.insert(kElementReleaseVersion, match_results[2].str()); return true; } return false; } bool Parser::MatchMultiEpisodePattern(const string_t& word, Token& token) { static const regex_t pattern(L"(\\d{1,4})(?:[vV](\\d))?[-~&+]" L"(\\d{1,4})(?:[vV](\\d))?"); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { auto lower_bound = match_results[1].str(); auto upper_bound = match_results[3].str(); // Avoid matching expressions such as "009-1" or "5-2" if (StringToInt(lower_bound) < StringToInt(upper_bound)) { if (SetEpisodeNumber(lower_bound, token, true)) { SetEpisodeNumber(upper_bound, token, false); if (match_results[2].matched) elements_.insert(kElementReleaseVersion, match_results[2].str()); if (match_results[4].matched) elements_.insert(kElementReleaseVersion, match_results[4].str()); return true; } } } return false; } bool Parser::MatchSeasonAndEpisodePattern(const string_t& word, Token& token) { static const regex_t pattern(L"S?" L"(\\d{1,2})(?:-S?(\\d{1,2}))?" L"(?:x|[ ._-x]?E)" L"(\\d{1,4})(?:-E?(\\d{1,4}))?" L"(?:[vV](\\d))?", std::regex_constants::icase); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { if (StringToInt(match_results[1]) == 0) return false; elements_.insert(kElementAnimeSeason, match_results[1]); if (match_results[2].matched) elements_.insert(kElementAnimeSeason, match_results[2]); SetEpisodeNumber(match_results[3], token, false); if (match_results[4].matched) SetEpisodeNumber(match_results[4], token, false); return true; } return false; } bool Parser::MatchTypeAndEpisodePattern(const string_t& word, Token& token) { size_t number_begin = FindNumberInString(word); auto prefix = word.substr(0, number_begin); ElementCategory category = kElementAnimeType; KeywordOptions options; if (keyword_manager.Find(keyword_manager.Normalize(prefix), category, options)) { elements_.insert(kElementAnimeType, prefix); auto number = word.substr(number_begin); if (MatchEpisodePatterns(number, token) || SetEpisodeNumber(number, token, true)) { auto it = std::find(tokens_.begin(), tokens_.end(), token); if (it != tokens_.end()) { // Split token (we do this last in order to avoid invalidating our // token reference earlier) token.content = number; tokens_.insert(it, Token(options.identifiable ? kIdentifier : kUnknown, prefix, token.enclosed)); } return true; } } return false; } bool Parser::MatchFractionalEpisodePattern(const string_t& word, Token& token) { // We don't allow any fractional part other than ".5", because there are cases // where such a number is a part of the anime title (e.g. "Evangelion: 1.11", // "Tokyo Magnitude 8.0") or a keyword (e.g. "5.1"). static const regex_t pattern(L"\\d+\\.5"); if (std::regex_match(word, pattern)) if (SetEpisodeNumber(word, token, true)) return true; return false; } bool Parser::MatchPartialEpisodePattern(const string_t& word, Token& token) { auto it = std::find_if_not(word.begin(), word.end(), IsNumericChar); auto suffix_length = std::distance(it, word.end()); auto is_valid_suffix = [](const char_t c) { return (c >= L'A' && c <= L'C') || (c >= L'a' && c <= L'c'); }; if (suffix_length == 1 && is_valid_suffix(*it)) if (SetEpisodeNumber(word, token, true)) return true; return false; } bool Parser::MatchNumberSignPattern(const string_t& word, Token& token) { if (word.front() != L'#') return false; static const regex_t pattern(L"#(\\d{1,4})(?:[-~&+](\\d{1,4}))?(?:[vV](\\d))?"); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { if (SetEpisodeNumber(match_results[1].str(), token, true)) { if (match_results[2].matched) SetEpisodeNumber(match_results[2].str(), token, false); if (match_results[3].matched) elements_.insert(kElementReleaseVersion, match_results[3].str()); return true; } } return false; } bool Parser::MatchJapaneseCounterPattern(const string_t& word, Token& token) { if (word.back() != L'\u8A71') return false; static const regex_t pattern(L"(\\d{1,4})\u8A71"); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { SetEpisodeNumber(match_results[1].str(), token, false); return true; } return false; } bool Parser::MatchEpisodePatterns(string_t word, Token& token) { // All patterns contain at least one non-numeric character if (IsNumericString(word)) return false; TrimString(word, L" -"); const bool numeric_front = IsNumericChar(word.front()); const bool numeric_back = IsNumericChar(word.back()); // e.g. "01v2" if (numeric_front && numeric_back) if (MatchSingleEpisodePattern(word, token)) return true; // e.g. "01-02", "03-05v2" if (numeric_front && numeric_back) if (MatchMultiEpisodePattern(word, token)) return true; // e.g. "2x01", "S01E03", "S01-02xE001-150" if (numeric_back) if (MatchSeasonAndEpisodePattern(word, token)) return true; // e.g. "ED1", "OP4a", "OVA2" if (!numeric_front) if (MatchTypeAndEpisodePattern(word, token)) return true; // e.g. "07.5" if (numeric_front && numeric_back) if (MatchFractionalEpisodePattern(word, token)) return true; // e.g. "4a", "111C" if (numeric_front && !numeric_back) if (MatchPartialEpisodePattern(word, token)) return true; // e.g. "#01", "#02-03v2" if (numeric_back) if (MatchNumberSignPattern(word, token)) return true; // U+8A71 is used as counter for stories, episodes of TV series, etc. if (numeric_front) if (MatchJapaneseCounterPattern(word, token)) return true; return false; } //////////////////////////////////////////////////////////////////////////////// bool Parser::MatchSingleVolumePattern(const string_t& word, Token& token) { static const regex_t pattern(L"(\\d{1,2})[vV](\\d)"); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { SetVolumeNumber(match_results[1].str(), token, false); elements_.insert(kElementReleaseVersion, match_results[2].str()); return true; } return false; } bool Parser::MatchMultiVolumePattern(const string_t& word, Token& token) { static const regex_t pattern(L"(\\d{1,2})[-~&+](\\d{1,2})(?:[vV](\\d))?"); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { auto lower_bound = match_results[1].str(); auto upper_bound = match_results[2].str(); if (StringToInt(lower_bound) < StringToInt(upper_bound)) { if (SetVolumeNumber(lower_bound, token, true)) { SetVolumeNumber(upper_bound, token, false); if (match_results[3].matched) elements_.insert(kElementReleaseVersion, match_results[3].str()); return true; } } } return false; } bool Parser::MatchVolumePatterns(string_t word, Token& token) { // All patterns contain at least one non-numeric character if (IsNumericString(word)) return false; TrimString(word, L" -"); const bool numeric_front = IsNumericChar(word.front()); const bool numeric_back = IsNumericChar(word.back()); // e.g. "01v2" if (numeric_front && numeric_back) if (MatchSingleVolumePattern(word, token)) return true; // e.g. "01-02", "03-05v2" if (numeric_front && numeric_back) if (MatchMultiVolumePattern(word, token)) return true; return false; } //////////////////////////////////////////////////////////////////////////////// bool Parser::SearchForEquivalentNumbers(std::vector<size_t>& tokens) { for (auto token_index = tokens.begin(); token_index != tokens.end(); ++token_index) { auto token = tokens_.begin() + *token_index; if (IsTokenIsolated(token) || !IsValidEpisodeNumber(token->content)) continue; // Find the first enclosed, non-delimiter token auto next_token = FindNextToken(tokens_, token, kFlagNotDelimiter); if (!CheckTokenCategory(next_token, kBracket)) continue; next_token = FindNextToken(tokens_, next_token, kFlagEnclosed | kFlagNotDelimiter); if (!CheckTokenCategory(next_token, kUnknown)) continue; // Check if it's an isolated number if (!IsTokenIsolated(next_token) || !IsNumericString(next_token->content) || !IsValidEpisodeNumber(next_token->content)) continue; auto minmax = std::minmax(token, next_token, [](const token_iterator_t& a, const token_iterator_t& b) { return StringToInt(a->content) < StringToInt(b->content); }); SetEpisodeNumber(minmax.first->content, *minmax.first, false); SetAlternativeEpisodeNumber(minmax.second->content, *minmax.second); return true; } return false; } bool Parser::SearchForIsolatedNumbers(std::vector<size_t>& tokens) { for (auto token_index = tokens.begin(); token_index != tokens.end(); ++token_index) { auto token = tokens_.begin() + *token_index; if (!token->enclosed || !IsTokenIsolated(token)) continue; if (SetEpisodeNumber(token->content, *token, true)) return true; } return false; } bool Parser::SearchForSeparatedNumbers(std::vector<size_t>& tokens) { for (auto token_index = tokens.begin(); token_index != tokens.end(); ++token_index) { auto token = tokens_.begin() + *token_index; auto previous_token = FindPreviousToken(tokens_, token, kFlagNotDelimiter); // See if the number has a preceding "-" separator if (CheckTokenCategory(previous_token, kUnknown) && IsDashCharacter(previous_token->content)) { if (SetEpisodeNumber(token->content, *token, true)) { previous_token->category = kIdentifier; return true; } } } return false; } bool Parser::SearchForLastNumber(std::vector<size_t>& tokens) { for (auto it = tokens.rbegin(); it != tokens.rend(); ++it) { size_t token_index = *it; auto token = tokens_.begin() + token_index; // Assuming that episode number always comes after the title, first token // cannot be what we're looking for if (token_index == 0) continue; // An enclosed token is unlikely to be the episode number at this point if (token->enclosed) continue; // Ignore if it's the first non-enclosed, non-delimiter token if (std::all_of(tokens_.begin(), token, [](const Token& token) { return token.enclosed || token.category == kDelimiter; })) continue; // Ignore if the previous token is "Movie" or "Part" auto previous_token = FindPreviousToken(tokens_, token, kFlagNotDelimiter); if (CheckTokenCategory(previous_token, kUnknown)) { if (IsStringEqualTo(previous_token->content, L"Movie") || IsStringEqualTo(previous_token->content, L"Part")) { continue; } } // We'll use this number after all if (SetEpisodeNumber(token->content, *token, true)) return true; } return false; } } // namespace anitomy