Mercurial > minori
view dep/anitomy/anitomy/parser_number.cpp @ 137:69db40272acd
dep/animia: [WIP] huge refactor
this WILL NOT compile, because lots of code has been changed
and every API in the original codebase has been removed.
note that this api setup is not exactly permanent...
author | Paper <mrpapersonic@gmail.com> |
---|---|
date | Fri, 10 Nov 2023 13:52:47 -0500 |
parents | 5c0397762b53 |
children | a0aa8c8c4307 |
line wrap: on
line source
/* ** Copyright (c) 2014-2017, Eren Okka ** ** This Source Code Form is subject to the terms of the Mozilla Public ** License, v. 2.0. If a copy of the MPL was not distributed with this ** file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #include <algorithm> #include <regex> #include "element.h" #include "keyword.h" #include "parser.h" #include "string.h" namespace anitomy { bool Parser::IsValidEpisodeNumber(const string_t& number) { return StringToInt(number) <= kEpisodeNumberMax; } bool Parser::SetEpisodeNumber(const string_t& number, Token& token, bool validate) { if (validate && !IsValidEpisodeNumber(number)) return false; token.category = kIdentifier; auto category = kElementEpisodeNumber; // Handle equivalent numbers if (found_episode_keywords_) { for (auto& element : elements_) { if (element.first != kElementEpisodeNumber) continue; // The larger number gets to be the alternative one const int comparison = StringToInt(number) - StringToInt(element.second); if (comparison > 0) { category = kElementEpisodeNumberAlt; } else if (comparison < 0) { element.first = kElementEpisodeNumberAlt; } else { return false; // No need to add the same number twice } break; } } elements_.insert(category, number); return true; } bool Parser::SetAlternativeEpisodeNumber(const string_t& number, Token& token) { elements_.insert(kElementEpisodeNumberAlt, number); token.category = kIdentifier; return true; } //////////////////////////////////////////////////////////////////////////////// bool Parser::IsValidVolumeNumber(const string_t& number) { return StringToInt(number) <= kVolumeNumberMax; } bool Parser::SetVolumeNumber(const string_t& number, Token& token, bool validate) { if (validate) if (!IsValidVolumeNumber(number)) return false; elements_.insert(kElementVolumeNumber, number); token.category = kIdentifier; return true; } //////////////////////////////////////////////////////////////////////////////// bool Parser::NumberComesAfterPrefix(ElementCategory category, Token& token) { size_t number_begin = FindNumberInString(token.content); auto prefix = keyword_manager.Normalize(token.content.substr(0, number_begin)); if (keyword_manager.Find(category, prefix)) { auto number = token.content.substr(number_begin, token.content.length() - number_begin); switch (category) { case kElementEpisodePrefix: if (!MatchEpisodePatterns(number, token)) SetEpisodeNumber(number, token, false); return true; case kElementVolumePrefix: if (!MatchVolumePatterns(number, token)) SetVolumeNumber(number, token, false); return true; default: break; } } return false; } bool Parser::NumberComesBeforeAnotherNumber(const token_iterator_t token) { auto separator_token = FindNextToken(tokens_, token, kFlagNotDelimiter); if (separator_token != tokens_.end()) { static const std::vector<std::pair<string_t, bool>> separators{ {L"&", true }, {L"of", false}, }; for (const auto& separator : separators) { if (IsStringEqualTo(separator_token->content, separator.first)) { auto other_token = FindNextToken(tokens_, separator_token, kFlagNotDelimiter); if (other_token != tokens_.end() && IsNumericString(other_token->content)) { SetEpisodeNumber(token->content, *token, false); if (separator.second) SetEpisodeNumber(other_token->content, *other_token, false); separator_token->category = kIdentifier; other_token->category = kIdentifier; return true; } } } } return false; } bool Parser::SearchForEpisodePatterns(std::vector<size_t>& tokens) { for (const auto& token_index : tokens) { auto token = tokens_.begin() + token_index; bool numeric_front = IsNumericChar(token->content.front()); if (!numeric_front) { // e.g. "EP.1", "Vol.1" if (NumberComesAfterPrefix(kElementEpisodePrefix, *token)) return true; if (NumberComesAfterPrefix(kElementVolumePrefix, *token)) continue; } else { // e.g. "8 & 10", "01 of 24" if (NumberComesBeforeAnotherNumber(token)) return true; } // Look for other patterns if (MatchEpisodePatterns(token->content, *token)) return true; } return false; } //////////////////////////////////////////////////////////////////////////////// using regex_t = std::basic_regex<char_t>; using regex_match_results_t = std::match_results<string_t::const_iterator>; bool Parser::MatchSingleEpisodePattern(const string_t& word, Token& token) { static const regex_t pattern(L"(\\d{1,4})[vV](\\d)"); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { SetEpisodeNumber(match_results[1].str(), token, false); elements_.insert(kElementReleaseVersion, match_results[2].str()); return true; } return false; } bool Parser::MatchMultiEpisodePattern(const string_t& word, Token& token) { static const regex_t pattern(L"(\\d{1,4})(?:[vV](\\d))?[-~&+]" L"(\\d{1,4})(?:[vV](\\d))?"); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { auto lower_bound = match_results[1].str(); auto upper_bound = match_results[3].str(); // Avoid matching expressions such as "009-1" or "5-2" if (StringToInt(lower_bound) < StringToInt(upper_bound)) { if (SetEpisodeNumber(lower_bound, token, true)) { SetEpisodeNumber(upper_bound, token, false); if (match_results[2].matched) elements_.insert(kElementReleaseVersion, match_results[2].str()); if (match_results[4].matched) elements_.insert(kElementReleaseVersion, match_results[4].str()); return true; } } } return false; } bool Parser::MatchSeasonAndEpisodePattern(const string_t& word, Token& token) { static const regex_t pattern(L"S?" L"(\\d{1,2})(?:-S?(\\d{1,2}))?" L"(?:x|[ ._-x]?E)" L"(\\d{1,4})(?:-E?(\\d{1,4}))?" L"(?:[vV](\\d))?", std::regex_constants::icase); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { if (StringToInt(match_results[1]) == 0) return false; elements_.insert(kElementAnimeSeason, match_results[1]); if (match_results[2].matched) elements_.insert(kElementAnimeSeason, match_results[2]); SetEpisodeNumber(match_results[3], token, false); if (match_results[4].matched) SetEpisodeNumber(match_results[4], token, false); return true; } return false; } bool Parser::MatchTypeAndEpisodePattern(const string_t& word, Token& token) { size_t number_begin = FindNumberInString(word); auto prefix = word.substr(0, number_begin); ElementCategory category = kElementAnimeType; KeywordOptions options; if (keyword_manager.Find(keyword_manager.Normalize(prefix), category, options)) { elements_.insert(kElementAnimeType, prefix); auto number = word.substr(number_begin); if (MatchEpisodePatterns(number, token) || SetEpisodeNumber(number, token, true)) { auto it = std::find(tokens_.begin(), tokens_.end(), token); if (it != tokens_.end()) { // Split token (we do this last in order to avoid invalidating our // token reference earlier) token.content = number; tokens_.insert(it, Token(options.identifiable ? kIdentifier : kUnknown, prefix, token.enclosed)); } return true; } } return false; } bool Parser::MatchFractionalEpisodePattern(const string_t& word, Token& token) { // We don't allow any fractional part other than ".5", because there are cases // where such a number is a part of the anime title (e.g. "Evangelion: 1.11", // "Tokyo Magnitude 8.0") or a keyword (e.g. "5.1"). static const regex_t pattern(L"\\d+\\.5"); if (std::regex_match(word, pattern)) if (SetEpisodeNumber(word, token, true)) return true; return false; } bool Parser::MatchPartialEpisodePattern(const string_t& word, Token& token) { auto it = std::find_if_not(word.begin(), word.end(), IsNumericChar); auto suffix_length = std::distance(it, word.end()); auto is_valid_suffix = [](const char_t c) { return (c >= L'A' && c <= L'C') || (c >= L'a' && c <= L'c'); }; if (suffix_length == 1 && is_valid_suffix(*it)) if (SetEpisodeNumber(word, token, true)) return true; return false; } bool Parser::MatchNumberSignPattern(const string_t& word, Token& token) { if (word.front() != L'#') return false; static const regex_t pattern(L"#(\\d{1,4})(?:[-~&+](\\d{1,4}))?(?:[vV](\\d))?"); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { if (SetEpisodeNumber(match_results[1].str(), token, true)) { if (match_results[2].matched) SetEpisodeNumber(match_results[2].str(), token, false); if (match_results[3].matched) elements_.insert(kElementReleaseVersion, match_results[3].str()); return true; } } return false; } bool Parser::MatchJapaneseCounterPattern(const string_t& word, Token& token) { if (word.back() != L'\u8A71') return false; static const regex_t pattern(L"(\\d{1,4})\u8A71"); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { SetEpisodeNumber(match_results[1].str(), token, false); return true; } return false; } bool Parser::MatchEpisodePatterns(string_t word, Token& token) { // All patterns contain at least one non-numeric character if (IsNumericString(word)) return false; TrimString(word, L" -"); const bool numeric_front = IsNumericChar(word.front()); const bool numeric_back = IsNumericChar(word.back()); // e.g. "01v2" if (numeric_front && numeric_back) if (MatchSingleEpisodePattern(word, token)) return true; // e.g. "01-02", "03-05v2" if (numeric_front && numeric_back) if (MatchMultiEpisodePattern(word, token)) return true; // e.g. "2x01", "S01E03", "S01-02xE001-150" if (numeric_back) if (MatchSeasonAndEpisodePattern(word, token)) return true; // e.g. "ED1", "OP4a", "OVA2" if (!numeric_front) if (MatchTypeAndEpisodePattern(word, token)) return true; // e.g. "07.5" if (numeric_front && numeric_back) if (MatchFractionalEpisodePattern(word, token)) return true; // e.g. "4a", "111C" if (numeric_front && !numeric_back) if (MatchPartialEpisodePattern(word, token)) return true; // e.g. "#01", "#02-03v2" if (numeric_back) if (MatchNumberSignPattern(word, token)) return true; // U+8A71 is used as counter for stories, episodes of TV series, etc. if (numeric_front) if (MatchJapaneseCounterPattern(word, token)) return true; return false; } //////////////////////////////////////////////////////////////////////////////// bool Parser::MatchSingleVolumePattern(const string_t& word, Token& token) { static const regex_t pattern(L"(\\d{1,2})[vV](\\d)"); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { SetVolumeNumber(match_results[1].str(), token, false); elements_.insert(kElementReleaseVersion, match_results[2].str()); return true; } return false; } bool Parser::MatchMultiVolumePattern(const string_t& word, Token& token) { static const regex_t pattern(L"(\\d{1,2})[-~&+](\\d{1,2})(?:[vV](\\d))?"); regex_match_results_t match_results; if (std::regex_match(word, match_results, pattern)) { auto lower_bound = match_results[1].str(); auto upper_bound = match_results[2].str(); if (StringToInt(lower_bound) < StringToInt(upper_bound)) { if (SetVolumeNumber(lower_bound, token, true)) { SetVolumeNumber(upper_bound, token, false); if (match_results[3].matched) elements_.insert(kElementReleaseVersion, match_results[3].str()); return true; } } } return false; } bool Parser::MatchVolumePatterns(string_t word, Token& token) { // All patterns contain at least one non-numeric character if (IsNumericString(word)) return false; TrimString(word, L" -"); const bool numeric_front = IsNumericChar(word.front()); const bool numeric_back = IsNumericChar(word.back()); // e.g. "01v2" if (numeric_front && numeric_back) if (MatchSingleVolumePattern(word, token)) return true; // e.g. "01-02", "03-05v2" if (numeric_front && numeric_back) if (MatchMultiVolumePattern(word, token)) return true; return false; } //////////////////////////////////////////////////////////////////////////////// bool Parser::SearchForEquivalentNumbers(std::vector<size_t>& tokens) { for (auto token_index = tokens.begin(); token_index != tokens.end(); ++token_index) { auto token = tokens_.begin() + *token_index; if (IsTokenIsolated(token) || !IsValidEpisodeNumber(token->content)) continue; // Find the first enclosed, non-delimiter token auto next_token = FindNextToken(tokens_, token, kFlagNotDelimiter); if (!CheckTokenCategory(next_token, kBracket)) continue; next_token = FindNextToken(tokens_, next_token, kFlagEnclosed | kFlagNotDelimiter); if (!CheckTokenCategory(next_token, kUnknown)) continue; // Check if it's an isolated number if (!IsTokenIsolated(next_token) || !IsNumericString(next_token->content) || !IsValidEpisodeNumber(next_token->content)) continue; auto minmax = std::minmax(token, next_token, [](const token_iterator_t& a, const token_iterator_t& b) { return StringToInt(a->content) < StringToInt(b->content); }); SetEpisodeNumber(minmax.first->content, *minmax.first, false); SetAlternativeEpisodeNumber(minmax.second->content, *minmax.second); return true; } return false; } bool Parser::SearchForIsolatedNumbers(std::vector<size_t>& tokens) { for (auto token_index = tokens.begin(); token_index != tokens.end(); ++token_index) { auto token = tokens_.begin() + *token_index; if (!token->enclosed || !IsTokenIsolated(token)) continue; if (SetEpisodeNumber(token->content, *token, true)) return true; } return false; } bool Parser::SearchForSeparatedNumbers(std::vector<size_t>& tokens) { for (auto token_index = tokens.begin(); token_index != tokens.end(); ++token_index) { auto token = tokens_.begin() + *token_index; auto previous_token = FindPreviousToken(tokens_, token, kFlagNotDelimiter); // See if the number has a preceding "-" separator if (CheckTokenCategory(previous_token, kUnknown) && IsDashCharacter(previous_token->content)) { if (SetEpisodeNumber(token->content, *token, true)) { previous_token->category = kIdentifier; return true; } } } return false; } bool Parser::SearchForLastNumber(std::vector<size_t>& tokens) { for (auto it = tokens.rbegin(); it != tokens.rend(); ++it) { size_t token_index = *it; auto token = tokens_.begin() + token_index; // Assuming that episode number always comes after the title, first token // cannot be what we're looking for if (token_index == 0) continue; // An enclosed token is unlikely to be the episode number at this point if (token->enclosed) continue; // Ignore if it's the first non-enclosed, non-delimiter token if (std::all_of(tokens_.begin(), token, [](const Token& token) { return token.enclosed || token.category == kDelimiter; })) continue; // Ignore if the previous token is "Movie" or "Part" auto previous_token = FindPreviousToken(tokens_, token, kFlagNotDelimiter); if (CheckTokenCategory(previous_token, kUnknown)) { if (IsStringEqualTo(previous_token->content, L"Movie") || IsStringEqualTo(previous_token->content, L"Part")) { continue; } } // We'll use this number after all if (SetEpisodeNumber(token->content, *token, true)) return true; } return false; } } // namespace anitomy