view dep/anitomy/anitomy/tokenizer.cpp @ 347:a0aa8c8c4307
dep/anitomy: port to use UCS-4 rather than wide strings
rationale: wide strings are not the same on every platform (wchar_t
is 16 bits on Windows and 32 bits on most Unix-likes), and they are
not even guaranteed to hold Unicode. (in practice they usually do,
but nothing in the standard requires it.)

I was *going* to change StringToInt to use a string stream, but
extracting an integer doesn't work at all with UCS-4 streams, even
though it ought to: the standard library only provides the numeric
facets for char and wchar_t. so I just rolled my own that accepts
the Arabic digits only.
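For illustration, here is a minimal sketch of the digits-only conversion described above, assuming string_t is std::u32string as in this port. The name StringToIntSketch and the sign handling are illustrative, not necessarily what this changeset's StringToInt does:

#include <string>

// Hedged sketch: convert a UCS-4 string to int using only the ASCII
// ("Arabic") digits U+0030..U+0039, sidestepping the stream facets
// that the standard library does not provide for char32_t.
int StringToIntSketch(const std::u32string& str) {
  int value = 0;
  bool negative = false;
  auto it = str.begin();
  if (it != str.end() && (*it == U'-' || *it == U'+')) {
    negative = (*it == U'-');  // remember the sign, then skip it
    ++it;
  }
  for (; it != str.end(); ++it) {
    if (*it < U'0' || *it > U'9')
      break;  // stop at the first non-digit
    value = value * 10 + static_cast<int>(*it - U'0');
  }
  return negative ? -value : value;
}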
author   | Paper <paper@paper.us.eu.org>
date     | Sun, 23 Jun 2024 10:32:09 -0400
parents  | 5c0397762b53
children |
line source
/*
** Copyright (c) 2014-2017, Eren Okka
**
** This Source Code Form is subject to the terms of the Mozilla Public
** License, v. 2.0. If a copy of the MPL was not distributed with this
** file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

#include <algorithm>
#include <iterator>
#include <set>

#include "keyword.h"
#include "string.h"
#include "tokenizer.h"

namespace anitomy {

Tokenizer::Tokenizer(const string_t& filename, Elements& elements,
                     const Options& options, token_container_t& tokens)
    : elements_(elements),
      filename_(filename),
      options_(options),
      tokens_(tokens) {
}

bool Tokenizer::Tokenize() {
  tokens_.reserve(32);  // Usually there are no more than 20 tokens

  TokenizeByBrackets();

  return !tokens_.empty();
}

////////////////////////////////////////////////////////////////////////////////

void Tokenizer::AddToken(TokenCategory category, bool enclosed,
                         const TokenRange& range) {
  tokens_.push_back(Token(category,
                          filename_.substr(range.offset, range.size),
                          enclosed));
}

void Tokenizer::TokenizeByBrackets() {
  static const std::vector<std::pair<char_t, char_t>> brackets{
      {U'(', U')'},            // U+0028-U+0029 Parenthesis
      {U'[', U']'},            // U+005B-U+005D Square bracket
      {U'{', U'}'},            // U+007B-U+007D Curly bracket
      {U'\u300C', U'\u300D'},  // Corner bracket
      {U'\u300E', U'\u300F'},  // White corner bracket
      {U'\u3010', U'\u3011'},  // Black lenticular bracket
      {U'\uFF08', U'\uFF09'},  // Fullwidth parenthesis
  };

  bool is_bracket_open = false;
  char_t matching_bracket = U'\0';

  auto char_begin = filename_.begin();
  const auto char_end = filename_.end();

  // This is basically std::find_first_of() customized to our needs
  auto find_first_bracket = [&]() -> string_t::const_iterator {
    for (auto it = char_begin; it != char_end; ++it) {
      for (const auto& bracket_pair : brackets) {
        if (*it == bracket_pair.first) {
          matching_bracket = bracket_pair.second;
          return it;
        }
      }
    }
    return char_end;
  };

  auto current_char = char_begin;

  while (current_char != char_end && char_begin != char_end) {
    if (!is_bracket_open) {
      current_char = find_first_bracket();
    } else {
      // Looking for the matching bracket allows us to better handle some rare
      // cases with nested brackets.
      current_char = std::find(char_begin, char_end, matching_bracket);
    }

    const TokenRange range{
        static_cast<size_t>(std::distance(filename_.begin(), char_begin)),
        static_cast<size_t>(std::distance(char_begin, current_char))};

    if (range.size > 0)  // Found unknown token
      TokenizeByPreidentified(is_bracket_open, range);

    if (current_char != char_end) {  // Found bracket
      AddToken(kBracket, true, TokenRange{range.offset + range.size, 1});
      is_bracket_open = !is_bracket_open;
      char_begin = ++current_char;
    }
  }
}

void Tokenizer::TokenizeByPreidentified(bool enclosed,
                                        const TokenRange& range) {
  std::vector<TokenRange> preidentified_tokens;
  keyword_manager.Peek(filename_, range, elements_, preidentified_tokens);

  size_t offset = range.offset;
  TokenRange subrange{range.offset, 0};

  while (offset < range.offset + range.size) {
    for (const auto& preidentified_token : preidentified_tokens) {
      if (offset == preidentified_token.offset) {
        if (subrange.size > 0)
          TokenizeByDelimiters(enclosed, subrange);
        AddToken(kIdentifier, enclosed, preidentified_token);
        subrange.offset =
            preidentified_token.offset + preidentified_token.size;
        offset = subrange.offset - 1;  // It's going to be incremented below
        break;
      }
    }
    subrange.size = ++offset - subrange.offset;
  }

  // Either there was no preidentified token range, or we're now about to
  // process the tail of our current range.
  if (subrange.size > 0)
    TokenizeByDelimiters(enclosed, subrange);
}

void Tokenizer::TokenizeByDelimiters(bool enclosed, const TokenRange& range) {
  const string_t delimiters = GetDelimiters(range);

  if (delimiters.empty()) {
    AddToken(kUnknown, enclosed, range);
    return;
  }

  auto char_begin = filename_.begin() + range.offset;
  const auto char_end = char_begin + range.size;
  auto current_char = char_begin;

  while (current_char != char_end) {
    current_char = std::find_first_of(current_char, char_end,
                                      delimiters.begin(), delimiters.end());

    const TokenRange subrange{
        static_cast<size_t>(std::distance(filename_.begin(), char_begin)),
        static_cast<size_t>(std::distance(char_begin, current_char))};

    if (subrange.size > 0)  // Found unknown token
      AddToken(kUnknown, enclosed, subrange);

    if (current_char != char_end) {  // Found delimiter
      AddToken(kDelimiter, enclosed,
               TokenRange{subrange.offset + subrange.size, 1});
      char_begin = ++current_char;
    }
  }

  ValidateDelimiterTokens();
}

////////////////////////////////////////////////////////////////////////////////

string_t Tokenizer::GetDelimiters(const TokenRange& range) const {
  string_t delimiters;

  auto is_delimiter = [&](const char_t& c) {
    if (!IsAlphanumericChar(c))
      if (options_.allowed_delimiters.find(c) != string_t::npos)
        if (delimiters.find(c) == string_t::npos)
          return true;
    return false;
  };

  std::copy_if(filename_.begin() + range.offset,
               filename_.begin() + range.offset + range.size,
               std::back_inserter(delimiters), is_delimiter);

  return delimiters;
}

void Tokenizer::ValidateDelimiterTokens() {
  auto is_delimiter_token = [&](token_iterator_t it) {
    return it != tokens_.end() && it->category == kDelimiter;
  };

  auto is_unknown_token = [&](token_iterator_t it) {
    return it != tokens_.end() && it->category == kUnknown;
  };

  auto is_single_character_token = [&](token_iterator_t it) {
    return is_unknown_token(it) && it->content.size() == 1 &&
           it->content.front() != U'-';
  };

  auto append_token_to = [](token_iterator_t token,
                            token_iterator_t append_to) {
    append_to->content.append(token->content);
    token->category = kInvalid;
  };

  for (auto token = tokens_.begin(); token != tokens_.end(); ++token) {
    if (token->category != kDelimiter)
      continue;

    auto delimiter = token->content.front();
    auto prev_token = FindPreviousToken(tokens_, token, kFlagValid);
    auto next_token = FindNextToken(tokens_, token, kFlagValid);

    // Check for single-character tokens to prevent splitting group names,
    // keywords, episode number, etc.
    if (delimiter != U' ' && delimiter != U'_') {
      if (is_single_character_token(prev_token)) {
        append_token_to(token, prev_token);
        while (is_unknown_token(next_token)) {
          append_token_to(next_token, prev_token);
          next_token = FindNextToken(tokens_, next_token, kFlagValid);
          if (is_delimiter_token(next_token) &&
              next_token->content.front() == delimiter) {
            append_token_to(next_token, prev_token);
            next_token = FindNextToken(tokens_, next_token, kFlagValid);
          }
        }
        continue;
      }

      if (is_single_character_token(next_token)) {
        append_token_to(token, prev_token);
        append_token_to(next_token, prev_token);
        continue;
      }
    }

    // Check for adjacent delimiters
    if (is_unknown_token(prev_token) && is_delimiter_token(next_token)) {
      auto next_delimiter = next_token->content.front();
      if (delimiter != next_delimiter && delimiter != U',') {
        if (next_delimiter == U' ' || next_delimiter == U'_') {
          append_token_to(token, prev_token);
        }
      }
    } else if (is_delimiter_token(prev_token) &&
               is_delimiter_token(next_token)) {
      const auto prev_delimiter = prev_token->content.front();
      const auto next_delimiter = next_token->content.front();
      if (prev_delimiter == next_delimiter &&
          prev_delimiter != delimiter) {
        token->category = kUnknown;  // e.g. "&" in "_&_"
      }
    }

    // Check for other special cases
    if (delimiter == U'&' || delimiter == U'+') {
      if (is_unknown_token(prev_token) && is_unknown_token(next_token)) {
        if (IsNumericString(prev_token->content) &&
            IsNumericString(next_token->content)) {
          append_token_to(token, prev_token);
          append_token_to(next_token, prev_token);  // e.g. "01+02"
        }
      }
    }
  }

  auto remove_if_invalid = std::remove_if(
      tokens_.begin(), tokens_.end(),
      [](const Token& token) -> bool {
        return token.category == kInvalid;
      });
  tokens_.erase(remove_if_invalid, tokens_.end());
}

}  // namespace anitomy
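For context, a hedged sketch of how this tokenizer is typically driven. The types Elements, Options, and token_container_t come from the headers included above; the header names and the construction shown here are an assumption based on anitomy's public API, not code from this changeset:

#include "element.h"    // assumed location of Elements
#include "options.h"    // assumed location of Options
#include "string.h"
#include "token.h"      // assumed location of token_container_t
#include "tokenizer.h"

// Hypothetical driver: tokenize one filename and report success.
bool TokenizeFilename(const anitomy::string_t& filename) {
  anitomy::Elements elements;
  anitomy::Options options;
  anitomy::token_container_t tokens;

  anitomy::Tokenizer tokenizer(filename, elements, options, tokens);
  return tokenizer.Tokenize();  // fills `tokens`; false if nothing was found
}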