Mercurial > foo_out_sdl
diff foosdk/sdk/pfc/SmartStrStr.cpp @ 1:20d02a178406 default tip
*: check in everything else
yay
| author | Paper <paper@tflc.us> |
|---|---|
| date | Mon, 05 Jan 2026 02:15:46 -0500 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/foosdk/sdk/pfc/SmartStrStr.cpp Mon Jan 05 02:15:46 2026 -0500 @@ -0,0 +1,444 @@ +#include "pfc-lite.h" + +#include "string-conv-lite.h" +#include "string_conv.h" +#include "SmartStrStr.h" +#include <algorithm> +#include "SmartStrStr-table.h" +#include "SmartStrStr-twoCharMappings.h" + +bool SmartStrStr::isWordChar(unsigned c) { + // FIX ME map Unicode ranges somehow + return c >= 128 || pfc::char_is_ascii_alphanumeric((char)c); +} + +bool SmartStrStr::isWordChar(const char* ptr) { + unsigned c; + size_t d = pfc::utf8_decode_char(ptr, c); + if (d == 0) return false; // bad UTF-8 + return isWordChar(c); +} + +bool SmartStrStr::isValidWord(const char* ptr) { + if (*ptr == 0) return false; + do { + unsigned c; + size_t d = pfc::utf8_decode_char(ptr, c); + if (d == 0) return false; // bad UTF-8 + if (!isWordChar(c)) return false; + ptr += d; + } while (*ptr != 0); + return true; +} + +void SmartStrStr::findWords(const char* str, std::function<void(pfc::string_part_ref)> cb) { + size_t base = 0, walk = 0; + for (;; ) { + unsigned c = 0; + size_t d = pfc::utf8_decode_char(str + walk, c); + if (d == 0) break; + + if (!SmartStrStr::isWordChar(c)) { + if (walk > base) { + cb(pfc::string_part(str + base, walk - base)); + } + base = walk + d; + } + walk += d; + } + if (walk > base) { + cb(pfc::string_part(str + base, walk - base)); + } +} + +SmartStrStr::SmartStrStr() { + std::map<uint32_t, std::set<uint32_t> > substitutions, substitutionsReverse; + std::map<uint32_t, uint32_t > downconvert; + +#if 1 + for (auto& walk : SmartStrStrTable) { + downconvert[walk.from] = walk.to; + substitutions[walk.from].insert(walk.to); + } +#else + for (uint32_t walk = 128; walk < 0x10000; ++walk) { + uint32_t c = Transform(walk); + if (c != walk) { + downconvert[walk] = c; + substitutions[walk].insert(c); + } + } +#endif + + for (uint32_t walk = 32; walk < 0x10000; ++walk) { + auto lo = ToLower(walk); + if (lo != walk) { + auto & s = substitutions[walk]; s.insert(lo); + + auto iter = substitutions.find(lo); + if (iter != substitutions.end()) { + s.insert(iter->second.begin(), iter->second.end()); + } + } + } + + for( auto & walk : substitutions ) { + for( auto & walk2 : walk.second ) { + substitutionsReverse[walk2].insert(walk.first); + } + } + + this->m_substitutions.initialize(std::move(substitutions)); + this->m_substitutionsReverse.initialize(std::move(substitutionsReverse)); + this->m_downconvert.initialize(std::move(downconvert)); + InitTwoCharMappings(); +} + +// == TEMPLATES == +template<typename char_t> const char_t * SmartStrStr::matchHere_(const char_t * pString, const char_t * pUserString) const { + auto walkData = pString; + auto walkUser = pUserString; + for (;; ) { + if (*walkUser == 0) return walkData; + + uint32_t cData, cUser; + size_t dData = pfc::uni_decode_char(walkData, cData); + size_t dUser = pfc::uni_decode_char(walkUser, cUser); + if (dData == 0 || dUser == 0) return nullptr; + + if (cData != cUser) { + bool gotMulti = false; + { + const char * cDataSubst = m_twoCharMappings.query(cData); + if (cDataSubst != nullptr) { + PFC_ASSERT(strlen(cDataSubst) == 2); + if (matchOneChar(cUser, (uint32_t)cDataSubst[0])) { + auto walkUser2 = walkUser + dUser; + uint32_t cUser2; + auto dUser2 = pfc::uni_decode_char(walkUser2, cUser2); + if (matchOneChar(cUser2, (uint32_t)cDataSubst[1])) { + gotMulti = true; + dUser += dUser2; + } + } + } + } + if (!gotMulti) { + if (!matchOneChar(cUser, cData)) return nullptr; + } + } + + walkData += dData; + walkUser += dUser; + } +} +template<typename char_t> bool SmartStrStr::equals_( const char_t * pString, const char_t * pUserString) const { + auto p = this->matchHere_(pString, pUserString); + if ( p == nullptr ) return false; + return *p == 0; +} + +template<typename char_t> const char_t * SmartStrStr::strStrEnd_(const char_t * pString, const char_t * pSubString, size_t * outFoundAt) const { + size_t walk = 0; + for (;; ) { + if (pString[walk] == 0) return nullptr; + auto end = matchHere_(pString + walk, pSubString); + if (end != nullptr) { + if (outFoundAt != nullptr) * outFoundAt = walk; + return end; + } + + size_t delta = pfc::uni_char_length(pString + walk); + if (delta == 0) return nullptr; + walk += delta; + } +} +// == END TEMPLATES == + +const char16_t * SmartStrStr::matchHere16(const char16_t * pString, const char16_t * pUserString) const { + return this->matchHere_(pString, pUserString); +} +const char * SmartStrStr::matchHere(const char * pString, const char * pUserString) const { + return this->matchHere_(pString, pUserString); +} +const wchar_t * SmartStrStr::matchHereW(const wchar_t * pString, const wchar_t * pUserString) const { + return this->matchHere_(pString, pUserString); +} + +bool SmartStrStr::equals(const char * pString, const char * pUserString) const { + return equals_(pString, pUserString); +} +bool SmartStrStr::equals16(const char16_t* pString, const char16_t* pUserString) const { + return equals_(pString, pUserString); +} +bool SmartStrStr::equalsW( const wchar_t * pString, const wchar_t * pUserString) const { + return equals_(pString, pUserString); +} +const char * SmartStrStr::strStrEnd(const char * pString, const char * pSubString, size_t * outFoundAt) const { + return strStrEnd_(pString, pSubString, outFoundAt); +} + +const char16_t * SmartStrStr::strStrEnd16(const char16_t * pString, const char16_t * pSubString, size_t * outFoundAt) const { + return strStrEnd_(pString, pSubString, outFoundAt); +} + +const wchar_t * SmartStrStr::strStrEndW(const wchar_t * pString, const wchar_t * pSubString, size_t * outFoundAt) const { + return strStrEnd_(pString, pSubString, outFoundAt); +} + +static bool wordBeginsHere(const char* base, size_t offset) { + if (offset == 0) return true; + for (size_t len = 1; len <= offset && len <= 6; --len) { + unsigned c; + if (pfc::utf8_decode_char(base + offset - len, c) == len) { + return !SmartStrStr::isWordChar(c); + } + } + return false; +} + +const char* SmartStrStr::strStrEndWord(const char* pString, const char* pSubString, size_t* outFoundAt) const { + size_t walk = 0; + for (;;) { + size_t foundAt = 0; + auto end = strStrEnd(pString + walk, pSubString, &foundAt); + if (end == nullptr) return nullptr; + foundAt += walk; + if (!isWordChar(end) && wordBeginsHere(pString, foundAt)) { + if (outFoundAt) *outFoundAt = foundAt; + return end; + } + walk = end - pString; + } +} + +bool SmartStrStr::matchOneChar(uint32_t cInput, uint32_t cData) const { + if (cInput == cData) return true; + auto v = m_substitutions.query_ptr(cData); + if (v == nullptr) return false; + return v->count(cInput) > 0; +} + +pfc::string8 SmartStrStr::transformStr(const char* str) const { + pfc::string8 ret; transformStrHere(ret, str); return ret; +} + +void SmartStrStr::transformStrHere(pfc::string8& out, const char* in) const { + transformStrHere(out, in, strlen(in)); +} + +void SmartStrStr::transformStrHere(pfc::string8& out, const char* in, size_t inLen) const { + out.prealloc(inLen); + out.clear(); + for (size_t walk = 0; walk < inLen; ) { + unsigned c; + size_t d = pfc::utf8_decode_char(in + walk, c); + if (d == 0 || walk+d>inLen) break; + walk += d; + const char* alt = m_twoCharMappings.query(c); + if (alt != nullptr) { + out << alt; continue; + } + unsigned alt2 = m_downconvert.query(c); + if (alt2 != 0) { + out.add_char(alt2); continue; + } + out.add_char(c); + } +} + +#if 0 // Windows specific code +uint32_t SmartStrStr::Transform(uint32_t c) { + wchar_t wide[2] = {}; char out[4] = {}; + pfc::utf16_encode_char(c, wide); + BOOL fail = FALSE; + if (WideCharToMultiByte(pfc::stringcvt::codepage_ascii, 0, wide, 2, out, 4, "?", &fail) > 0) { + if (!fail) { + if (out[0] > 0 && out[1] == 0) { + c = out[0]; + } + } + } + return c; +} +#endif + +uint32_t SmartStrStr::ToLower(uint32_t c) { + return pfc::charLower(c); +} + +void SmartStrStr::InitTwoCharMappings() { + std::map<uint32_t, const char* > mappings; + std::map<uint32_t, uint32_t> reverse; + for (auto& walk : twoCharMappings) { + mappings[walk.from] = walk.to; + uint32_t c1, c2; + const char * p = walk.to; + size_t d; + d = pfc::utf8_decode_char(p, c1); + if ( d > 0 ) { + p += d; + d = pfc::utf8_decode_char(p, c2); + if (d > 0) { + if (c1 < 0x10000 && c2 < 0x10000) { + reverse[c1 | (c2 << 16)] = walk.from; + } + } + } + } + m_twoCharMappings.initialize(std::move(mappings)); + m_twoCharMappingsReverse.initialize(std::move(reverse)); +} +bool SmartStrStr::testSubString_prefix(const char* str, const char* sub, const char * prefix, size_t prefixLen) const { + + switch(prefixLen) { + case 0: + return false; + case 1: + for(const char * walk = str;; ) { + walk = strchr(walk, *prefix); + if ( walk == nullptr ) return false; + ++walk; + if (matchHere(walk, sub)) return true; + } + default: + for(const char * walk = str;; ) { + walk = strstr(walk, prefix); + if ( walk == nullptr ) return false; + walk += prefixLen; + if (matchHere(walk, sub)) return true; + } + } +} +bool SmartStrStr::testSubString_prefix(const char* str, const char* sub, uint32_t c) const { + size_t tempLen; + char temp[8]; + tempLen = pfc::utf8_encode_char(c, temp); temp[tempLen] = 0; + return testSubString_prefix(str, sub, temp, tempLen); +} +bool SmartStrStr::testSubString_prefix_subst(const char* str, const char* sub, uint32_t prefix) const { + if ( testSubString_prefix(str, sub, prefix)) return true; + + auto alt = m_substitutionsReverse.query_ptr( prefix ); + if (alt != nullptr) { + for (auto c : *alt) { + if (testSubString_prefix(str, sub, c)) return true; + } + } + + return false; +} +bool SmartStrStr::testSubstring(const char* str, const char* sub) const { +#if 1 + // optimized version for UTF-8 + unsigned prefix; + const size_t skip = pfc::uni_decode_char(sub, prefix); + if ( skip == 0 ) return false; + sub += skip; + + if (testSubString_prefix_subst(str, sub, prefix)) return true; + + unsigned prefix2; + const size_t skip2 = pfc::uni_decode_char(sub, prefix2); + if (skip2 > 0 && prefix < 0x10000 && prefix2 < 0x10000) { + sub += skip2; + auto alt = m_twoCharMappingsReverse.query(prefix | (prefix2 << 16)); + if (alt != 0) { + if (testSubString_prefix_subst(str, sub, alt)) return true; + } + } + + return false; +#else + return this->strStrEnd(str, sub) != nullptr; +#endif +} +bool SmartStrStr::testSubstring16(const char16_t* str, const char16_t* sub) const { + return this->strStrEnd16(str, sub) != nullptr; +} +bool SmartStrStr::testSubstringW( const wchar_t * str, const wchar_t * sub ) const { + return this->strStrEndW(str, sub) != nullptr; +} + +SmartStrStr& SmartStrStr::global() { + static SmartStrStr g; + return g; +} + + +void SmartStrFilter::init(const char* ptr, size_t len) { + pfc::string_formatter current, temp; + bool inQuotation = false; + + auto addCurrent = [&] { + if (!current.is_empty()) { + ++m_items[current.get_ptr()]; current.reset(); + } + }; + + for (t_size walk = 0; walk < len; ++walk) { + const char c = ptr[walk]; + if (c == '\"') inQuotation = !inQuotation; + else if (!inQuotation && is_spacing(c)) { + addCurrent(); + } else { + current.add_byte(c); + } + } + if (inQuotation) { + // Allow unbalanced quotes, take the whole string *with* quotation marks + m_items.clear(); + current.set_string_nc(ptr, len); + } + + addCurrent(); +} + + +bool SmartStrFilter::test_disregardCounts(const char* src) const { + if (m_items.empty()) return false; + + for (auto& walk : m_items) { + if (!dc->strStrEnd(src, walk.first.c_str())) return false; + } + return true; +} + +bool SmartStrFilter::testWords(const char* src) const { + if (m_items.empty()) return false; + + for (auto& walk : m_items) { + const auto count = walk.second; + const auto& str = walk.first; + const auto* strWalk = src; + for (size_t i = 0; i < count; ++i) { + auto next = dc->strStrEndWord(strWalk, str.c_str()); + if (next == nullptr) return false; + strWalk = next; + } + } + return true; +} + +bool SmartStrFilter::test(const char* src) const { + + if (m_items.empty()) return false; + + // Use the faster routine first, it can't be used to count occurances but nobody really knows about this feature + for (auto& walk : m_items) { + if (!dc->testSubstring(src, walk.first.c_str())) return false; + } + // Have any items where specific number of occurances is wanted? + for (auto & walk : m_items) { + const auto count = walk.second; + if (count == 1) continue; + const auto& str = walk.first; + const auto* strWalk = src; + for (size_t i = 0; i < count; ++i) { + auto next = dc->strStrEnd(strWalk, str.c_str()); + if (next == nullptr) return false; + strWalk = next; + } + } + return true; +}
