Mercurial > foo_out_sdl
comparison foosdk/sdk/pfc/SmartStrStr.cpp @ 1:20d02a178406 default tip
*: check in everything else
yay
| author | Paper <paper@tflc.us> |
|---|---|
| date | Mon, 05 Jan 2026 02:15:46 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:e9bb126753e7 | 1:20d02a178406 |
|---|---|
| 1 #include "pfc-lite.h" | |
| 2 | |
| 3 #include "string-conv-lite.h" | |
| 4 #include "string_conv.h" | |
| 5 #include "SmartStrStr.h" | |
| 6 #include <algorithm> | |
| 7 #include "SmartStrStr-table.h" | |
| 8 #include "SmartStrStr-twoCharMappings.h" | |
| 9 | |
| 10 bool SmartStrStr::isWordChar(unsigned c) { | |
| 11 // FIX ME map Unicode ranges somehow | |
| 12 return c >= 128 || pfc::char_is_ascii_alphanumeric((char)c); | |
| 13 } | |
| 14 | |
| 15 bool SmartStrStr::isWordChar(const char* ptr) { | |
| 16 unsigned c; | |
| 17 size_t d = pfc::utf8_decode_char(ptr, c); | |
| 18 if (d == 0) return false; // bad UTF-8 | |
| 19 return isWordChar(c); | |
| 20 } | |
| 21 | |
| 22 bool SmartStrStr::isValidWord(const char* ptr) { | |
| 23 if (*ptr == 0) return false; | |
| 24 do { | |
| 25 unsigned c; | |
| 26 size_t d = pfc::utf8_decode_char(ptr, c); | |
| 27 if (d == 0) return false; // bad UTF-8 | |
| 28 if (!isWordChar(c)) return false; | |
| 29 ptr += d; | |
| 30 } while (*ptr != 0); | |
| 31 return true; | |
| 32 } | |
| 33 | |
| 34 void SmartStrStr::findWords(const char* str, std::function<void(pfc::string_part_ref)> cb) { | |
| 35 size_t base = 0, walk = 0; | |
| 36 for (;; ) { | |
| 37 unsigned c = 0; | |
| 38 size_t d = pfc::utf8_decode_char(str + walk, c); | |
| 39 if (d == 0) break; | |
| 40 | |
| 41 if (!SmartStrStr::isWordChar(c)) { | |
| 42 if (walk > base) { | |
| 43 cb(pfc::string_part(str + base, walk - base)); | |
| 44 } | |
| 45 base = walk + d; | |
| 46 } | |
| 47 walk += d; | |
| 48 } | |
| 49 if (walk > base) { | |
| 50 cb(pfc::string_part(str + base, walk - base)); | |
| 51 } | |
| 52 } | |
| 53 | |
| 54 SmartStrStr::SmartStrStr() { | |
| 55 std::map<uint32_t, std::set<uint32_t> > substitutions, substitutionsReverse; | |
| 56 std::map<uint32_t, uint32_t > downconvert; | |
| 57 | |
| 58 #if 1 | |
| 59 for (auto& walk : SmartStrStrTable) { | |
| 60 downconvert[walk.from] = walk.to; | |
| 61 substitutions[walk.from].insert(walk.to); | |
| 62 } | |
| 63 #else | |
| 64 for (uint32_t walk = 128; walk < 0x10000; ++walk) { | |
| 65 uint32_t c = Transform(walk); | |
| 66 if (c != walk) { | |
| 67 downconvert[walk] = c; | |
| 68 substitutions[walk].insert(c); | |
| 69 } | |
| 70 } | |
| 71 #endif | |
| 72 | |
| 73 for (uint32_t walk = 32; walk < 0x10000; ++walk) { | |
| 74 auto lo = ToLower(walk); | |
| 75 if (lo != walk) { | |
| 76 auto & s = substitutions[walk]; s.insert(lo); | |
| 77 | |
| 78 auto iter = substitutions.find(lo); | |
| 79 if (iter != substitutions.end()) { | |
| 80 s.insert(iter->second.begin(), iter->second.end()); | |
| 81 } | |
| 82 } | |
| 83 } | |
| 84 | |
| 85 for( auto & walk : substitutions ) { | |
| 86 for( auto & walk2 : walk.second ) { | |
| 87 substitutionsReverse[walk2].insert(walk.first); | |
| 88 } | |
| 89 } | |
| 90 | |
| 91 this->m_substitutions.initialize(std::move(substitutions)); | |
| 92 this->m_substitutionsReverse.initialize(std::move(substitutionsReverse)); | |
| 93 this->m_downconvert.initialize(std::move(downconvert)); | |
| 94 InitTwoCharMappings(); | |
| 95 } | |
| 96 | |
| 97 // == TEMPLATES == | |
| 98 template<typename char_t> const char_t * SmartStrStr::matchHere_(const char_t * pString, const char_t * pUserString) const { | |
| 99 auto walkData = pString; | |
| 100 auto walkUser = pUserString; | |
| 101 for (;; ) { | |
| 102 if (*walkUser == 0) return walkData; | |
| 103 | |
| 104 uint32_t cData, cUser; | |
| 105 size_t dData = pfc::uni_decode_char(walkData, cData); | |
| 106 size_t dUser = pfc::uni_decode_char(walkUser, cUser); | |
| 107 if (dData == 0 || dUser == 0) return nullptr; | |
| 108 | |
| 109 if (cData != cUser) { | |
| 110 bool gotMulti = false; | |
| 111 { | |
| 112 const char * cDataSubst = m_twoCharMappings.query(cData); | |
| 113 if (cDataSubst != nullptr) { | |
| 114 PFC_ASSERT(strlen(cDataSubst) == 2); | |
| 115 if (matchOneChar(cUser, (uint32_t)cDataSubst[0])) { | |
| 116 auto walkUser2 = walkUser + dUser; | |
| 117 uint32_t cUser2; | |
| 118 auto dUser2 = pfc::uni_decode_char(walkUser2, cUser2); | |
| 119 if (matchOneChar(cUser2, (uint32_t)cDataSubst[1])) { | |
| 120 gotMulti = true; | |
| 121 dUser += dUser2; | |
| 122 } | |
| 123 } | |
| 124 } | |
| 125 } | |
| 126 if (!gotMulti) { | |
| 127 if (!matchOneChar(cUser, cData)) return nullptr; | |
| 128 } | |
| 129 } | |
| 130 | |
| 131 walkData += dData; | |
| 132 walkUser += dUser; | |
| 133 } | |
| 134 } | |
| 135 template<typename char_t> bool SmartStrStr::equals_( const char_t * pString, const char_t * pUserString) const { | |
| 136 auto p = this->matchHere_(pString, pUserString); | |
| 137 if ( p == nullptr ) return false; | |
| 138 return *p == 0; | |
| 139 } | |
| 140 | |
| 141 template<typename char_t> const char_t * SmartStrStr::strStrEnd_(const char_t * pString, const char_t * pSubString, size_t * outFoundAt) const { | |
| 142 size_t walk = 0; | |
| 143 for (;; ) { | |
| 144 if (pString[walk] == 0) return nullptr; | |
| 145 auto end = matchHere_(pString + walk, pSubString); | |
| 146 if (end != nullptr) { | |
| 147 if (outFoundAt != nullptr) * outFoundAt = walk; | |
| 148 return end; | |
| 149 } | |
| 150 | |
| 151 size_t delta = pfc::uni_char_length(pString + walk); | |
| 152 if (delta == 0) return nullptr; | |
| 153 walk += delta; | |
| 154 } | |
| 155 } | |
| 156 // == END TEMPLATES == | |
| 157 | |
| 158 const char16_t * SmartStrStr::matchHere16(const char16_t * pString, const char16_t * pUserString) const { | |
| 159 return this->matchHere_(pString, pUserString); | |
| 160 } | |
| 161 const char * SmartStrStr::matchHere(const char * pString, const char * pUserString) const { | |
| 162 return this->matchHere_(pString, pUserString); | |
| 163 } | |
| 164 const wchar_t * SmartStrStr::matchHereW(const wchar_t * pString, const wchar_t * pUserString) const { | |
| 165 return this->matchHere_(pString, pUserString); | |
| 166 } | |
| 167 | |
| 168 bool SmartStrStr::equals(const char * pString, const char * pUserString) const { | |
| 169 return equals_(pString, pUserString); | |
| 170 } | |
| 171 bool SmartStrStr::equals16(const char16_t* pString, const char16_t* pUserString) const { | |
| 172 return equals_(pString, pUserString); | |
| 173 } | |
| 174 bool SmartStrStr::equalsW( const wchar_t * pString, const wchar_t * pUserString) const { | |
| 175 return equals_(pString, pUserString); | |
| 176 } | |
| 177 const char * SmartStrStr::strStrEnd(const char * pString, const char * pSubString, size_t * outFoundAt) const { | |
| 178 return strStrEnd_(pString, pSubString, outFoundAt); | |
| 179 } | |
| 180 | |
| 181 const char16_t * SmartStrStr::strStrEnd16(const char16_t * pString, const char16_t * pSubString, size_t * outFoundAt) const { | |
| 182 return strStrEnd_(pString, pSubString, outFoundAt); | |
| 183 } | |
| 184 | |
| 185 const wchar_t * SmartStrStr::strStrEndW(const wchar_t * pString, const wchar_t * pSubString, size_t * outFoundAt) const { | |
| 186 return strStrEnd_(pString, pSubString, outFoundAt); | |
| 187 } | |
| 188 | |
| 189 static bool wordBeginsHere(const char* base, size_t offset) { | |
| 190 if (offset == 0) return true; | |
| 191 for (size_t len = 1; len <= offset && len <= 6; --len) { | |
| 192 unsigned c; | |
| 193 if (pfc::utf8_decode_char(base + offset - len, c) == len) { | |
| 194 return !SmartStrStr::isWordChar(c); | |
| 195 } | |
| 196 } | |
| 197 return false; | |
| 198 } | |
| 199 | |
| 200 const char* SmartStrStr::strStrEndWord(const char* pString, const char* pSubString, size_t* outFoundAt) const { | |
| 201 size_t walk = 0; | |
| 202 for (;;) { | |
| 203 size_t foundAt = 0; | |
| 204 auto end = strStrEnd(pString + walk, pSubString, &foundAt); | |
| 205 if (end == nullptr) return nullptr; | |
| 206 foundAt += walk; | |
| 207 if (!isWordChar(end) && wordBeginsHere(pString, foundAt)) { | |
| 208 if (outFoundAt) *outFoundAt = foundAt; | |
| 209 return end; | |
| 210 } | |
| 211 walk = end - pString; | |
| 212 } | |
| 213 } | |
| 214 | |
| 215 bool SmartStrStr::matchOneChar(uint32_t cInput, uint32_t cData) const { | |
| 216 if (cInput == cData) return true; | |
| 217 auto v = m_substitutions.query_ptr(cData); | |
| 218 if (v == nullptr) return false; | |
| 219 return v->count(cInput) > 0; | |
| 220 } | |
| 221 | |
| 222 pfc::string8 SmartStrStr::transformStr(const char* str) const { | |
| 223 pfc::string8 ret; transformStrHere(ret, str); return ret; | |
| 224 } | |
| 225 | |
| 226 void SmartStrStr::transformStrHere(pfc::string8& out, const char* in) const { | |
| 227 transformStrHere(out, in, strlen(in)); | |
| 228 } | |
| 229 | |
| 230 void SmartStrStr::transformStrHere(pfc::string8& out, const char* in, size_t inLen) const { | |
| 231 out.prealloc(inLen); | |
| 232 out.clear(); | |
| 233 for (size_t walk = 0; walk < inLen; ) { | |
| 234 unsigned c; | |
| 235 size_t d = pfc::utf8_decode_char(in + walk, c); | |
| 236 if (d == 0 || walk+d>inLen) break; | |
| 237 walk += d; | |
| 238 const char* alt = m_twoCharMappings.query(c); | |
| 239 if (alt != nullptr) { | |
| 240 out << alt; continue; | |
| 241 } | |
| 242 unsigned alt2 = m_downconvert.query(c); | |
| 243 if (alt2 != 0) { | |
| 244 out.add_char(alt2); continue; | |
| 245 } | |
| 246 out.add_char(c); | |
| 247 } | |
| 248 } | |
| 249 | |
#if 0 // Windows specific code
// Converts c through WideCharToMultiByte() with the ASCII code page and returns the
// resulting single positive byte; returns c unchanged when the conversion fails, uses the
// default character, or does not produce exactly one such byte.
uint32_t SmartStrStr::Transform(uint32_t c) {
	wchar_t wide[2] = {};
	char out[4] = {};
	pfc::utf16_encode_char(c, wide);
	BOOL fail = FALSE;
	const bool converted = WideCharToMultiByte(pfc::stringcvt::codepage_ascii, 0, wide, 2, out, 4, "?", &fail) > 0;
	if (converted && !fail && out[0] > 0 && out[1] == 0) {
		return out[0];
	}
	return c;
}
#endif
| 265 | |
| 266 uint32_t SmartStrStr::ToLower(uint32_t c) { | |
| 267 return pfc::charLower(c); | |
| 268 } | |
| 269 | |
| 270 void SmartStrStr::InitTwoCharMappings() { | |
| 271 std::map<uint32_t, const char* > mappings; | |
| 272 std::map<uint32_t, uint32_t> reverse; | |
| 273 for (auto& walk : twoCharMappings) { | |
| 274 mappings[walk.from] = walk.to; | |
| 275 uint32_t c1, c2; | |
| 276 const char * p = walk.to; | |
| 277 size_t d; | |
| 278 d = pfc::utf8_decode_char(p, c1); | |
| 279 if ( d > 0 ) { | |
| 280 p += d; | |
| 281 d = pfc::utf8_decode_char(p, c2); | |
| 282 if (d > 0) { | |
| 283 if (c1 < 0x10000 && c2 < 0x10000) { | |
| 284 reverse[c1 | (c2 << 16)] = walk.from; | |
| 285 } | |
| 286 } | |
| 287 } | |
| 288 } | |
| 289 m_twoCharMappings.initialize(std::move(mappings)); | |
| 290 m_twoCharMappingsReverse.initialize(std::move(reverse)); | |
| 291 } | |
| 292 bool SmartStrStr::testSubString_prefix(const char* str, const char* sub, const char * prefix, size_t prefixLen) const { | |
| 293 | |
| 294 switch(prefixLen) { | |
| 295 case 0: | |
| 296 return false; | |
| 297 case 1: | |
| 298 for(const char * walk = str;; ) { | |
| 299 walk = strchr(walk, *prefix); | |
| 300 if ( walk == nullptr ) return false; | |
| 301 ++walk; | |
| 302 if (matchHere(walk, sub)) return true; | |
| 303 } | |
| 304 default: | |
| 305 for(const char * walk = str;; ) { | |
| 306 walk = strstr(walk, prefix); | |
| 307 if ( walk == nullptr ) return false; | |
| 308 walk += prefixLen; | |
| 309 if (matchHere(walk, sub)) return true; | |
| 310 } | |
| 311 } | |
| 312 } | |
| 313 bool SmartStrStr::testSubString_prefix(const char* str, const char* sub, uint32_t c) const { | |
| 314 size_t tempLen; | |
| 315 char temp[8]; | |
| 316 tempLen = pfc::utf8_encode_char(c, temp); temp[tempLen] = 0; | |
| 317 return testSubString_prefix(str, sub, temp, tempLen); | |
| 318 } | |
| 319 bool SmartStrStr::testSubString_prefix_subst(const char* str, const char* sub, uint32_t prefix) const { | |
| 320 if ( testSubString_prefix(str, sub, prefix)) return true; | |
| 321 | |
| 322 auto alt = m_substitutionsReverse.query_ptr( prefix ); | |
| 323 if (alt != nullptr) { | |
| 324 for (auto c : *alt) { | |
| 325 if (testSubString_prefix(str, sub, c)) return true; | |
| 326 } | |
| 327 } | |
| 328 | |
| 329 return false; | |
| 330 } | |
| 331 bool SmartStrStr::testSubstring(const char* str, const char* sub) const { | |
| 332 #if 1 | |
| 333 // optimized version for UTF-8 | |
| 334 unsigned prefix; | |
| 335 const size_t skip = pfc::uni_decode_char(sub, prefix); | |
| 336 if ( skip == 0 ) return false; | |
| 337 sub += skip; | |
| 338 | |
| 339 if (testSubString_prefix_subst(str, sub, prefix)) return true; | |
| 340 | |
| 341 unsigned prefix2; | |
| 342 const size_t skip2 = pfc::uni_decode_char(sub, prefix2); | |
| 343 if (skip2 > 0 && prefix < 0x10000 && prefix2 < 0x10000) { | |
| 344 sub += skip2; | |
| 345 auto alt = m_twoCharMappingsReverse.query(prefix | (prefix2 << 16)); | |
| 346 if (alt != 0) { | |
| 347 if (testSubString_prefix_subst(str, sub, alt)) return true; | |
| 348 } | |
| 349 } | |
| 350 | |
| 351 return false; | |
| 352 #else | |
| 353 return this->strStrEnd(str, sub) != nullptr; | |
| 354 #endif | |
| 355 } | |
| 356 bool SmartStrStr::testSubstring16(const char16_t* str, const char16_t* sub) const { | |
| 357 return this->strStrEnd16(str, sub) != nullptr; | |
| 358 } | |
| 359 bool SmartStrStr::testSubstringW( const wchar_t * str, const wchar_t * sub ) const { | |
| 360 return this->strStrEndW(str, sub) != nullptr; | |
| 361 } | |
| 362 | |
| 363 SmartStrStr& SmartStrStr::global() { | |
| 364 static SmartStrStr g; | |
| 365 return g; | |
| 366 } | |
| 367 | |
| 368 | |
| 369 void SmartStrFilter::init(const char* ptr, size_t len) { | |
| 370 pfc::string_formatter current, temp; | |
| 371 bool inQuotation = false; | |
| 372 | |
| 373 auto addCurrent = [&] { | |
| 374 if (!current.is_empty()) { | |
| 375 ++m_items[current.get_ptr()]; current.reset(); | |
| 376 } | |
| 377 }; | |
| 378 | |
| 379 for (t_size walk = 0; walk < len; ++walk) { | |
| 380 const char c = ptr[walk]; | |
| 381 if (c == '\"') inQuotation = !inQuotation; | |
| 382 else if (!inQuotation && is_spacing(c)) { | |
| 383 addCurrent(); | |
| 384 } else { | |
| 385 current.add_byte(c); | |
| 386 } | |
| 387 } | |
| 388 if (inQuotation) { | |
| 389 // Allow unbalanced quotes, take the whole string *with* quotation marks | |
| 390 m_items.clear(); | |
| 391 current.set_string_nc(ptr, len); | |
| 392 } | |
| 393 | |
| 394 addCurrent(); | |
| 395 } | |
| 396 | |
| 397 | |
| 398 bool SmartStrFilter::test_disregardCounts(const char* src) const { | |
| 399 if (m_items.empty()) return false; | |
| 400 | |
| 401 for (auto& walk : m_items) { | |
| 402 if (!dc->strStrEnd(src, walk.first.c_str())) return false; | |
| 403 } | |
| 404 return true; | |
| 405 } | |
| 406 | |
| 407 bool SmartStrFilter::testWords(const char* src) const { | |
| 408 if (m_items.empty()) return false; | |
| 409 | |
| 410 for (auto& walk : m_items) { | |
| 411 const auto count = walk.second; | |
| 412 const auto& str = walk.first; | |
| 413 const auto* strWalk = src; | |
| 414 for (size_t i = 0; i < count; ++i) { | |
| 415 auto next = dc->strStrEndWord(strWalk, str.c_str()); | |
| 416 if (next == nullptr) return false; | |
| 417 strWalk = next; | |
| 418 } | |
| 419 } | |
| 420 return true; | |
| 421 } | |
| 422 | |
| 423 bool SmartStrFilter::test(const char* src) const { | |
| 424 | |
| 425 if (m_items.empty()) return false; | |
| 426 | |
| 427 // Use the faster routine first, it can't be used to count occurances but nobody really knows about this feature | |
| 428 for (auto& walk : m_items) { | |
| 429 if (!dc->testSubstring(src, walk.first.c_str())) return false; | |
| 430 } | |
| 431 // Have any items where specific number of occurances is wanted? | |
| 432 for (auto & walk : m_items) { | |
| 433 const auto count = walk.second; | |
| 434 if (count == 1) continue; | |
| 435 const auto& str = walk.first; | |
| 436 const auto* strWalk = src; | |
| 437 for (size_t i = 0; i < count; ++i) { | |
| 438 auto next = dc->strStrEnd(strWalk, str.c_str()); | |
| 439 if (next == nullptr) return false; | |
| 440 strWalk = next; | |
| 441 } | |
| 442 } | |
| 443 return true; | |
| 444 } |
