comparison foosdk/sdk/pfc/SmartStrStr.cpp @ 1:20d02a178406 default tip

*: check in everything else yay
author Paper <paper@tflc.us>
date Mon, 05 Jan 2026 02:15:46 -0500
parents
children
comparison
equal deleted inserted replaced
0:e9bb126753e7 1:20d02a178406
1 #include "pfc-lite.h"
2
3 #include "string-conv-lite.h"
4 #include "string_conv.h"
5 #include "SmartStrStr.h"
6 #include <algorithm>
7 #include "SmartStrStr-table.h"
8 #include "SmartStrStr-twoCharMappings.h"
9
10 bool SmartStrStr::isWordChar(unsigned c) {
11 // FIX ME map Unicode ranges somehow
12 return c >= 128 || pfc::char_is_ascii_alphanumeric((char)c);
13 }
14
15 bool SmartStrStr::isWordChar(const char* ptr) {
16 unsigned c;
17 size_t d = pfc::utf8_decode_char(ptr, c);
18 if (d == 0) return false; // bad UTF-8
19 return isWordChar(c);
20 }
21
22 bool SmartStrStr::isValidWord(const char* ptr) {
23 if (*ptr == 0) return false;
24 do {
25 unsigned c;
26 size_t d = pfc::utf8_decode_char(ptr, c);
27 if (d == 0) return false; // bad UTF-8
28 if (!isWordChar(c)) return false;
29 ptr += d;
30 } while (*ptr != 0);
31 return true;
32 }
33
34 void SmartStrStr::findWords(const char* str, std::function<void(pfc::string_part_ref)> cb) {
35 size_t base = 0, walk = 0;
36 for (;; ) {
37 unsigned c = 0;
38 size_t d = pfc::utf8_decode_char(str + walk, c);
39 if (d == 0) break;
40
41 if (!SmartStrStr::isWordChar(c)) {
42 if (walk > base) {
43 cb(pfc::string_part(str + base, walk - base));
44 }
45 base = walk + d;
46 }
47 walk += d;
48 }
49 if (walk > base) {
50 cb(pfc::string_part(str + base, walk - base));
51 }
52 }
53
54 SmartStrStr::SmartStrStr() {
55 std::map<uint32_t, std::set<uint32_t> > substitutions, substitutionsReverse;
56 std::map<uint32_t, uint32_t > downconvert;
57
58 #if 1
59 for (auto& walk : SmartStrStrTable) {
60 downconvert[walk.from] = walk.to;
61 substitutions[walk.from].insert(walk.to);
62 }
63 #else
64 for (uint32_t walk = 128; walk < 0x10000; ++walk) {
65 uint32_t c = Transform(walk);
66 if (c != walk) {
67 downconvert[walk] = c;
68 substitutions[walk].insert(c);
69 }
70 }
71 #endif
72
73 for (uint32_t walk = 32; walk < 0x10000; ++walk) {
74 auto lo = ToLower(walk);
75 if (lo != walk) {
76 auto & s = substitutions[walk]; s.insert(lo);
77
78 auto iter = substitutions.find(lo);
79 if (iter != substitutions.end()) {
80 s.insert(iter->second.begin(), iter->second.end());
81 }
82 }
83 }
84
85 for( auto & walk : substitutions ) {
86 for( auto & walk2 : walk.second ) {
87 substitutionsReverse[walk2].insert(walk.first);
88 }
89 }
90
91 this->m_substitutions.initialize(std::move(substitutions));
92 this->m_substitutionsReverse.initialize(std::move(substitutionsReverse));
93 this->m_downconvert.initialize(std::move(downconvert));
94 InitTwoCharMappings();
95 }
96
97 // == TEMPLATES ==
98 template<typename char_t> const char_t * SmartStrStr::matchHere_(const char_t * pString, const char_t * pUserString) const {
99 auto walkData = pString;
100 auto walkUser = pUserString;
101 for (;; ) {
102 if (*walkUser == 0) return walkData;
103
104 uint32_t cData, cUser;
105 size_t dData = pfc::uni_decode_char(walkData, cData);
106 size_t dUser = pfc::uni_decode_char(walkUser, cUser);
107 if (dData == 0 || dUser == 0) return nullptr;
108
109 if (cData != cUser) {
110 bool gotMulti = false;
111 {
112 const char * cDataSubst = m_twoCharMappings.query(cData);
113 if (cDataSubst != nullptr) {
114 PFC_ASSERT(strlen(cDataSubst) == 2);
115 if (matchOneChar(cUser, (uint32_t)cDataSubst[0])) {
116 auto walkUser2 = walkUser + dUser;
117 uint32_t cUser2;
118 auto dUser2 = pfc::uni_decode_char(walkUser2, cUser2);
119 if (matchOneChar(cUser2, (uint32_t)cDataSubst[1])) {
120 gotMulti = true;
121 dUser += dUser2;
122 }
123 }
124 }
125 }
126 if (!gotMulti) {
127 if (!matchOneChar(cUser, cData)) return nullptr;
128 }
129 }
130
131 walkData += dData;
132 walkUser += dUser;
133 }
134 }
135 template<typename char_t> bool SmartStrStr::equals_( const char_t * pString, const char_t * pUserString) const {
136 auto p = this->matchHere_(pString, pUserString);
137 if ( p == nullptr ) return false;
138 return *p == 0;
139 }
140
141 template<typename char_t> const char_t * SmartStrStr::strStrEnd_(const char_t * pString, const char_t * pSubString, size_t * outFoundAt) const {
142 size_t walk = 0;
143 for (;; ) {
144 if (pString[walk] == 0) return nullptr;
145 auto end = matchHere_(pString + walk, pSubString);
146 if (end != nullptr) {
147 if (outFoundAt != nullptr) * outFoundAt = walk;
148 return end;
149 }
150
151 size_t delta = pfc::uni_char_length(pString + walk);
152 if (delta == 0) return nullptr;
153 walk += delta;
154 }
155 }
156 // == END TEMPLATES ==
157
158 const char16_t * SmartStrStr::matchHere16(const char16_t * pString, const char16_t * pUserString) const {
159 return this->matchHere_(pString, pUserString);
160 }
161 const char * SmartStrStr::matchHere(const char * pString, const char * pUserString) const {
162 return this->matchHere_(pString, pUserString);
163 }
164 const wchar_t * SmartStrStr::matchHereW(const wchar_t * pString, const wchar_t * pUserString) const {
165 return this->matchHere_(pString, pUserString);
166 }
167
168 bool SmartStrStr::equals(const char * pString, const char * pUserString) const {
169 return equals_(pString, pUserString);
170 }
171 bool SmartStrStr::equals16(const char16_t* pString, const char16_t* pUserString) const {
172 return equals_(pString, pUserString);
173 }
174 bool SmartStrStr::equalsW( const wchar_t * pString, const wchar_t * pUserString) const {
175 return equals_(pString, pUserString);
176 }
177 const char * SmartStrStr::strStrEnd(const char * pString, const char * pSubString, size_t * outFoundAt) const {
178 return strStrEnd_(pString, pSubString, outFoundAt);
179 }
180
181 const char16_t * SmartStrStr::strStrEnd16(const char16_t * pString, const char16_t * pSubString, size_t * outFoundAt) const {
182 return strStrEnd_(pString, pSubString, outFoundAt);
183 }
184
185 const wchar_t * SmartStrStr::strStrEndW(const wchar_t * pString, const wchar_t * pSubString, size_t * outFoundAt) const {
186 return strStrEnd_(pString, pSubString, outFoundAt);
187 }
188
189 static bool wordBeginsHere(const char* base, size_t offset) {
190 if (offset == 0) return true;
191 for (size_t len = 1; len <= offset && len <= 6; --len) {
192 unsigned c;
193 if (pfc::utf8_decode_char(base + offset - len, c) == len) {
194 return !SmartStrStr::isWordChar(c);
195 }
196 }
197 return false;
198 }
199
200 const char* SmartStrStr::strStrEndWord(const char* pString, const char* pSubString, size_t* outFoundAt) const {
201 size_t walk = 0;
202 for (;;) {
203 size_t foundAt = 0;
204 auto end = strStrEnd(pString + walk, pSubString, &foundAt);
205 if (end == nullptr) return nullptr;
206 foundAt += walk;
207 if (!isWordChar(end) && wordBeginsHere(pString, foundAt)) {
208 if (outFoundAt) *outFoundAt = foundAt;
209 return end;
210 }
211 walk = end - pString;
212 }
213 }
214
215 bool SmartStrStr::matchOneChar(uint32_t cInput, uint32_t cData) const {
216 if (cInput == cData) return true;
217 auto v = m_substitutions.query_ptr(cData);
218 if (v == nullptr) return false;
219 return v->count(cInput) > 0;
220 }
221
222 pfc::string8 SmartStrStr::transformStr(const char* str) const {
223 pfc::string8 ret; transformStrHere(ret, str); return ret;
224 }
225
226 void SmartStrStr::transformStrHere(pfc::string8& out, const char* in) const {
227 transformStrHere(out, in, strlen(in));
228 }
229
230 void SmartStrStr::transformStrHere(pfc::string8& out, const char* in, size_t inLen) const {
231 out.prealloc(inLen);
232 out.clear();
233 for (size_t walk = 0; walk < inLen; ) {
234 unsigned c;
235 size_t d = pfc::utf8_decode_char(in + walk, c);
236 if (d == 0 || walk+d>inLen) break;
237 walk += d;
238 const char* alt = m_twoCharMappings.query(c);
239 if (alt != nullptr) {
240 out << alt; continue;
241 }
242 unsigned alt2 = m_downconvert.query(c);
243 if (alt2 != 0) {
244 out.add_char(alt2); continue;
245 }
246 out.add_char(c);
247 }
248 }
249
#if 0 // Windows specific code
// Disabled. Converts c to UTF-16 and asks WideCharToMultiByte for an ASCII
// rendition; if that succeeds without the failure flag being set and yields
// exactly one positive byte, that byte is returned, otherwise c is returned
// unchanged.
uint32_t SmartStrStr::Transform(uint32_t c) {
	wchar_t wide[2] = {};
	char narrow[4] = {};
	pfc::utf16_encode_char(c, wide);
	BOOL failed = FALSE;
	const bool converted = WideCharToMultiByte(pfc::stringcvt::codepage_ascii, 0, wide, 2, narrow, 4, "?", &failed) > 0;
	if (converted && !failed && narrow[0] > 0 && narrow[1] == 0) {
		c = narrow[0];
	}
	return c;
}
#endif
265
266 uint32_t SmartStrStr::ToLower(uint32_t c) {
267 return pfc::charLower(c);
268 }
269
270 void SmartStrStr::InitTwoCharMappings() {
271 std::map<uint32_t, const char* > mappings;
272 std::map<uint32_t, uint32_t> reverse;
273 for (auto& walk : twoCharMappings) {
274 mappings[walk.from] = walk.to;
275 uint32_t c1, c2;
276 const char * p = walk.to;
277 size_t d;
278 d = pfc::utf8_decode_char(p, c1);
279 if ( d > 0 ) {
280 p += d;
281 d = pfc::utf8_decode_char(p, c2);
282 if (d > 0) {
283 if (c1 < 0x10000 && c2 < 0x10000) {
284 reverse[c1 | (c2 << 16)] = walk.from;
285 }
286 }
287 }
288 }
289 m_twoCharMappings.initialize(std::move(mappings));
290 m_twoCharMappingsReverse.initialize(std::move(reverse));
291 }
292 bool SmartStrStr::testSubString_prefix(const char* str, const char* sub, const char * prefix, size_t prefixLen) const {
293
294 switch(prefixLen) {
295 case 0:
296 return false;
297 case 1:
298 for(const char * walk = str;; ) {
299 walk = strchr(walk, *prefix);
300 if ( walk == nullptr ) return false;
301 ++walk;
302 if (matchHere(walk, sub)) return true;
303 }
304 default:
305 for(const char * walk = str;; ) {
306 walk = strstr(walk, prefix);
307 if ( walk == nullptr ) return false;
308 walk += prefixLen;
309 if (matchHere(walk, sub)) return true;
310 }
311 }
312 }
313 bool SmartStrStr::testSubString_prefix(const char* str, const char* sub, uint32_t c) const {
314 size_t tempLen;
315 char temp[8];
316 tempLen = pfc::utf8_encode_char(c, temp); temp[tempLen] = 0;
317 return testSubString_prefix(str, sub, temp, tempLen);
318 }
319 bool SmartStrStr::testSubString_prefix_subst(const char* str, const char* sub, uint32_t prefix) const {
320 if ( testSubString_prefix(str, sub, prefix)) return true;
321
322 auto alt = m_substitutionsReverse.query_ptr( prefix );
323 if (alt != nullptr) {
324 for (auto c : *alt) {
325 if (testSubString_prefix(str, sub, c)) return true;
326 }
327 }
328
329 return false;
330 }
331 bool SmartStrStr::testSubstring(const char* str, const char* sub) const {
332 #if 1
333 // optimized version for UTF-8
334 unsigned prefix;
335 const size_t skip = pfc::uni_decode_char(sub, prefix);
336 if ( skip == 0 ) return false;
337 sub += skip;
338
339 if (testSubString_prefix_subst(str, sub, prefix)) return true;
340
341 unsigned prefix2;
342 const size_t skip2 = pfc::uni_decode_char(sub, prefix2);
343 if (skip2 > 0 && prefix < 0x10000 && prefix2 < 0x10000) {
344 sub += skip2;
345 auto alt = m_twoCharMappingsReverse.query(prefix | (prefix2 << 16));
346 if (alt != 0) {
347 if (testSubString_prefix_subst(str, sub, alt)) return true;
348 }
349 }
350
351 return false;
352 #else
353 return this->strStrEnd(str, sub) != nullptr;
354 #endif
355 }
356 bool SmartStrStr::testSubstring16(const char16_t* str, const char16_t* sub) const {
357 return this->strStrEnd16(str, sub) != nullptr;
358 }
359 bool SmartStrStr::testSubstringW( const wchar_t * str, const wchar_t * sub ) const {
360 return this->strStrEndW(str, sub) != nullptr;
361 }
362
363 SmartStrStr& SmartStrStr::global() {
364 static SmartStrStr g;
365 return g;
366 }
367
368
369 void SmartStrFilter::init(const char* ptr, size_t len) {
370 pfc::string_formatter current, temp;
371 bool inQuotation = false;
372
373 auto addCurrent = [&] {
374 if (!current.is_empty()) {
375 ++m_items[current.get_ptr()]; current.reset();
376 }
377 };
378
379 for (t_size walk = 0; walk < len; ++walk) {
380 const char c = ptr[walk];
381 if (c == '\"') inQuotation = !inQuotation;
382 else if (!inQuotation && is_spacing(c)) {
383 addCurrent();
384 } else {
385 current.add_byte(c);
386 }
387 }
388 if (inQuotation) {
389 // Allow unbalanced quotes, take the whole string *with* quotation marks
390 m_items.clear();
391 current.set_string_nc(ptr, len);
392 }
393
394 addCurrent();
395 }
396
397
398 bool SmartStrFilter::test_disregardCounts(const char* src) const {
399 if (m_items.empty()) return false;
400
401 for (auto& walk : m_items) {
402 if (!dc->strStrEnd(src, walk.first.c_str())) return false;
403 }
404 return true;
405 }
406
407 bool SmartStrFilter::testWords(const char* src) const {
408 if (m_items.empty()) return false;
409
410 for (auto& walk : m_items) {
411 const auto count = walk.second;
412 const auto& str = walk.first;
413 const auto* strWalk = src;
414 for (size_t i = 0; i < count; ++i) {
415 auto next = dc->strStrEndWord(strWalk, str.c_str());
416 if (next == nullptr) return false;
417 strWalk = next;
418 }
419 }
420 return true;
421 }
422
423 bool SmartStrFilter::test(const char* src) const {
424
425 if (m_items.empty()) return false;
426
427 // Use the faster routine first, it can't be used to count occurances but nobody really knows about this feature
428 for (auto& walk : m_items) {
429 if (!dc->testSubstring(src, walk.first.c_str())) return false;
430 }
431 // Have any items where specific number of occurances is wanted?
432 for (auto & walk : m_items) {
433 const auto count = walk.second;
434 if (count == 1) continue;
435 const auto& str = walk.first;
436 const auto* strWalk = src;
437 for (size_t i = 0; i < count; ++i) {
438 auto next = dc->strStrEnd(strWalk, str.c_str());
439 if (next == nullptr) return false;
440 strWalk = next;
441 }
442 }
443 return true;
444 }