|
1
|
1 #include "pfc-lite.h"
|
|
|
2
|
|
|
3 #include "string-conv-lite.h"
|
|
|
4 #include "string_conv.h"
|
|
|
5 #include "SmartStrStr.h"
|
|
|
6 #include <algorithm>
|
|
|
7 #include "SmartStrStr-table.h"
|
|
|
8 #include "SmartStrStr-twoCharMappings.h"
|
|
|
9
|
|
|
10 bool SmartStrStr::isWordChar(unsigned c) {
|
|
|
11 // FIX ME map Unicode ranges somehow
|
|
|
12 return c >= 128 || pfc::char_is_ascii_alphanumeric((char)c);
|
|
|
13 }
|
|
|
14
|
|
|
15 bool SmartStrStr::isWordChar(const char* ptr) {
|
|
|
16 unsigned c;
|
|
|
17 size_t d = pfc::utf8_decode_char(ptr, c);
|
|
|
18 if (d == 0) return false; // bad UTF-8
|
|
|
19 return isWordChar(c);
|
|
|
20 }
|
|
|
21
|
|
|
22 bool SmartStrStr::isValidWord(const char* ptr) {
|
|
|
23 if (*ptr == 0) return false;
|
|
|
24 do {
|
|
|
25 unsigned c;
|
|
|
26 size_t d = pfc::utf8_decode_char(ptr, c);
|
|
|
27 if (d == 0) return false; // bad UTF-8
|
|
|
28 if (!isWordChar(c)) return false;
|
|
|
29 ptr += d;
|
|
|
30 } while (*ptr != 0);
|
|
|
31 return true;
|
|
|
32 }
|
|
|
33
|
|
|
34 void SmartStrStr::findWords(const char* str, std::function<void(pfc::string_part_ref)> cb) {
|
|
|
35 size_t base = 0, walk = 0;
|
|
|
36 for (;; ) {
|
|
|
37 unsigned c = 0;
|
|
|
38 size_t d = pfc::utf8_decode_char(str + walk, c);
|
|
|
39 if (d == 0) break;
|
|
|
40
|
|
|
41 if (!SmartStrStr::isWordChar(c)) {
|
|
|
42 if (walk > base) {
|
|
|
43 cb(pfc::string_part(str + base, walk - base));
|
|
|
44 }
|
|
|
45 base = walk + d;
|
|
|
46 }
|
|
|
47 walk += d;
|
|
|
48 }
|
|
|
49 if (walk > base) {
|
|
|
50 cb(pfc::string_part(str + base, walk - base));
|
|
|
51 }
|
|
|
52 }
|
|
|
53
|
|
|
54 SmartStrStr::SmartStrStr() {
|
|
|
55 std::map<uint32_t, std::set<uint32_t> > substitutions, substitutionsReverse;
|
|
|
56 std::map<uint32_t, uint32_t > downconvert;
|
|
|
57
|
|
|
58 #if 1
|
|
|
59 for (auto& walk : SmartStrStrTable) {
|
|
|
60 downconvert[walk.from] = walk.to;
|
|
|
61 substitutions[walk.from].insert(walk.to);
|
|
|
62 }
|
|
|
63 #else
|
|
|
64 for (uint32_t walk = 128; walk < 0x10000; ++walk) {
|
|
|
65 uint32_t c = Transform(walk);
|
|
|
66 if (c != walk) {
|
|
|
67 downconvert[walk] = c;
|
|
|
68 substitutions[walk].insert(c);
|
|
|
69 }
|
|
|
70 }
|
|
|
71 #endif
|
|
|
72
|
|
|
73 for (uint32_t walk = 32; walk < 0x10000; ++walk) {
|
|
|
74 auto lo = ToLower(walk);
|
|
|
75 if (lo != walk) {
|
|
|
76 auto & s = substitutions[walk]; s.insert(lo);
|
|
|
77
|
|
|
78 auto iter = substitutions.find(lo);
|
|
|
79 if (iter != substitutions.end()) {
|
|
|
80 s.insert(iter->second.begin(), iter->second.end());
|
|
|
81 }
|
|
|
82 }
|
|
|
83 }
|
|
|
84
|
|
|
85 for( auto & walk : substitutions ) {
|
|
|
86 for( auto & walk2 : walk.second ) {
|
|
|
87 substitutionsReverse[walk2].insert(walk.first);
|
|
|
88 }
|
|
|
89 }
|
|
|
90
|
|
|
91 this->m_substitutions.initialize(std::move(substitutions));
|
|
|
92 this->m_substitutionsReverse.initialize(std::move(substitutionsReverse));
|
|
|
93 this->m_downconvert.initialize(std::move(downconvert));
|
|
|
94 InitTwoCharMappings();
|
|
|
95 }
|
|
|
96
|
|
|
97 // == TEMPLATES ==
|
|
|
98 template<typename char_t> const char_t * SmartStrStr::matchHere_(const char_t * pString, const char_t * pUserString) const {
|
|
|
99 auto walkData = pString;
|
|
|
100 auto walkUser = pUserString;
|
|
|
101 for (;; ) {
|
|
|
102 if (*walkUser == 0) return walkData;
|
|
|
103
|
|
|
104 uint32_t cData, cUser;
|
|
|
105 size_t dData = pfc::uni_decode_char(walkData, cData);
|
|
|
106 size_t dUser = pfc::uni_decode_char(walkUser, cUser);
|
|
|
107 if (dData == 0 || dUser == 0) return nullptr;
|
|
|
108
|
|
|
109 if (cData != cUser) {
|
|
|
110 bool gotMulti = false;
|
|
|
111 {
|
|
|
112 const char * cDataSubst = m_twoCharMappings.query(cData);
|
|
|
113 if (cDataSubst != nullptr) {
|
|
|
114 PFC_ASSERT(strlen(cDataSubst) == 2);
|
|
|
115 if (matchOneChar(cUser, (uint32_t)cDataSubst[0])) {
|
|
|
116 auto walkUser2 = walkUser + dUser;
|
|
|
117 uint32_t cUser2;
|
|
|
118 auto dUser2 = pfc::uni_decode_char(walkUser2, cUser2);
|
|
|
119 if (matchOneChar(cUser2, (uint32_t)cDataSubst[1])) {
|
|
|
120 gotMulti = true;
|
|
|
121 dUser += dUser2;
|
|
|
122 }
|
|
|
123 }
|
|
|
124 }
|
|
|
125 }
|
|
|
126 if (!gotMulti) {
|
|
|
127 if (!matchOneChar(cUser, cData)) return nullptr;
|
|
|
128 }
|
|
|
129 }
|
|
|
130
|
|
|
131 walkData += dData;
|
|
|
132 walkUser += dUser;
|
|
|
133 }
|
|
|
134 }
|
|
|
135 template<typename char_t> bool SmartStrStr::equals_( const char_t * pString, const char_t * pUserString) const {
|
|
|
136 auto p = this->matchHere_(pString, pUserString);
|
|
|
137 if ( p == nullptr ) return false;
|
|
|
138 return *p == 0;
|
|
|
139 }
|
|
|
140
|
|
|
141 template<typename char_t> const char_t * SmartStrStr::strStrEnd_(const char_t * pString, const char_t * pSubString, size_t * outFoundAt) const {
|
|
|
142 size_t walk = 0;
|
|
|
143 for (;; ) {
|
|
|
144 if (pString[walk] == 0) return nullptr;
|
|
|
145 auto end = matchHere_(pString + walk, pSubString);
|
|
|
146 if (end != nullptr) {
|
|
|
147 if (outFoundAt != nullptr) * outFoundAt = walk;
|
|
|
148 return end;
|
|
|
149 }
|
|
|
150
|
|
|
151 size_t delta = pfc::uni_char_length(pString + walk);
|
|
|
152 if (delta == 0) return nullptr;
|
|
|
153 walk += delta;
|
|
|
154 }
|
|
|
155 }
|
|
|
156 // == END TEMPLATES ==
|
|
|
157
|
|
|
158 const char16_t * SmartStrStr::matchHere16(const char16_t * pString, const char16_t * pUserString) const {
|
|
|
159 return this->matchHere_(pString, pUserString);
|
|
|
160 }
|
|
|
161 const char * SmartStrStr::matchHere(const char * pString, const char * pUserString) const {
|
|
|
162 return this->matchHere_(pString, pUserString);
|
|
|
163 }
|
|
|
164 const wchar_t * SmartStrStr::matchHereW(const wchar_t * pString, const wchar_t * pUserString) const {
|
|
|
165 return this->matchHere_(pString, pUserString);
|
|
|
166 }
|
|
|
167
|
|
|
168 bool SmartStrStr::equals(const char * pString, const char * pUserString) const {
|
|
|
169 return equals_(pString, pUserString);
|
|
|
170 }
|
|
|
171 bool SmartStrStr::equals16(const char16_t* pString, const char16_t* pUserString) const {
|
|
|
172 return equals_(pString, pUserString);
|
|
|
173 }
|
|
|
174 bool SmartStrStr::equalsW( const wchar_t * pString, const wchar_t * pUserString) const {
|
|
|
175 return equals_(pString, pUserString);
|
|
|
176 }
|
|
|
177 const char * SmartStrStr::strStrEnd(const char * pString, const char * pSubString, size_t * outFoundAt) const {
|
|
|
178 return strStrEnd_(pString, pSubString, outFoundAt);
|
|
|
179 }
|
|
|
180
|
|
|
181 const char16_t * SmartStrStr::strStrEnd16(const char16_t * pString, const char16_t * pSubString, size_t * outFoundAt) const {
|
|
|
182 return strStrEnd_(pString, pSubString, outFoundAt);
|
|
|
183 }
|
|
|
184
|
|
|
185 const wchar_t * SmartStrStr::strStrEndW(const wchar_t * pString, const wchar_t * pSubString, size_t * outFoundAt) const {
|
|
|
186 return strStrEnd_(pString, pSubString, outFoundAt);
|
|
|
187 }
|
|
|
188
|
|
|
189 static bool wordBeginsHere(const char* base, size_t offset) {
|
|
|
190 if (offset == 0) return true;
|
|
|
191 for (size_t len = 1; len <= offset && len <= 6; --len) {
|
|
|
192 unsigned c;
|
|
|
193 if (pfc::utf8_decode_char(base + offset - len, c) == len) {
|
|
|
194 return !SmartStrStr::isWordChar(c);
|
|
|
195 }
|
|
|
196 }
|
|
|
197 return false;
|
|
|
198 }
|
|
|
199
|
|
|
200 const char* SmartStrStr::strStrEndWord(const char* pString, const char* pSubString, size_t* outFoundAt) const {
|
|
|
201 size_t walk = 0;
|
|
|
202 for (;;) {
|
|
|
203 size_t foundAt = 0;
|
|
|
204 auto end = strStrEnd(pString + walk, pSubString, &foundAt);
|
|
|
205 if (end == nullptr) return nullptr;
|
|
|
206 foundAt += walk;
|
|
|
207 if (!isWordChar(end) && wordBeginsHere(pString, foundAt)) {
|
|
|
208 if (outFoundAt) *outFoundAt = foundAt;
|
|
|
209 return end;
|
|
|
210 }
|
|
|
211 walk = end - pString;
|
|
|
212 }
|
|
|
213 }
|
|
|
214
|
|
|
215 bool SmartStrStr::matchOneChar(uint32_t cInput, uint32_t cData) const {
|
|
|
216 if (cInput == cData) return true;
|
|
|
217 auto v = m_substitutions.query_ptr(cData);
|
|
|
218 if (v == nullptr) return false;
|
|
|
219 return v->count(cInput) > 0;
|
|
|
220 }
|
|
|
221
|
|
|
222 pfc::string8 SmartStrStr::transformStr(const char* str) const {
|
|
|
223 pfc::string8 ret; transformStrHere(ret, str); return ret;
|
|
|
224 }
|
|
|
225
|
|
|
226 void SmartStrStr::transformStrHere(pfc::string8& out, const char* in) const {
|
|
|
227 transformStrHere(out, in, strlen(in));
|
|
|
228 }
|
|
|
229
|
|
|
230 void SmartStrStr::transformStrHere(pfc::string8& out, const char* in, size_t inLen) const {
|
|
|
231 out.prealloc(inLen);
|
|
|
232 out.clear();
|
|
|
233 for (size_t walk = 0; walk < inLen; ) {
|
|
|
234 unsigned c;
|
|
|
235 size_t d = pfc::utf8_decode_char(in + walk, c);
|
|
|
236 if (d == 0 || walk+d>inLen) break;
|
|
|
237 walk += d;
|
|
|
238 const char* alt = m_twoCharMappings.query(c);
|
|
|
239 if (alt != nullptr) {
|
|
|
240 out << alt; continue;
|
|
|
241 }
|
|
|
242 unsigned alt2 = m_downconvert.query(c);
|
|
|
243 if (alt2 != 0) {
|
|
|
244 out.add_char(alt2); continue;
|
|
|
245 }
|
|
|
246 out.add_char(c);
|
|
|
247 }
|
|
|
248 }
|
|
|
249
|
|
|
#if 0 // Windows specific code
// Asks WideCharToMultiByte() to convert `c` to the ASCII code page. If the
// conversion succeeds without the default character and yields exactly one
// positive char, that char is returned; otherwise `c` is returned unchanged.
uint32_t SmartStrStr::Transform(uint32_t c) {
    wchar_t wide[2] = {};
    char narrow[4] = {};
    pfc::utf16_encode_char(c, wide);
    BOOL usedDefault = FALSE;
    const int written = WideCharToMultiByte(pfc::stringcvt::codepage_ascii, 0, wide, 2, narrow, 4, "?", &usedDefault);
    if (written > 0 && !usedDefault && narrow[0] > 0 && narrow[1] == 0) {
        return narrow[0];
    }
    return c;
}
#endif
|
|
|
265
|
|
|
266 uint32_t SmartStrStr::ToLower(uint32_t c) {
|
|
|
267 return pfc::charLower(c);
|
|
|
268 }
|
|
|
269
|
|
|
270 void SmartStrStr::InitTwoCharMappings() {
|
|
|
271 std::map<uint32_t, const char* > mappings;
|
|
|
272 std::map<uint32_t, uint32_t> reverse;
|
|
|
273 for (auto& walk : twoCharMappings) {
|
|
|
274 mappings[walk.from] = walk.to;
|
|
|
275 uint32_t c1, c2;
|
|
|
276 const char * p = walk.to;
|
|
|
277 size_t d;
|
|
|
278 d = pfc::utf8_decode_char(p, c1);
|
|
|
279 if ( d > 0 ) {
|
|
|
280 p += d;
|
|
|
281 d = pfc::utf8_decode_char(p, c2);
|
|
|
282 if (d > 0) {
|
|
|
283 if (c1 < 0x10000 && c2 < 0x10000) {
|
|
|
284 reverse[c1 | (c2 << 16)] = walk.from;
|
|
|
285 }
|
|
|
286 }
|
|
|
287 }
|
|
|
288 }
|
|
|
289 m_twoCharMappings.initialize(std::move(mappings));
|
|
|
290 m_twoCharMappingsReverse.initialize(std::move(reverse));
|
|
|
291 }
|
|
|
292 bool SmartStrStr::testSubString_prefix(const char* str, const char* sub, const char * prefix, size_t prefixLen) const {
|
|
|
293
|
|
|
294 switch(prefixLen) {
|
|
|
295 case 0:
|
|
|
296 return false;
|
|
|
297 case 1:
|
|
|
298 for(const char * walk = str;; ) {
|
|
|
299 walk = strchr(walk, *prefix);
|
|
|
300 if ( walk == nullptr ) return false;
|
|
|
301 ++walk;
|
|
|
302 if (matchHere(walk, sub)) return true;
|
|
|
303 }
|
|
|
304 default:
|
|
|
305 for(const char * walk = str;; ) {
|
|
|
306 walk = strstr(walk, prefix);
|
|
|
307 if ( walk == nullptr ) return false;
|
|
|
308 walk += prefixLen;
|
|
|
309 if (matchHere(walk, sub)) return true;
|
|
|
310 }
|
|
|
311 }
|
|
|
312 }
|
|
|
313 bool SmartStrStr::testSubString_prefix(const char* str, const char* sub, uint32_t c) const {
|
|
|
314 size_t tempLen;
|
|
|
315 char temp[8];
|
|
|
316 tempLen = pfc::utf8_encode_char(c, temp); temp[tempLen] = 0;
|
|
|
317 return testSubString_prefix(str, sub, temp, tempLen);
|
|
|
318 }
|
|
|
319 bool SmartStrStr::testSubString_prefix_subst(const char* str, const char* sub, uint32_t prefix) const {
|
|
|
320 if ( testSubString_prefix(str, sub, prefix)) return true;
|
|
|
321
|
|
|
322 auto alt = m_substitutionsReverse.query_ptr( prefix );
|
|
|
323 if (alt != nullptr) {
|
|
|
324 for (auto c : *alt) {
|
|
|
325 if (testSubString_prefix(str, sub, c)) return true;
|
|
|
326 }
|
|
|
327 }
|
|
|
328
|
|
|
329 return false;
|
|
|
330 }
|
|
|
331 bool SmartStrStr::testSubstring(const char* str, const char* sub) const {
|
|
|
332 #if 1
|
|
|
333 // optimized version for UTF-8
|
|
|
334 unsigned prefix;
|
|
|
335 const size_t skip = pfc::uni_decode_char(sub, prefix);
|
|
|
336 if ( skip == 0 ) return false;
|
|
|
337 sub += skip;
|
|
|
338
|
|
|
339 if (testSubString_prefix_subst(str, sub, prefix)) return true;
|
|
|
340
|
|
|
341 unsigned prefix2;
|
|
|
342 const size_t skip2 = pfc::uni_decode_char(sub, prefix2);
|
|
|
343 if (skip2 > 0 && prefix < 0x10000 && prefix2 < 0x10000) {
|
|
|
344 sub += skip2;
|
|
|
345 auto alt = m_twoCharMappingsReverse.query(prefix | (prefix2 << 16));
|
|
|
346 if (alt != 0) {
|
|
|
347 if (testSubString_prefix_subst(str, sub, alt)) return true;
|
|
|
348 }
|
|
|
349 }
|
|
|
350
|
|
|
351 return false;
|
|
|
352 #else
|
|
|
353 return this->strStrEnd(str, sub) != nullptr;
|
|
|
354 #endif
|
|
|
355 }
|
|
|
356 bool SmartStrStr::testSubstring16(const char16_t* str, const char16_t* sub) const {
|
|
|
357 return this->strStrEnd16(str, sub) != nullptr;
|
|
|
358 }
|
|
|
359 bool SmartStrStr::testSubstringW( const wchar_t * str, const wchar_t * sub ) const {
|
|
|
360 return this->strStrEndW(str, sub) != nullptr;
|
|
|
361 }
|
|
|
362
|
|
|
363 SmartStrStr& SmartStrStr::global() {
|
|
|
364 static SmartStrStr g;
|
|
|
365 return g;
|
|
|
366 }
|
|
|
367
|
|
|
368
|
|
|
369 void SmartStrFilter::init(const char* ptr, size_t len) {
|
|
|
370 pfc::string_formatter current, temp;
|
|
|
371 bool inQuotation = false;
|
|
|
372
|
|
|
373 auto addCurrent = [&] {
|
|
|
374 if (!current.is_empty()) {
|
|
|
375 ++m_items[current.get_ptr()]; current.reset();
|
|
|
376 }
|
|
|
377 };
|
|
|
378
|
|
|
379 for (t_size walk = 0; walk < len; ++walk) {
|
|
|
380 const char c = ptr[walk];
|
|
|
381 if (c == '\"') inQuotation = !inQuotation;
|
|
|
382 else if (!inQuotation && is_spacing(c)) {
|
|
|
383 addCurrent();
|
|
|
384 } else {
|
|
|
385 current.add_byte(c);
|
|
|
386 }
|
|
|
387 }
|
|
|
388 if (inQuotation) {
|
|
|
389 // Allow unbalanced quotes, take the whole string *with* quotation marks
|
|
|
390 m_items.clear();
|
|
|
391 current.set_string_nc(ptr, len);
|
|
|
392 }
|
|
|
393
|
|
|
394 addCurrent();
|
|
|
395 }
|
|
|
396
|
|
|
397
|
|
|
398 bool SmartStrFilter::test_disregardCounts(const char* src) const {
|
|
|
399 if (m_items.empty()) return false;
|
|
|
400
|
|
|
401 for (auto& walk : m_items) {
|
|
|
402 if (!dc->strStrEnd(src, walk.first.c_str())) return false;
|
|
|
403 }
|
|
|
404 return true;
|
|
|
405 }
|
|
|
406
|
|
|
407 bool SmartStrFilter::testWords(const char* src) const {
|
|
|
408 if (m_items.empty()) return false;
|
|
|
409
|
|
|
410 for (auto& walk : m_items) {
|
|
|
411 const auto count = walk.second;
|
|
|
412 const auto& str = walk.first;
|
|
|
413 const auto* strWalk = src;
|
|
|
414 for (size_t i = 0; i < count; ++i) {
|
|
|
415 auto next = dc->strStrEndWord(strWalk, str.c_str());
|
|
|
416 if (next == nullptr) return false;
|
|
|
417 strWalk = next;
|
|
|
418 }
|
|
|
419 }
|
|
|
420 return true;
|
|
|
421 }
|
|
|
422
|
|
|
423 bool SmartStrFilter::test(const char* src) const {
|
|
|
424
|
|
|
425 if (m_items.empty()) return false;
|
|
|
426
|
|
|
427 // Use the faster routine first, it can't be used to count occurances but nobody really knows about this feature
|
|
|
428 for (auto& walk : m_items) {
|
|
|
429 if (!dc->testSubstring(src, walk.first.c_str())) return false;
|
|
|
430 }
|
|
|
431 // Have any items where specific number of occurances is wanted?
|
|
|
432 for (auto & walk : m_items) {
|
|
|
433 const auto count = walk.second;
|
|
|
434 if (count == 1) continue;
|
|
|
435 const auto& str = walk.first;
|
|
|
436 const auto* strWalk = src;
|
|
|
437 for (size_t i = 0; i < count; ++i) {
|
|
|
438 auto next = dc->strStrEnd(strWalk, str.c_str());
|
|
|
439 if (next == nullptr) return false;
|
|
|
440 strWalk = next;
|
|
|
441 }
|
|
|
442 }
|
|
|
443 return true;
|
|
|
444 }
|