view foosdk/sdk/pfc/SmartStrStr.cpp @ 1:20d02a178406 default tip

*: check in everything else yay
author Paper <paper@tflc.us>
date Mon, 05 Jan 2026 02:15:46 -0500
parents
children
line wrap: on
line source

#include "pfc-lite.h"

#include "string-conv-lite.h"
#include "string_conv.h"
#include "SmartStrStr.h"
#include <algorithm>
#include "SmartStrStr-table.h"
#include "SmartStrStr-twoCharMappings.h"

// Tells whether code point c is treated as part of a word.
// Every code point >= 128 is accepted; below that only ASCII letters and digits are.
bool SmartStrStr::isWordChar(unsigned c) {
	// FIX ME map Unicode ranges somehow
	if (c >= 128) return true;
	return pfc::char_is_ascii_alphanumeric((char)c);
}

// Decodes the UTF-8 character at ptr and reports whether it is a word character.
// Returns false when nothing could be decoded (bad UTF-8).
bool SmartStrStr::isWordChar(const char* ptr) {
	unsigned codePoint;
	const size_t decodedLen = pfc::utf8_decode_char(ptr, codePoint);
	return decodedLen != 0 && isWordChar(codePoint);
}

// Returns true when ptr is a non-empty string made up exclusively of word characters.
// An empty string, an undecodable sequence or any non-word character yields false.
bool SmartStrStr::isValidWord(const char* ptr) {
	if (*ptr == 0) return false;
	for (;;) {
		unsigned codePoint;
		const size_t decodedLen = pfc::utf8_decode_char(ptr, codePoint);
		// decodedLen == 0: bad UTF-8
		if (decodedLen == 0 || !isWordChar(codePoint)) return false;
		ptr += decodedLen;
		if (*ptr == 0) return true;
	}
}

void SmartStrStr::findWords(const char* str, std::function<void(pfc::string_part_ref)> cb) {
	size_t base = 0, walk = 0;
	for (;; ) {
		unsigned c = 0;
		size_t d = pfc::utf8_decode_char(str + walk, c);
		if (d == 0) break;
		
		if (!SmartStrStr::isWordChar(c)) {
			if (walk > base) {
				cb(pfc::string_part(str + base, walk - base));
			}
			base = walk + d;
		}
		walk += d;
	}
	if (walk > base) {
		cb(pfc::string_part(str + base, walk - base));
	}
}

// Builds the lookup tables used for matching:
//  - m_downconvert: character -> replacement, taken from SmartStrStrTable
//  - m_substitutions: character -> set of characters accepted in its place
//    (table replacement, lower-case form, and the lower-case form's own replacements)
//  - m_substitutionsReverse: the inverse relation of m_substitutions
// and finally the two-character mappings.
SmartStrStr::SmartStrStr() {
	std::map<uint32_t, std::set<uint32_t> > forward, backward;
	std::map<uint32_t, uint32_t > down;

#if 1
	for (auto& entry : SmartStrStrTable) {
		down[entry.from] = entry.to;
		forward[entry.from].insert(entry.to);
	}
#else
	for (uint32_t cp = 128; cp < 0x10000; ++cp) {
		uint32_t converted = Transform(cp);
		if (converted != cp) {
			down[cp] = converted;
			forward[cp].insert(converted);
		}
	}
#endif

	// Case folding: a character with a distinct lower-case form also accepts that form,
	// plus everything the lower-case form itself accepts so far.
	for (uint32_t cp = 32; cp < 0x10000; ++cp) {
		auto lower = ToLower(cp);
		if (lower == cp) continue;

		auto & accepted = forward[cp];
		accepted.insert(lower);

		auto lowerEntry = forward.find(lower);
		if (lowerEntry != forward.end()) {
			accepted.insert(lowerEntry->second.begin(), lowerEntry->second.end());
		}
	}

	// Invert the relation: for each accepted character, remember which characters accept it.
	for( auto & entry : forward ) {
		for( auto & acceptedChar : entry.second ) {
			backward[acceptedChar].insert(entry.first);
		}
	}

	this->m_substitutions.initialize(std::move(forward));
	this->m_substitutionsReverse.initialize(std::move(backward));
	this->m_downconvert.initialize(std::move(down));
	InitTwoCharMappings();
}

// == TEMPLATES ==
// Tests whether pUserString matches at the very beginning of pString, applying the
// substitution rules (matchOneChar) and the two-character mappings.
// Returns a pointer into pString just past the matched portion, or nullptr when the
// strings do not match or a character cannot be decoded from either string.
template<typename char_t> const char_t * SmartStrStr::matchHere_(const char_t * pString, const char_t * pUserString) const {
    auto walkData = pString;
    auto walkUser = pUserString;
    for (;; ) {
        // Whole user string consumed: match succeeded, report where it ended in the data.
        if (*walkUser == 0) return walkData;

        uint32_t cData, cUser;
        size_t dData = pfc::uni_decode_char(walkData, cData);
        size_t dUser = pfc::uni_decode_char(walkUser, cUser);
        if (dData == 0 || dUser == 0) return nullptr;

        if (cData != cUser) {
            // First see whether the single data character stands for two user characters.
            bool gotMulti = false;
            {
                const char * cDataSubst = m_twoCharMappings.query(cData);
                if (cDataSubst != nullptr) {
                    PFC_ASSERT(strlen(cDataSubst) == 2);
                    if (matchOneChar(cUser, (uint32_t)cDataSubst[0])) {
                        auto walkUser2 = walkUser + dUser;
                        uint32_t cUser2;
                        auto dUser2 = pfc::uni_decode_char(walkUser2, cUser2);
                        if (matchOneChar(cUser2, (uint32_t)cDataSubst[1])) {
                            gotMulti = true;
                            // Consume both user characters against the one data character.
                            dUser += dUser2;
                        }
                    }
                }
            }
            // Otherwise fall back to a plain one-to-one substitution check.
            if (!gotMulti) {
                if (!matchOneChar(cUser, cData)) return nullptr;
            }
        }

        walkData += dData;
        walkUser += dUser;
    }
}
// True when pUserString matches pString in full: the match must start at the beginning
// of pString and end exactly at its terminator.
template<typename char_t> bool SmartStrStr::equals_( const char_t * pString, const char_t * pUserString) const {
    const char_t * const matchEnd = this->matchHere_(pString, pUserString);
    return matchEnd != nullptr && *matchEnd == 0;
}

// Finds the first position in pString at which pSubString matches (see matchHere_).
// Returns a pointer just past the match, or nullptr if there is none. When a match is
// found and outFoundAt is non-null, *outFoundAt receives the offset of the match start.
template<typename char_t> const char_t * SmartStrStr::strStrEnd_(const char_t * pString, const char_t * pSubString, size_t * outFoundAt) const {
    for (size_t offset = 0; pString[offset] != 0; ) {
        const char_t * const matchEnd = matchHere_(pString + offset, pSubString);
        if (matchEnd != nullptr) {
            if (outFoundAt != nullptr) * outFoundAt = offset;
            return matchEnd;
        }

        // Advance by one whole character; give up if its length cannot be determined.
        const size_t step = pfc::uni_char_length(pString + offset);
        if (step == 0) break;
        offset += step;
    }
    return nullptr;
}
// == END TEMPLATES ==

const char16_t * SmartStrStr::matchHere16(const char16_t * pString, const char16_t * pUserString) const {
    return this->matchHere_(pString, pUserString);
}
const char * SmartStrStr::matchHere(const char * pString, const char * pUserString) const {
    return this->matchHere_(pString, pUserString);
}
const wchar_t * SmartStrStr::matchHereW(const wchar_t * pString, const wchar_t * pUserString) const {
    return this->matchHere_(pString, pUserString);
}

bool SmartStrStr::equals(const char * pString, const char * pUserString) const {
    return equals_(pString, pUserString);
}
bool SmartStrStr::equals16(const char16_t* pString, const char16_t* pUserString) const {
    return equals_(pString, pUserString);
}
bool SmartStrStr::equalsW( const wchar_t * pString, const wchar_t * pUserString) const {
    return equals_(pString, pUserString);
}
const char * SmartStrStr::strStrEnd(const char * pString, const char * pSubString, size_t * outFoundAt) const {
    return strStrEnd_(pString, pSubString, outFoundAt);
}

const char16_t * SmartStrStr::strStrEnd16(const char16_t * pString, const char16_t * pSubString, size_t * outFoundAt) const {
    return strStrEnd_(pString, pSubString, outFoundAt);
}

const wchar_t * SmartStrStr::strStrEndW(const wchar_t * pString, const wchar_t * pSubString, size_t * outFoundAt) const {
    return strStrEnd_(pString, pSubString, outFoundAt);
}

// Tells whether a word may begin at byte `offset` of the UTF-8 string `base`, i.e. whether
// the position is the start of the string or is preceded by a non-word character.
//
// The preceding character is located by probing backwards: for each candidate length
// (1 up to 6 bytes, never reaching before `base`), the bytes at offset - len are decoded;
// the candidate whose decoded length equals len is the character ending right at `offset`.
// Returns false when no such character can be found.
static bool wordBeginsHere(const char* base, size_t offset) {
	if (offset == 0) return true;
	// Candidate lengths must grow (++len). Counting down would probe length 0 (the
	// character *at* offset) and then wrap around, so multi-byte predecessors were never examined.
	for (size_t len = 1; len <= offset && len <= 6; ++len) {
		unsigned c;
		if (pfc::utf8_decode_char(base + offset - len, c) == len) {
			return !SmartStrStr::isWordChar(c);
		}
	}
	return false;
}

const char* SmartStrStr::strStrEndWord(const char* pString, const char* pSubString, size_t* outFoundAt) const {
	size_t walk = 0;
	for (;;) {
		size_t foundAt = 0;
		auto end = strStrEnd(pString + walk, pSubString, &foundAt);
		if (end == nullptr) return nullptr;
		foundAt += walk;
		if (!isWordChar(end) && wordBeginsHere(pString, foundAt)) {
			if (outFoundAt) *outFoundAt = foundAt;
			return end;
		}
		walk = end - pString;
	}
}

// True when the user-typed character cInput is acceptable for the data character cData:
// either they are identical, or cInput is among the substitutions registered for cData.
bool SmartStrStr::matchOneChar(uint32_t cInput, uint32_t cData) const {
	if (cInput != cData) {
		auto accepted = m_substitutions.query_ptr(cData);
		return accepted != nullptr && accepted->count(cInput) > 0;
	}
	return true;
}

// Convenience form of transformStrHere() that returns the transformed string by value.
pfc::string8 SmartStrStr::transformStr(const char* str) const {
	pfc::string8 transformed;
	transformStrHere(transformed, str);
	return transformed;
}

// Transforms the whole null-terminated string `in` into `out`.
void SmartStrStr::transformStrHere(pfc::string8& out, const char* in) const {
	const size_t inLen = strlen(in);
	transformStrHere(out, in, inLen);
}

// Writes a simplified copy of the first inLen bytes of `in` to `out` (previous content is
// discarded). Each character is replaced by its two-character mapping if one exists, else by
// its m_downconvert replacement if that is non-zero, else copied unchanged.
// Processing stops early at a character that cannot be decoded or that would extend past inLen.
void SmartStrStr::transformStrHere(pfc::string8& out, const char* in, size_t inLen) const {
	out.prealloc(inLen);
	out.clear();
	size_t pos = 0;
	while (pos < inLen) {
		unsigned codePoint;
		const size_t decodedLen = pfc::utf8_decode_char(in + pos, codePoint);
		if (decodedLen == 0 || pos + decodedLen > inLen) break;
		pos += decodedLen;

		const char* twoChars = m_twoCharMappings.query(codePoint);
		if (twoChars != nullptr) {
			out << twoChars;
		} else {
			// A zero lookup result means no replacement; keep the character as is.
			unsigned replacement = m_downconvert.query(codePoint);
			out.add_char(replacement != 0 ? replacement : codePoint);
		}
	}
}

#if 0 // Windows specific code
// Disabled (see enclosing #if 0): Windows-only derivation of a single-byte replacement for c.
// Encodes c as UTF-16 and converts it with WideCharToMultiByte using
// pfc::stringcvt::codepage_ascii and "?" as the default character. If the conversion succeeds
// without falling back to the default character and yields exactly one positive byte, that
// byte is returned; otherwise c is returned unchanged.
uint32_t SmartStrStr::Transform(uint32_t c) {
	wchar_t wide[2] = {}; char out[4] = {};
	pfc::utf16_encode_char(c, wide);
	BOOL fail = FALSE;
	if (WideCharToMultiByte(pfc::stringcvt::codepage_ascii, 0, wide, 2, out, 4, "?", &fail) > 0) {
		if (!fail) {
			// Accept only a one-byte result with a positive value.
			if (out[0] > 0 && out[1] == 0) {
				c = out[0];
			}
		}
	}
	return c;
}
#endif

// Lower-case form of code point c, as computed by pfc::charLower.
uint32_t SmartStrStr::ToLower(uint32_t c) {
	const uint32_t lowered = pfc::charLower(c);
	return lowered;
}

// Fills m_twoCharMappings (character -> replacement string) from the twoCharMappings table
// and m_twoCharMappingsReverse (packed character pair -> character). The reverse key packs the
// first two decoded characters of the replacement as first | (second << 16); pairs are only
// recorded when both characters decode and both are below 0x10000.
void SmartStrStr::InitTwoCharMappings() {
	std::map<uint32_t, const char* > forward;
	std::map<uint32_t, uint32_t> backward;
	for (auto& entry : twoCharMappings) {
		forward[entry.from] = entry.to;

		const char * cursor = entry.to;
		uint32_t first, second;

		const size_t firstLen = pfc::utf8_decode_char(cursor, first);
		if (firstLen == 0) continue;
		cursor += firstLen;

		const size_t secondLen = pfc::utf8_decode_char(cursor, second);
		if (secondLen == 0) continue;

		// Each half must fit into 16 bits of the packed key.
		if (first >= 0x10000 || second >= 0x10000) continue;
		backward[first | (second << 16)] = entry.from;
	}
	m_twoCharMappings.initialize(std::move(forward));
	m_twoCharMappingsReverse.initialize(std::move(backward));
}
// Looks for an exact occurrence of the byte sequence `prefix` (prefixLen bytes, null-terminated)
// in str that is immediately followed by a match of `sub` (see matchHere).
// An empty prefix never matches.
bool SmartStrStr::testSubString_prefix(const char* str, const char* sub, const char * prefix, size_t prefixLen) const {
	if (prefixLen == 0) return false;

	const char * cursor = str;
	if (prefixLen == 1) {
		// Single-byte prefix: plain character search.
		while ((cursor = strchr(cursor, *prefix)) != nullptr) {
			++cursor;
			if (matchHere(cursor, sub)) return true;
		}
		return false;
	}

	while ((cursor = strstr(cursor, prefix)) != nullptr) {
		cursor += prefixLen;
		if (matchHere(cursor, sub)) return true;
	}
	return false;
}
// Same as the string-prefix overload, with the prefix given as a single code point that is
// first encoded to UTF-8.
bool SmartStrStr::testSubString_prefix(const char* str, const char* sub, uint32_t c) const {
	char encoded[8];
	const size_t encodedLen = pfc::utf8_encode_char(c, encoded);
	encoded[encodedLen] = 0;
	return testSubString_prefix(str, sub, encoded, encodedLen);
}
// Tries `prefix` itself as the leading character, then every character listed for it in
// m_substitutionsReverse; succeeds as soon as one of them is found in str followed by `sub`.
bool SmartStrStr::testSubString_prefix_subst(const char* str, const char* sub, uint32_t prefix) const {
	if ( testSubString_prefix(str, sub, prefix)) return true;

	auto candidates = m_substitutionsReverse.query_ptr( prefix );
	if (candidates == nullptr) return false;

	for (auto candidate : *candidates) {
		if (testSubString_prefix(str, sub, candidate)) return true;
	}
	return false;
}
// Tells whether `sub` occurs anywhere in `str` under the smart matching rules.
// Returns false when no leading character can be decoded from `sub`.
bool SmartStrStr::testSubstring(const char* str, const char* sub) const {
#if 1
	// optimized version for UTF-8:
	// locate candidates by searching for the first character (and its alternatives) with
	// plain byte searches, then verify the remainder with matchHere.
	unsigned first;
	const size_t firstLen = pfc::uni_decode_char(sub, first);
	if ( firstLen == 0 ) return false;
	const char* rest = sub + firstLen;

	if (testSubString_prefix_subst(str, rest, first)) return true;

	// The first two characters together may stand for one character in the data.
	unsigned second;
	const size_t secondLen = pfc::uni_decode_char(rest, second);
	if (secondLen == 0 || first >= 0x10000 || second >= 0x10000) return false;

	auto combined = m_twoCharMappingsReverse.query(first | (second << 16));
	if (combined == 0) return false;

	return testSubString_prefix_subst(str, rest + secondLen, combined);
#else
	return this->strStrEnd(str, sub) != nullptr;
#endif
}
bool SmartStrStr::testSubstring16(const char16_t* str, const char16_t* sub) const {
	return this->strStrEnd16(str, sub) != nullptr;
}
bool SmartStrStr::testSubstringW( const wchar_t * str, const wchar_t * sub ) const {
    return this->strStrEndW(str, sub) != nullptr;
}

// Shared instance, constructed on first use.
SmartStrStr& SmartStrStr::global() {
	static SmartStrStr instance;
	return instance;
}


// Parses the filter expression ptr[0..len) into m_items (term -> number of times it was given).
// Terms are separated by spacing characters; a double quote toggles quoted mode, in which
// spacing is kept as part of the term. The quote characters themselves are not stored.
// If the quotes are unbalanced, everything collected so far is dropped and the entire input,
// quotes included, becomes the single term.
void SmartStrFilter::init(const char* ptr, size_t len) {
	pfc::string_formatter current;
	bool inQuotation = false;

	// Registers the term accumulated so far (if any) and starts a new one.
	auto addCurrent = [&] {
		if (!current.is_empty()) {
			++m_items[current.get_ptr()]; current.reset();
		}
	};

	for (t_size walk = 0; walk < len; ++walk) {
		const char c = ptr[walk];
		if (c == '\"') inQuotation = !inQuotation;
		else if (!inQuotation && is_spacing(c)) {
			addCurrent();
		} else {
			current.add_byte(c);
		}
	}
	if (inQuotation) {
		// Allow unbalanced quotes, take the whole string *with* quotation marks
		m_items.clear();
		current.set_string_nc(ptr, len);
	}

	addCurrent();
}


bool SmartStrFilter::test_disregardCounts(const char* src) const {
	if (m_items.empty()) return false;

	for (auto& walk : m_items) {
		if (!dc->strStrEnd(src, walk.first.c_str())) return false;
	}
	return true;
}

bool SmartStrFilter::testWords(const char* src) const {
	if (m_items.empty()) return false;

	for (auto& walk : m_items) {
		const auto count = walk.second;
		const auto& str = walk.first;
		const auto* strWalk = src;
		for (size_t i = 0; i < count; ++i) {
			auto next = dc->strStrEndWord(strWalk, str.c_str());
			if (next == nullptr) return false;
			strWalk = next;
		}
	}
	return true;
}

bool SmartStrFilter::test(const char* src) const {

	if (m_items.empty()) return false;

	// Use the faster routine first, it can't be used to count occurances but nobody really knows about this feature
	for (auto& walk : m_items) {
		if (!dc->testSubstring(src, walk.first.c_str())) return false;
	}
	// Have any items where specific number of occurances is wanted?
	for (auto & walk : m_items) {
		const auto count = walk.second;
		if (count == 1) continue;
		const auto& str = walk.first;
		const auto* strWalk = src;
		for (size_t i = 0; i < count; ++i) {
			auto next = dc->strStrEnd(strWalk, str.c_str());
			if (next == nullptr) return false;
			strWalk = next;
		}
	}
	return true;
}