diff foosdk/sdk/pfc/pocket_char_ops.h @ 1:20d02a178406 default tip

*: check in everything else yay
author Paper <paper@tflc.us>
date Mon, 05 Jan 2026 02:15:46 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/foosdk/sdk/pfc/pocket_char_ops.h	Mon Jan 05 02:15:46 2026 -0500
@@ -0,0 +1,269 @@
+#pragma once
+
+// Standalone header (no dependencies) with implementations of PFC UTF-8 & UTF-16 manipulation routines
+
+static const uint8_t mask_tab[6] = { 0x80,0xE0,0xF0,0xF8,0xFC,0xFE };
+
+static const uint8_t val_tab[6] = { 0,0xC0,0xE0,0xF0,0xF8,0xFC };
+
+size_t utf8_char_len_from_header(char p_c) noexcept
+{
+	size_t cnt = 0;
+	for (;;)
+	{
+		if ((p_c & mask_tab[cnt]) == val_tab[cnt]) break;
+		if (++cnt >= 6) return 0;
+	}
+
+	return cnt + 1;
+
+}
+size_t utf8_decode_char(const char *p_utf8, unsigned & wide) noexcept {
+	const uint8_t * utf8 = (const uint8_t*)p_utf8;
+	const size_t max = 6;
+
+	if (utf8[0]<0x80) {
+		wide = utf8[0];
+		return utf8[0]>0 ? 1 : 0;
+	}
+	wide = 0;
+
+	unsigned res = 0;
+	unsigned n;
+	unsigned cnt = 0;
+	for (;;)
+	{
+		if ((*utf8&mask_tab[cnt]) == val_tab[cnt]) break;
+		if (++cnt >= max) return 0;
+	}
+	cnt++;
+
+	if (cnt == 2 && !(*utf8 & 0x1E)) return 0;
+
+	if (cnt == 1)
+		res = *utf8;
+	else
+		res = (0xFF >> (cnt + 1))&*utf8;
+
+	for (n = 1; n<cnt; n++)
+	{
+		if ((utf8[n] & 0xC0) != 0x80)
+			return 0;
+		if (!res && n == 2 && !((utf8[n] & 0x7F) >> (7 - cnt)))
+			return 0;
+
+		res = (res << 6) | (utf8[n] & 0x3F);
+	}
+
+	wide = res;
+
+	return cnt;
+}
+
+size_t utf8_decode_char(const char *p_utf8, unsigned & wide, size_t max) noexcept
+{
+	const uint8_t * utf8 = (const uint8_t*)p_utf8;
+
+	if (max == 0) {
+		wide = 0;
+		return 0;
+	}
+
+	if (utf8[0]<0x80) {
+		wide = utf8[0];
+		return utf8[0]>0 ? 1 : 0;
+	}
+	if (max>6) max = 6;
+	wide = 0;
+
+	unsigned res = 0;
+	unsigned n;
+	unsigned cnt = 0;
+	for (;;)
+	{
+		if ((*utf8&mask_tab[cnt]) == val_tab[cnt]) break;
+		if (++cnt >= max) return 0;
+	}
+	cnt++;
+
+	if (cnt == 2 && !(*utf8 & 0x1E)) return 0;
+
+	if (cnt == 1)
+		res = *utf8;
+	else
+		res = (0xFF >> (cnt + 1))&*utf8;
+
+	for (n = 1; n<cnt; n++)
+	{
+		if ((utf8[n] & 0xC0) != 0x80)
+			return 0;
+		if (!res && n == 2 && !((utf8[n] & 0x7F) >> (7 - cnt)))
+			return 0;
+
+		res = (res << 6) | (utf8[n] & 0x3F);
+	}
+
+	wide = res;
+
+	return cnt;
+}
+
+
+size_t utf8_encode_char(unsigned wide, char * target) noexcept
+{
+	size_t count;
+
+	if (wide < 0x80)
+		count = 1;
+	else if (wide < 0x800)
+		count = 2;
+	else if (wide < 0x10000)
+		count = 3;
+	else if (wide < 0x200000)
+		count = 4;
+	else if (wide < 0x4000000)
+		count = 5;
+	else if (wide <= 0x7FFFFFFF)
+		count = 6;
+	else
+		return 0;
+	//if (count>max) return 0;
+
+	if (target == 0)
+		return count;
+
+	switch (count)
+	{
+	case 6:
+		target[5] = 0x80 | (wide & 0x3F);
+		wide = wide >> 6;
+		wide |= 0x4000000;
+		[[fallthrough]];
+	case 5:
+		target[4] = 0x80 | (wide & 0x3F);
+		wide = wide >> 6;
+		wide |= 0x200000;
+		[[fallthrough]];
+	case 4:
+		target[3] = 0x80 | (wide & 0x3F);
+		wide = wide >> 6;
+		wide |= 0x10000;
+		[[fallthrough]];
+	case 3:
+		target[2] = 0x80 | (wide & 0x3F);
+		wide = wide >> 6;
+		wide |= 0x800;
+		[[fallthrough]];
+	case 2:
+		target[1] = 0x80 | (wide & 0x3F);
+		wide = wide >> 6;
+		wide |= 0xC0;
+		[[fallthrough]];
+	case 1:
+		target[0] = wide & 0xFF;
+	}
+
+	return count;
+}
+
+size_t utf16_encode_char(unsigned cur_wchar, char16_t * out) noexcept
+{
+	if (cur_wchar < 0x10000) {
+		*out = (char16_t)cur_wchar; return 1;
+	} else if (cur_wchar < (1 << 20)) {
+		unsigned c = cur_wchar - 0x10000;
+		//MSDN:
+		//The first (high) surrogate is a 16-bit code value in the range U+D800 to U+DBFF. The second (low) surrogate is a 16-bit code value in the range U+DC00 to U+DFFF. Using surrogates, Unicode can support over one million characters. For more details about surrogates, refer to The Unicode Standard, version 2.0.
+		out[0] = (char16_t)(0xD800 | (0x3FF & (c >> 10)));
+		out[1] = (char16_t)(0xDC00 | (0x3FF & c));
+		return 2;
+	} else {
+		*out = '?'; return 1;
+	}
+}
+
+size_t utf16_decode_char(const char16_t * p_source, unsigned * p_out, size_t p_source_length) noexcept {
+	if (p_source_length == 0) { *p_out = 0; return 0; } else if (p_source_length == 1) {
+		*p_out = p_source[0];
+		return 1;
+	} else {
+		size_t retval = 0;
+		unsigned decoded = p_source[0];
+		if (decoded != 0)
+		{
+			retval = 1;
+			if ((decoded & 0xFC00) == 0xD800)
+			{
+				unsigned low = p_source[1];
+				if ((low & 0xFC00) == 0xDC00)
+				{
+					decoded = 0x10000 + (((decoded & 0x3FF) << 10) | (low & 0x3FF));
+					retval = 2;
+				}
+			}
+		}
+		*p_out = decoded;
+		return retval;
+	}
+}
+
+unsigned utf8_get_char(const char * src)
+{
+	unsigned rv = 0;
+	utf8_decode_char(src, rv);
+	return rv;
+}
+
+
+size_t utf8_char_len(const char * s, size_t max) noexcept
+{
+	unsigned dummy;
+	return utf8_decode_char(s, dummy, max);
+}
+
+size_t skip_utf8_chars(const char * ptr, size_t count) noexcept
+{
+	size_t num = 0;
+	for (; count && ptr[num]; count--)
+	{
+		size_t d = utf8_char_len(ptr + num, (size_t)(-1));
+		if (d <= 0) break;
+		num += d;
+	}
+	return num;
+}
+
+bool is_valid_utf8(const char * param, size_t max) {
+	size_t walk = 0;
+	while (walk < max && param[walk] != 0) {
+		size_t d;
+		unsigned dummy;
+		d = utf8_decode_char(param + walk, dummy, max - walk);
+		if (d == 0) return false;
+		walk += d;
+		if (walk > max) {
+			// should not get here
+			return false;
+		}
+	}
+	return true;
+}
+
+bool is_canonical_utf8(const char * param, size_t max) {
+    char scratch[6];
+    size_t walk = 0;
+    while (walk < max && param[walk] != 0) {
+        size_t d;
+        unsigned c;
+        d = utf8_decode_char(param + walk, c, max - walk);
+        if (d == 0) return false; // bad UTF-8
+        walk += d;
+        if (walk > max) {
+            // should not get here
+            return false;
+        }
+        if (utf8_encode_char(c, scratch) != d) return false;
+    }
+    return true;
+
+}