Mercurial > foo_out_sdl
diff foosdk/sdk/pfc/pocket_char_ops.h @ 1:20d02a178406 default tip
*: check in everything else
yay
| author | Paper <paper@tflc.us> |
|---|---|
| date | Mon, 05 Jan 2026 02:15:46 -0500 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/foosdk/sdk/pfc/pocket_char_ops.h Mon Jan 05 02:15:46 2026 -0500 @@ -0,0 +1,269 @@ +#pragma once + +// Standalone header (no dependencies) with implementations of PFC UTF-8 & UTF-16 manipulation routines + +static const uint8_t mask_tab[6] = { 0x80,0xE0,0xF0,0xF8,0xFC,0xFE }; + +static const uint8_t val_tab[6] = { 0,0xC0,0xE0,0xF0,0xF8,0xFC }; + +size_t utf8_char_len_from_header(char p_c) noexcept +{ + size_t cnt = 0; + for (;;) + { + if ((p_c & mask_tab[cnt]) == val_tab[cnt]) break; + if (++cnt >= 6) return 0; + } + + return cnt + 1; + +} +size_t utf8_decode_char(const char *p_utf8, unsigned & wide) noexcept { + const uint8_t * utf8 = (const uint8_t*)p_utf8; + const size_t max = 6; + + if (utf8[0]<0x80) { + wide = utf8[0]; + return utf8[0]>0 ? 1 : 0; + } + wide = 0; + + unsigned res = 0; + unsigned n; + unsigned cnt = 0; + for (;;) + { + if ((*utf8&mask_tab[cnt]) == val_tab[cnt]) break; + if (++cnt >= max) return 0; + } + cnt++; + + if (cnt == 2 && !(*utf8 & 0x1E)) return 0; + + if (cnt == 1) + res = *utf8; + else + res = (0xFF >> (cnt + 1))&*utf8; + + for (n = 1; n<cnt; n++) + { + if ((utf8[n] & 0xC0) != 0x80) + return 0; + if (!res && n == 2 && !((utf8[n] & 0x7F) >> (7 - cnt))) + return 0; + + res = (res << 6) | (utf8[n] & 0x3F); + } + + wide = res; + + return cnt; +} + +size_t utf8_decode_char(const char *p_utf8, unsigned & wide, size_t max) noexcept +{ + const uint8_t * utf8 = (const uint8_t*)p_utf8; + + if (max == 0) { + wide = 0; + return 0; + } + + if (utf8[0]<0x80) { + wide = utf8[0]; + return utf8[0]>0 ? 1 : 0; + } + if (max>6) max = 6; + wide = 0; + + unsigned res = 0; + unsigned n; + unsigned cnt = 0; + for (;;) + { + if ((*utf8&mask_tab[cnt]) == val_tab[cnt]) break; + if (++cnt >= max) return 0; + } + cnt++; + + if (cnt == 2 && !(*utf8 & 0x1E)) return 0; + + if (cnt == 1) + res = *utf8; + else + res = (0xFF >> (cnt + 1))&*utf8; + + for (n = 1; n<cnt; n++) + { + if ((utf8[n] & 0xC0) != 0x80) + return 0; + if (!res && n == 2 && !((utf8[n] & 0x7F) >> (7 - cnt))) + return 0; + + res = (res << 6) | (utf8[n] & 0x3F); + } + + wide = res; + + return cnt; +} + + +size_t utf8_encode_char(unsigned wide, char * target) noexcept +{ + size_t count; + + if (wide < 0x80) + count = 1; + else if (wide < 0x800) + count = 2; + else if (wide < 0x10000) + count = 3; + else if (wide < 0x200000) + count = 4; + else if (wide < 0x4000000) + count = 5; + else if (wide <= 0x7FFFFFFF) + count = 6; + else + return 0; + //if (count>max) return 0; + + if (target == 0) + return count; + + switch (count) + { + case 6: + target[5] = 0x80 | (wide & 0x3F); + wide = wide >> 6; + wide |= 0x4000000; + [[fallthrough]]; + case 5: + target[4] = 0x80 | (wide & 0x3F); + wide = wide >> 6; + wide |= 0x200000; + [[fallthrough]]; + case 4: + target[3] = 0x80 | (wide & 0x3F); + wide = wide >> 6; + wide |= 0x10000; + [[fallthrough]]; + case 3: + target[2] = 0x80 | (wide & 0x3F); + wide = wide >> 6; + wide |= 0x800; + [[fallthrough]]; + case 2: + target[1] = 0x80 | (wide & 0x3F); + wide = wide >> 6; + wide |= 0xC0; + [[fallthrough]]; + case 1: + target[0] = wide & 0xFF; + } + + return count; +} + +size_t utf16_encode_char(unsigned cur_wchar, char16_t * out) noexcept +{ + if (cur_wchar < 0x10000) { + *out = (char16_t)cur_wchar; return 1; + } else if (cur_wchar < (1 << 20)) { + unsigned c = cur_wchar - 0x10000; + //MSDN: + //The first (high) surrogate is a 16-bit code value in the range U+D800 to U+DBFF. The second (low) surrogate is a 16-bit code value in the range U+DC00 to U+DFFF. Using surrogates, Unicode can support over one million characters. For more details about surrogates, refer to The Unicode Standard, version 2.0. + out[0] = (char16_t)(0xD800 | (0x3FF & (c >> 10))); + out[1] = (char16_t)(0xDC00 | (0x3FF & c)); + return 2; + } else { + *out = '?'; return 1; + } +} + +size_t utf16_decode_char(const char16_t * p_source, unsigned * p_out, size_t p_source_length) noexcept { + if (p_source_length == 0) { *p_out = 0; return 0; } else if (p_source_length == 1) { + *p_out = p_source[0]; + return 1; + } else { + size_t retval = 0; + unsigned decoded = p_source[0]; + if (decoded != 0) + { + retval = 1; + if ((decoded & 0xFC00) == 0xD800) + { + unsigned low = p_source[1]; + if ((low & 0xFC00) == 0xDC00) + { + decoded = 0x10000 + (((decoded & 0x3FF) << 10) | (low & 0x3FF)); + retval = 2; + } + } + } + *p_out = decoded; + return retval; + } +} + +unsigned utf8_get_char(const char * src) +{ + unsigned rv = 0; + utf8_decode_char(src, rv); + return rv; +} + + +size_t utf8_char_len(const char * s, size_t max) noexcept +{ + unsigned dummy; + return utf8_decode_char(s, dummy, max); +} + +size_t skip_utf8_chars(const char * ptr, size_t count) noexcept +{ + size_t num = 0; + for (; count && ptr[num]; count--) + { + size_t d = utf8_char_len(ptr + num, (size_t)(-1)); + if (d <= 0) break; + num += d; + } + return num; +} + +bool is_valid_utf8(const char * param, size_t max) { + size_t walk = 0; + while (walk < max && param[walk] != 0) { + size_t d; + unsigned dummy; + d = utf8_decode_char(param + walk, dummy, max - walk); + if (d == 0) return false; + walk += d; + if (walk > max) { + // should not get here + return false; + } + } + return true; +} + +bool is_canonical_utf8(const char * param, size_t max) { + char scratch[6]; + size_t walk = 0; + while (walk < max && param[walk] != 0) { + size_t d; + unsigned c; + d = utf8_decode_char(param + walk, c, max - walk); + if (d == 0) return false; // bad UTF-8 + walk += d; + if (walk > max) { + // should not get here + return false; + } + if (utf8_encode_char(c, scratch) != d) return false; + } + return true; + +}
