Mercurial > foo_out_sdl
comparison foosdk/sdk/pfc/pocket_char_ops.h @ 1:20d02a178406 default tip
*: check in everything else
yay
| author | Paper <paper@tflc.us> |
|---|---|
| date | Mon, 05 Jan 2026 02:15:46 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:e9bb126753e7 | 1:20d02a178406 |
|---|---|
| 1 #pragma once | |
| 2 | |
| 3 // Standalone header with implementations of PFC UTF-8 & UTF-16 manipulation routines. It has no #include lines of its own; the including file must already make uint8_t and size_t visible. | |
| 4 | |
// Lead-byte classification tables, indexed by (sequence length - 1).
// A byte b is treated as the first byte of an (i + 1)-byte sequence when
// (b & mask_tab[i]) == val_tab[i].
static const uint8_t mask_tab[6] = {
    0x80, // 1000 0000
    0xE0, // 1110 0000
    0xF0, // 1111 0000
    0xF8, // 1111 1000
    0xFC, // 1111 1100
    0xFE  // 1111 1110
};

static const uint8_t val_tab[6] = {
    0x00, // 0xxx xxxx
    0xC0, // 110x xxxx
    0xE0, // 1110 xxxx
    0xF0, // 1111 0xxx
    0xF8, // 1111 10xx
    0xFC  // 1111 110x
};
| 8 | |
| 9 size_t utf8_char_len_from_header(char p_c) noexcept | |
| 10 { | |
| 11 size_t cnt = 0; | |
| 12 for (;;) | |
| 13 { | |
| 14 if ((p_c & mask_tab[cnt]) == val_tab[cnt]) break; | |
| 15 if (++cnt >= 6) return 0; | |
| 16 } | |
| 17 | |
| 18 return cnt + 1; | |
| 19 | |
| 20 } | |
| 21 size_t utf8_decode_char(const char *p_utf8, unsigned & wide) noexcept { | |
| 22 const uint8_t * utf8 = (const uint8_t*)p_utf8; | |
| 23 const size_t max = 6; | |
| 24 | |
| 25 if (utf8[0]<0x80) { | |
| 26 wide = utf8[0]; | |
| 27 return utf8[0]>0 ? 1 : 0; | |
| 28 } | |
| 29 wide = 0; | |
| 30 | |
| 31 unsigned res = 0; | |
| 32 unsigned n; | |
| 33 unsigned cnt = 0; | |
| 34 for (;;) | |
| 35 { | |
| 36 if ((*utf8&mask_tab[cnt]) == val_tab[cnt]) break; | |
| 37 if (++cnt >= max) return 0; | |
| 38 } | |
| 39 cnt++; | |
| 40 | |
| 41 if (cnt == 2 && !(*utf8 & 0x1E)) return 0; | |
| 42 | |
| 43 if (cnt == 1) | |
| 44 res = *utf8; | |
| 45 else | |
| 46 res = (0xFF >> (cnt + 1))&*utf8; | |
| 47 | |
| 48 for (n = 1; n<cnt; n++) | |
| 49 { | |
| 50 if ((utf8[n] & 0xC0) != 0x80) | |
| 51 return 0; | |
| 52 if (!res && n == 2 && !((utf8[n] & 0x7F) >> (7 - cnt))) | |
| 53 return 0; | |
| 54 | |
| 55 res = (res << 6) | (utf8[n] & 0x3F); | |
| 56 } | |
| 57 | |
| 58 wide = res; | |
| 59 | |
| 60 return cnt; | |
| 61 } | |
| 62 | |
| 63 size_t utf8_decode_char(const char *p_utf8, unsigned & wide, size_t max) noexcept | |
| 64 { | |
| 65 const uint8_t * utf8 = (const uint8_t*)p_utf8; | |
| 66 | |
| 67 if (max == 0) { | |
| 68 wide = 0; | |
| 69 return 0; | |
| 70 } | |
| 71 | |
| 72 if (utf8[0]<0x80) { | |
| 73 wide = utf8[0]; | |
| 74 return utf8[0]>0 ? 1 : 0; | |
| 75 } | |
| 76 if (max>6) max = 6; | |
| 77 wide = 0; | |
| 78 | |
| 79 unsigned res = 0; | |
| 80 unsigned n; | |
| 81 unsigned cnt = 0; | |
| 82 for (;;) | |
| 83 { | |
| 84 if ((*utf8&mask_tab[cnt]) == val_tab[cnt]) break; | |
| 85 if (++cnt >= max) return 0; | |
| 86 } | |
| 87 cnt++; | |
| 88 | |
| 89 if (cnt == 2 && !(*utf8 & 0x1E)) return 0; | |
| 90 | |
| 91 if (cnt == 1) | |
| 92 res = *utf8; | |
| 93 else | |
| 94 res = (0xFF >> (cnt + 1))&*utf8; | |
| 95 | |
| 96 for (n = 1; n<cnt; n++) | |
| 97 { | |
| 98 if ((utf8[n] & 0xC0) != 0x80) | |
| 99 return 0; | |
| 100 if (!res && n == 2 && !((utf8[n] & 0x7F) >> (7 - cnt))) | |
| 101 return 0; | |
| 102 | |
| 103 res = (res << 6) | (utf8[n] & 0x3F); | |
| 104 } | |
| 105 | |
| 106 wide = res; | |
| 107 | |
| 108 return cnt; | |
| 109 } | |
| 110 | |
| 111 | |
// Encodes the value wide as a UTF-8 style byte sequence of 1..6 bytes.
//
// wide    value to encode; anything above 0x7FFFFFFF cannot be represented.
// target  output buffer with room for the returned number of bytes, or null
//         to only query the required length. No terminator is written.
//
// Returns the sequence length in bytes, or 0 when wide is out of range.
size_t utf8_encode_char(unsigned wide, char * target) noexcept
{
    // limits[i] is the first value that no longer fits into (i + 1) bytes.
    static const unsigned limits[5] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };

    if (wide > 0x7FFFFFFF) {
        return 0;
    }

    size_t count = 1;
    while (count < 6 && wide >= limits[count - 1]) {
        ++count;
    }

    if (target == 0) {
        return count;
    }

    if (count == 1) {
        target[0] = (char)(wide & 0xFF);
        return count;
    }

    // Emit the trailing 10xxxxxx bytes from last to first, six payload bits each.
    unsigned rest = wide;
    for (size_t idx = count - 1; idx > 0; --idx) {
        target[idx] = (char)(0x80 | (rest & 0x3F));
        rest >>= 6;
    }

    // The lead byte has `count` leading one bits followed by the remaining
    // payload: 0xC0, 0xE0, 0xF0, 0xF8, 0xFC for count = 2..6.
    target[0] = (char)(((0xFF00u >> count) | rest) & 0xFF);

    return count;
}
| 168 | |
// Encodes one code point as UTF-16.
//
// cur_wchar  code point to encode.
// out        output buffer; must have room for two code units, since any
//            value in 0x10000..0x10FFFF is written as a surrogate pair.
//
// Returns the number of char16_t units written: 1 for values below 0x10000
// (stored as-is, including values in the surrogate range), 2 for a surrogate
// pair, and 1 for values above 0x10FFFF, which are replaced by '?'.
size_t utf16_encode_char(unsigned cur_wchar, char16_t * out) noexcept
{
    if (cur_wchar < 0x10000) {
        *out = (char16_t)cur_wchar;
        return 1;
    }

    // The supplementary range ends at 0x10FFFF, not at 1 << 20: after
    // subtracting 0x10000 the offset still fits the 20 bits a surrogate pair
    // carries. The former bound rejected 0x100000..0x10FFFF.
    if (cur_wchar <= 0x10FFFF) {
        const unsigned c = cur_wchar - 0x10000;
        // High surrogate is in 0xD800..0xDBFF, low surrogate in 0xDC00..0xDFFF;
        // each carries ten bits of the offset.
        out[0] = (char16_t)(0xD800 | (0x3FF & (c >> 10)));
        out[1] = (char16_t)(0xDC00 | (0x3FF & c));
        return 2;
    }

    // Not representable in UTF-16.
    *out = '?';
    return 1;
}
| 184 | |
// Decodes one code point from UTF-16 data.
//
// p_source         first code unit to decode.
// p_out            receives the decoded value.
// p_source_length  number of code units available at p_source.
//
// Returns the number of code units consumed:
//   - 0 with *p_out = 0 when no units are available;
//   - 1 with the unit copied verbatim when exactly one unit is available
//     (even if that unit is 0);
//   - otherwise 0 for a NUL unit, 2 for a high surrogate followed by a low
//     surrogate, and 1 for anything else (an unpaired surrogate is passed
//     through unchanged).
size_t utf16_decode_char(const char16_t * p_source, unsigned * p_out, size_t p_source_length) noexcept
{
    if (p_source_length == 0) {
        *p_out = 0;
        return 0;
    }

    const unsigned first = p_source[0];

    if (p_source_length == 1) {
        *p_out = first;
        return 1;
    }

    if (first == 0) {
        *p_out = 0;
        return 0;
    }

    const bool is_high_surrogate = (first & 0xFC00) == 0xD800;
    if (is_high_surrogate) {
        const unsigned second = p_source[1];
        if ((second & 0xFC00) == 0xDC00) {
            // Ten bits from each surrogate, offset by 0x10000.
            *p_out = 0x10000 + (((first & 0x3FF) << 10) | (second & 0x3FF));
            return 2;
        }
    }

    *p_out = first;
    return 1;
}
| 209 | |
| 210 unsigned utf8_get_char(const char * src) | |
| 211 { | |
| 212 unsigned rv = 0; | |
| 213 utf8_decode_char(src, rv); | |
| 214 return rv; | |
| 215 } | |
| 216 | |
| 217 | |
| 218 size_t utf8_char_len(const char * s, size_t max) noexcept | |
| 219 { | |
| 220 unsigned dummy; | |
| 221 return utf8_decode_char(s, dummy, max); | |
| 222 } | |
| 223 | |
| 224 size_t skip_utf8_chars(const char * ptr, size_t count) noexcept | |
| 225 { | |
| 226 size_t num = 0; | |
| 227 for (; count && ptr[num]; count--) | |
| 228 { | |
| 229 size_t d = utf8_char_len(ptr + num, (size_t)(-1)); | |
| 230 if (d <= 0) break; | |
| 231 num += d; | |
| 232 } | |
| 233 return num; | |
| 234 } | |
| 235 | |
| 236 bool is_valid_utf8(const char * param, size_t max) { | |
| 237 size_t walk = 0; | |
| 238 while (walk < max && param[walk] != 0) { | |
| 239 size_t d; | |
| 240 unsigned dummy; | |
| 241 d = utf8_decode_char(param + walk, dummy, max - walk); | |
| 242 if (d == 0) return false; | |
| 243 walk += d; | |
| 244 if (walk > max) { | |
| 245 // should not get here | |
| 246 return false; | |
| 247 } | |
| 248 } | |
| 249 return true; | |
| 250 } | |
| 251 | |
| 252 bool is_canonical_utf8(const char * param, size_t max) { | |
| 253 char scratch[6]; | |
| 254 size_t walk = 0; | |
| 255 while (walk < max && param[walk] != 0) { | |
| 256 size_t d; | |
| 257 unsigned c; | |
| 258 d = utf8_decode_char(param + walk, c, max - walk); | |
| 259 if (d == 0) return false; // bad UTF-8 | |
| 260 walk += d; | |
| 261 if (walk > max) { | |
| 262 // should not get here | |
| 263 return false; | |
| 264 } | |
| 265 if (utf8_encode_char(c, scratch) != d) return false; | |
| 266 } | |
| 267 return true; | |
| 268 | |
| 269 } |
