|
1
|
1 #pragma once
|
|
|
2
|
|
|
// Standalone header (no dependencies) with implementations of PFC UTF-8 & UTF-16 manipulation routines
|
|
|
4
|
|
|
// Lead-byte classification tables, used in pairs: a byte b matches entry i
// when (b & mask_tab[i]) == val_tab[i]. Entry i corresponds to a sequence
// of i + 1 bytes.
static const uint8_t mask_tab[6] = {
	0x80, // 1 byte
	0xE0, // 2 bytes
	0xF0, // 3 bytes
	0xF8, // 4 bytes
	0xFC, // 5 bytes
	0xFE, // 6 bytes
};

static const uint8_t val_tab[6] = {
	0x00, // 1 byte
	0xC0, // 2 bytes
	0xE0, // 3 bytes
	0xF0, // 4 bytes
	0xF8, // 5 bytes
	0xFC, // 6 bytes
};
|
|
|
8
|
|
|
9 size_t utf8_char_len_from_header(char p_c) noexcept
|
|
|
10 {
|
|
|
11 size_t cnt = 0;
|
|
|
12 for (;;)
|
|
|
13 {
|
|
|
14 if ((p_c & mask_tab[cnt]) == val_tab[cnt]) break;
|
|
|
15 if (++cnt >= 6) return 0;
|
|
|
16 }
|
|
|
17
|
|
|
18 return cnt + 1;
|
|
|
19
|
|
|
20 }
|
|
|
21 size_t utf8_decode_char(const char *p_utf8, unsigned & wide) noexcept {
|
|
|
22 const uint8_t * utf8 = (const uint8_t*)p_utf8;
|
|
|
23 const size_t max = 6;
|
|
|
24
|
|
|
25 if (utf8[0]<0x80) {
|
|
|
26 wide = utf8[0];
|
|
|
27 return utf8[0]>0 ? 1 : 0;
|
|
|
28 }
|
|
|
29 wide = 0;
|
|
|
30
|
|
|
31 unsigned res = 0;
|
|
|
32 unsigned n;
|
|
|
33 unsigned cnt = 0;
|
|
|
34 for (;;)
|
|
|
35 {
|
|
|
36 if ((*utf8&mask_tab[cnt]) == val_tab[cnt]) break;
|
|
|
37 if (++cnt >= max) return 0;
|
|
|
38 }
|
|
|
39 cnt++;
|
|
|
40
|
|
|
41 if (cnt == 2 && !(*utf8 & 0x1E)) return 0;
|
|
|
42
|
|
|
43 if (cnt == 1)
|
|
|
44 res = *utf8;
|
|
|
45 else
|
|
|
46 res = (0xFF >> (cnt + 1))&*utf8;
|
|
|
47
|
|
|
48 for (n = 1; n<cnt; n++)
|
|
|
49 {
|
|
|
50 if ((utf8[n] & 0xC0) != 0x80)
|
|
|
51 return 0;
|
|
|
52 if (!res && n == 2 && !((utf8[n] & 0x7F) >> (7 - cnt)))
|
|
|
53 return 0;
|
|
|
54
|
|
|
55 res = (res << 6) | (utf8[n] & 0x3F);
|
|
|
56 }
|
|
|
57
|
|
|
58 wide = res;
|
|
|
59
|
|
|
60 return cnt;
|
|
|
61 }
|
|
|
62
|
|
|
63 size_t utf8_decode_char(const char *p_utf8, unsigned & wide, size_t max) noexcept
|
|
|
64 {
|
|
|
65 const uint8_t * utf8 = (const uint8_t*)p_utf8;
|
|
|
66
|
|
|
67 if (max == 0) {
|
|
|
68 wide = 0;
|
|
|
69 return 0;
|
|
|
70 }
|
|
|
71
|
|
|
72 if (utf8[0]<0x80) {
|
|
|
73 wide = utf8[0];
|
|
|
74 return utf8[0]>0 ? 1 : 0;
|
|
|
75 }
|
|
|
76 if (max>6) max = 6;
|
|
|
77 wide = 0;
|
|
|
78
|
|
|
79 unsigned res = 0;
|
|
|
80 unsigned n;
|
|
|
81 unsigned cnt = 0;
|
|
|
82 for (;;)
|
|
|
83 {
|
|
|
84 if ((*utf8&mask_tab[cnt]) == val_tab[cnt]) break;
|
|
|
85 if (++cnt >= max) return 0;
|
|
|
86 }
|
|
|
87 cnt++;
|
|
|
88
|
|
|
89 if (cnt == 2 && !(*utf8 & 0x1E)) return 0;
|
|
|
90
|
|
|
91 if (cnt == 1)
|
|
|
92 res = *utf8;
|
|
|
93 else
|
|
|
94 res = (0xFF >> (cnt + 1))&*utf8;
|
|
|
95
|
|
|
96 for (n = 1; n<cnt; n++)
|
|
|
97 {
|
|
|
98 if ((utf8[n] & 0xC0) != 0x80)
|
|
|
99 return 0;
|
|
|
100 if (!res && n == 2 && !((utf8[n] & 0x7F) >> (7 - cnt)))
|
|
|
101 return 0;
|
|
|
102
|
|
|
103 res = (res << 6) | (utf8[n] & 0x3F);
|
|
|
104 }
|
|
|
105
|
|
|
106 wide = res;
|
|
|
107
|
|
|
108 return cnt;
|
|
|
109 }
|
|
|
110
|
|
|
111
|
|
|
// Encodes the value wide as a UTF-8 style byte sequence of 1 to 6 bytes.
//
// wide    Value to encode; anything above 0x7FFFFFFF is rejected.
// target  Output buffer with room for the returned number of bytes (up to 6),
//         or null to only query the length. No terminator is written.
//
// Returns the sequence length in bytes, or 0 if wide cannot be encoded.
size_t utf8_encode_char(unsigned wide, char * target) noexcept
{
	// upper_bound[i] is the first value that no longer fits in i + 1 bytes.
	static const unsigned upper_bound[6] = {
		0x80, 0x800, 0x10000, 0x200000, 0x4000000, 0x80000000u
	};
	// lead_marker[i] is the length marker of the lead byte of an (i + 1)-byte sequence.
	static const unsigned lead_marker[6] = {
		0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
	};

	size_t count = 0;
	while (count < 6 && wide >= upper_bound[count]) {
		++count;
	}
	if (count == 6) {
		return 0; // above 0x7FFFFFFF
	}
	++count;

	if (target == 0) {
		return count;
	}

	// Continuation bytes are filled from the last one backwards, six payload
	// bits each; whatever remains goes into the lead byte.
	unsigned remaining = wide;
	for (size_t pos = count - 1; pos > 0; --pos) {
		target[pos] = (char)(0x80 | (remaining & 0x3F));
		remaining >>= 6;
	}
	target[0] = (char)((lead_marker[count - 1] | remaining) & 0xFF);

	return count;
}
|
|
|
168
|
|
|
// Encodes the code point cur_wchar as UTF-16.
//
// cur_wchar  Code point to encode.
// out        Output buffer; must have room for 2 code units.
//
// Returns the number of code units written:
//   1 for values below 0x10000 (stored as-is),
//   2 for 0x10000..0x10FFFF (surrogate pair),
//   1 for anything larger, which is replaced with '?'.
size_t utf16_encode_char(unsigned cur_wchar, char16_t * out) noexcept
{
	if (cur_wchar < 0x10000) {
		*out = (char16_t)cur_wchar;
		return 1;
	}

	// The surrogate mechanism covers everything up to U+10FFFF: after
	// subtracting 0x10000 the remaining offset fits in exactly 20 bits.
	if (cur_wchar <= 0x10FFFF) {
		const unsigned offset = cur_wchar - 0x10000;
		// High surrogate (U+D800..U+DBFF) carries the top 10 bits,
		// low surrogate (U+DC00..U+DFFF) the bottom 10 bits.
		out[0] = (char16_t)(0xD800 | (0x3FF & (offset >> 10)));
		out[1] = (char16_t)(0xDC00 | (0x3FF & offset));
		return 2;
	}

	// Not representable in UTF-16.
	*out = u'?';
	return 1;
}
|
|
|
184
|
|
|
// Decodes one code point from UTF-16 input.
//
// p_source         Input code units.
// p_out            Receives the decoded value.
// p_source_length  Number of code units available at p_source.
//
// Returns the number of code units consumed:
//   0 when p_source_length is 0 (*p_out = 0);
//   1 when p_source_length is 1 - the single unit is stored as-is, even if
//     it is zero or a surrogate;
//   with two or more units available: 0 if the first unit is zero, 2 for a
//   high surrogate followed by a low surrogate, otherwise 1 with the first
//   unit stored unchanged (this includes unpaired surrogates).
size_t utf16_decode_char(const char16_t * p_source, unsigned * p_out, size_t p_source_length) noexcept
{
	if (p_source_length == 0) {
		*p_out = 0;
		return 0;
	}

	const unsigned first = p_source[0];

	if (p_source_length == 1) {
		*p_out = first;
		return 1;
	}

	if (first == 0) {
		*p_out = 0;
		return 0;
	}

	const bool is_high_surrogate = (first & 0xFC00) == 0xD800;
	if (is_high_surrogate) {
		const unsigned second = p_source[1];
		if ((second & 0xFC00) == 0xDC00) {
			// Ten payload bits from each half, offset by 0x10000.
			*p_out = 0x10000 + (((first & 0x3FF) << 10) | (second & 0x3FF));
			return 2;
		}
	}

	*p_out = first;
	return 1;
}
|
|
|
209
|
|
|
210 unsigned utf8_get_char(const char * src)
|
|
|
211 {
|
|
|
212 unsigned rv = 0;
|
|
|
213 utf8_decode_char(src, rv);
|
|
|
214 return rv;
|
|
|
215 }
|
|
|
216
|
|
|
217
|
|
|
218 size_t utf8_char_len(const char * s, size_t max) noexcept
|
|
|
219 {
|
|
|
220 unsigned dummy;
|
|
|
221 return utf8_decode_char(s, dummy, max);
|
|
|
222 }
|
|
|
223
|
|
|
224 size_t skip_utf8_chars(const char * ptr, size_t count) noexcept
|
|
|
225 {
|
|
|
226 size_t num = 0;
|
|
|
227 for (; count && ptr[num]; count--)
|
|
|
228 {
|
|
|
229 size_t d = utf8_char_len(ptr + num, (size_t)(-1));
|
|
|
230 if (d <= 0) break;
|
|
|
231 num += d;
|
|
|
232 }
|
|
|
233 return num;
|
|
|
234 }
|
|
|
235
|
|
|
236 bool is_valid_utf8(const char * param, size_t max) {
|
|
|
237 size_t walk = 0;
|
|
|
238 while (walk < max && param[walk] != 0) {
|
|
|
239 size_t d;
|
|
|
240 unsigned dummy;
|
|
|
241 d = utf8_decode_char(param + walk, dummy, max - walk);
|
|
|
242 if (d == 0) return false;
|
|
|
243 walk += d;
|
|
|
244 if (walk > max) {
|
|
|
245 // should not get here
|
|
|
246 return false;
|
|
|
247 }
|
|
|
248 }
|
|
|
249 return true;
|
|
|
250 }
|
|
|
251
|
|
|
252 bool is_canonical_utf8(const char * param, size_t max) {
|
|
|
253 char scratch[6];
|
|
|
254 size_t walk = 0;
|
|
|
255 while (walk < max && param[walk] != 0) {
|
|
|
256 size_t d;
|
|
|
257 unsigned c;
|
|
|
258 d = utf8_decode_char(param + walk, c, max - walk);
|
|
|
259 if (d == 0) return false; // bad UTF-8
|
|
|
260 walk += d;
|
|
|
261 if (walk > max) {
|
|
|
262 // should not get here
|
|
|
263 return false;
|
|
|
264 }
|
|
|
265 if (utf8_encode_char(c, scratch) != d) return false;
|
|
|
266 }
|
|
|
267 return true;
|
|
|
268
|
|
|
269 }
|