Mercurial > minori
comparison dep/utf8proc/utf8proc.c @ 265:ff0b2052b234
*: add missing utf8proc files
I'm an idiot LOL
author | Paper <paper@paper.us.eu.org> |
---|---|
date | Thu, 11 Apr 2024 10:22:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
264:9a04802848c0 | 265:ff0b2052b234 |
---|---|
1 /* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */ | |
2 /* | |
3 * Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors. | |
4 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany | |
5 * | |
6 * Permission is hereby granted, free of charge, to any person obtaining a | |
7 * copy of this software and associated documentation files (the "Software"), | |
8 * to deal in the Software without restriction, including without limitation | |
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
10 * and/or sell copies of the Software, and to permit persons to whom the | |
11 * Software is furnished to do so, subject to the following conditions: | |
12 * | |
13 * The above copyright notice and this permission notice shall be included in | |
14 * all copies or substantial portions of the Software. | |
15 * | |
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
22 * DEALINGS IN THE SOFTWARE. | |
23 */ | |
24 | |
25 /* | |
26 * This library contains derived data from a modified version of the | |
27 * Unicode data files. | |
28 * | |
29 * The original data files are available at | |
30 * https://www.unicode.org/Public/UNIDATA/ | |
31 * | |
32 * Please notice the copyright statement in the file "utf8proc_data.c". | |
33 */ | |
34 | |
35 | |
36 /* | |
37 * File name: utf8proc.c | |
38 * | |
39 * Description: | |
40 * Implementation of libutf8proc. | |
41 */ | |
42 | |
43 | |
44 #include "utf8proc.h" | |
45 | |
/* Fallback limit macros, defined only when no earlier header supplied them. */
#if !defined(SSIZE_MAX)
#  define SSIZE_MAX ((size_t)SIZE_MAX/2)
#endif
#if !defined(UINT16_MAX)
#  define UINT16_MAX 65535U
#endif
52 | |
53 #include "utf8proc_data.c" | |
54 | |
55 | |
56 UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = { | |
57 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
58 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
59 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
60 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
61 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
62 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
63 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
64 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
69 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
70 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
71 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | |
72 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; | |
73 | |
/* Parameters of the arithmetic Hangul syllable (de)composition. */
#define UTF8PROC_HANGUL_SBASE  0xAC00  /* first precomposed syllable */
#define UTF8PROC_HANGUL_LBASE  0x1100  /* base added to the leading-component index */
#define UTF8PROC_HANGUL_VBASE  0x1161  /* base added to the vowel-component index */
#define UTF8PROC_HANGUL_TBASE  0x11A7  /* base added to the trailing index; index 0 means "no trailing part" */
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT 588     /* == VCOUNT * TCOUNT */
#define UTF8PROC_HANGUL_SCOUNT 11172   /* == LCOUNT * NCOUNT */

/* Code point ranges; every *_END value is exclusive. */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4  /* == SBASE + SCOUNT */
93 | |
94 /* Should follow semantic-versioning rules (semver.org) based on API | |
95 compatibility. (Note that the shared-library version number will | |
96 be different, being based on ABI compatibility.): */ | |
97 #define STRINGIZEx(x) #x | |
98 #define STRINGIZE(x) STRINGIZEx(x) | |
99 UTF8PROC_DLLEXPORT const char *utf8proc_version(void) { | |
100 return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) ""; | |
101 } | |
102 | |
103 UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) { | |
104 return "15.1.0"; | |
105 } | |
106 | |
107 UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { | |
108 switch (errcode) { | |
109 case UTF8PROC_ERROR_NOMEM: | |
110 return "Memory for processing UTF-8 data could not be allocated."; | |
111 case UTF8PROC_ERROR_OVERFLOW: | |
112 return "UTF-8 string is too long to be processed."; | |
113 case UTF8PROC_ERROR_INVALIDUTF8: | |
114 return "Invalid UTF-8 string"; | |
115 case UTF8PROC_ERROR_NOTASSIGNED: | |
116 return "Unassigned Unicode code point found in UTF-8 string."; | |
117 case UTF8PROC_ERROR_INVALIDOPTS: | |
118 return "Invalid options for UTF-8 processing chosen."; | |
119 default: | |
120 return "An unknown error occurred while processing UTF-8 data."; | |
121 } | |
122 } | |
123 | |
124 #define utf_cont(ch) (((ch) & 0xc0) == 0x80) | |
125 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( | |
126 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst | |
127 ) { | |
128 utf8proc_int32_t uc; | |
129 const utf8proc_uint8_t *end; | |
130 | |
131 *dst = -1; | |
132 if (!strlen) return 0; | |
133 end = str + ((strlen < 0) ? 4 : strlen); | |
134 uc = *str++; | |
135 if (uc < 0x80) { | |
136 *dst = uc; | |
137 return 1; | |
138 } | |
139 // Must be between 0xc2 and 0xf4 inclusive to be valid | |
140 if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; | |
141 if (uc < 0xe0) { // 2-byte sequence | |
142 // Must have valid continuation character | |
143 if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8; | |
144 *dst = ((uc & 0x1f)<<6) | (*str & 0x3f); | |
145 return 2; | |
146 } | |
147 if (uc < 0xf0) { // 3-byte sequence | |
148 if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1])) | |
149 return UTF8PROC_ERROR_INVALIDUTF8; | |
150 // Check for surrogate chars | |
151 if (uc == 0xed && *str > 0x9f) | |
152 return UTF8PROC_ERROR_INVALIDUTF8; | |
153 uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f); | |
154 if (uc < 0x800) | |
155 return UTF8PROC_ERROR_INVALIDUTF8; | |
156 *dst = uc; | |
157 return 3; | |
158 } | |
159 // 4-byte sequence | |
160 // Must have 3 valid continuation characters | |
161 if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2])) | |
162 return UTF8PROC_ERROR_INVALIDUTF8; | |
163 // Make sure in correct range (0x10000 - 0x10ffff) | |
164 if (uc == 0xf0) { | |
165 if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8; | |
166 } else if (uc == 0xf4) { | |
167 if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8; | |
168 } | |
169 *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f); | |
170 return 4; | |
171 } | |
172 | |
173 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) { | |
174 return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000); | |
175 } | |
176 | |
177 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { | |
178 if (uc < 0x00) { | |
179 return 0; | |
180 } else if (uc < 0x80) { | |
181 dst[0] = (utf8proc_uint8_t) uc; | |
182 return 1; | |
183 } else if (uc < 0x800) { | |
184 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); | |
185 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | |
186 return 2; | |
187 // Note: we allow encoding 0xd800-0xdfff here, so as not to change | |
188 // the API, however, these are actually invalid in UTF-8 | |
189 } else if (uc < 0x10000) { | |
190 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); | |
191 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); | |
192 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | |
193 return 3; | |
194 } else if (uc < 0x110000) { | |
195 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); | |
196 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); | |
197 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); | |
198 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | |
199 return 4; | |
200 } else return 0; | |
201 } | |
202 | |
203 /* internal version used for inserting 0xff bytes between graphemes */ | |
204 static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { | |
205 if (uc < 0x00) { | |
206 if (uc == -1) { /* internal value used for grapheme breaks */ | |
207 dst[0] = (utf8proc_uint8_t)0xFF; | |
208 return 1; | |
209 } | |
210 return 0; | |
211 } else if (uc < 0x80) { | |
212 dst[0] = (utf8proc_uint8_t)uc; | |
213 return 1; | |
214 } else if (uc < 0x800) { | |
215 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); | |
216 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | |
217 return 2; | |
218 } else if (uc < 0x10000) { | |
219 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); | |
220 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); | |
221 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | |
222 return 3; | |
223 } else if (uc < 0x110000) { | |
224 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); | |
225 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); | |
226 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); | |
227 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | |
228 return 4; | |
229 } else return 0; | |
230 } | |
231 | |
232 /* internal "unsafe" version that does not check whether uc is in range */ | |
233 static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) { | |
234 /* ASSERT: uc >= 0 && uc < 0x110000 */ | |
235 return utf8proc_properties + ( | |
236 utf8proc_stage2table[ | |
237 utf8proc_stage1table[uc >> 8] + (uc & 0xFF) | |
238 ] | |
239 ); | |
240 } | |
241 | |
242 UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) { | |
243 return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc); | |
244 } | |
245 | |
246 /* return whether there is a grapheme break between boundclasses lbc and tbc | |
247 (according to the definition of extended grapheme clusters) | |
248 | |
249 Rule numbering refers to TR29 Version 29 (Unicode 9.0.0): | |
250 http://www.unicode.org/reports/tr29/tr29-29.html | |
251 | |
252 CAVEATS: | |
253 Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences) | |
254 and GB 12/13 (regional indicator code points) require knowledge of previous characters | |
255 and are thus not handled by this function. This may result in an incorrect break before | |
256 an E_Modifier class codepoint and an incorrectly missing break between two | |
257 REGIONAL_INDICATOR class code points if such support does not exist in the caller. | |
258 | |
259 See the special support in grapheme_break_extended, for required bookkeeping by the caller. | |
260 */ | |
261 static utf8proc_bool grapheme_break_simple(int lbc, int tbc) { | |
262 return | |
263 (lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1 | |
264 (lbc == UTF8PROC_BOUNDCLASS_CR && // GB3 | |
265 tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // --- | |
266 (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4 | |
267 (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5 | |
268 (lbc == UTF8PROC_BOUNDCLASS_L && // GB6 | |
269 (tbc == UTF8PROC_BOUNDCLASS_L || // --- | |
270 tbc == UTF8PROC_BOUNDCLASS_V || // --- | |
271 tbc == UTF8PROC_BOUNDCLASS_LV || // --- | |
272 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // --- | |
273 ((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7 | |
274 lbc == UTF8PROC_BOUNDCLASS_V) && // --- | |
275 (tbc == UTF8PROC_BOUNDCLASS_V || // --- | |
276 tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // --- | |
277 ((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8 | |
278 lbc == UTF8PROC_BOUNDCLASS_T) && // --- | |
279 tbc == UTF8PROC_BOUNDCLASS_T) ? false : // --- | |
280 (tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9 | |
281 tbc == UTF8PROC_BOUNDCLASS_ZWJ || // --- | |
282 tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a | |
283 lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b | |
284 (lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below) | |
285 tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ---- | |
286 (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below) | |
287 tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ---- | |
288 true; // GB999 | |
289 } | |
290 | |
291 static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state) | |
292 { | |
293 if (state) { | |
294 int state_bc, state_icb; /* boundclass and indic_conjunct_break state */ | |
295 if (*state == 0) { /* state initialization */ | |
296 state_bc = lbc; | |
297 state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE; | |
298 } | |
299 else { /* lbc and licb are already encoded in *state */ | |
300 state_bc = *state & 0xff; // 1st byte of state is bound class | |
301 state_icb = *state >> 8; // 2nd byte of state is indic conjunct break | |
302 } | |
303 | |
304 utf8proc_bool break_permitted = grapheme_break_simple(state_bc, tbc) && | |
305 !(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER | |
306 && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c | |
307 | |
308 // Special support for GB9c. Don't break between two consonants | |
309 // separated 1+ linker characters and 0+ extend characters in any order. | |
310 // After a consonant, we enter LINKER state after at least one linker. | |
311 if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT | |
312 || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT | |
313 || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND) | |
314 state_icb = ticb; | |
315 else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER) | |
316 state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ? | |
317 UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb; | |
318 | |
319 // Special support for GB 12/13 made possible by GB999. After two RI | |
320 // class codepoints we want to force a break. Do this by resetting the | |
321 // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break | |
322 // after that character according to GB999 (unless of course such a break is | |
323 // forbidden by a different rule such as GB9). | |
324 if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) | |
325 state_bc = UTF8PROC_BOUNDCLASS_OTHER; | |
326 // Special support for GB11 (emoji extend* zwj / emoji) | |
327 else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) { | |
328 if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji | |
329 state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC; | |
330 else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ) | |
331 state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo | |
332 else | |
333 state_bc = tbc; | |
334 } | |
335 else | |
336 state_bc = tbc; | |
337 | |
338 *state = state_bc + (state_icb << 8); | |
339 return break_permitted; | |
340 } | |
341 else | |
342 return grapheme_break_simple(lbc, tbc); | |
343 } | |
344 | |
345 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful( | |
346 utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) { | |
347 | |
348 const utf8proc_property_t *p1 = utf8proc_get_property(c1); | |
349 const utf8proc_property_t *p2 = utf8proc_get_property(c2); | |
350 return grapheme_break_extended(p1->boundclass, | |
351 p2->boundclass, | |
352 p1->indic_conjunct_break, | |
353 p2->indic_conjunct_break, | |
354 state); | |
355 } | |
356 | |
357 | |
358 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break( | |
359 utf8proc_int32_t c1, utf8proc_int32_t c2) { | |
360 return utf8proc_grapheme_break_stateful(c1, c2, NULL); | |
361 } | |
362 | |
363 static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry) | |
364 { | |
365 utf8proc_int32_t entry_cp = **entry; | |
366 if ((entry_cp & 0xF800) == 0xD800) { | |
367 *entry = *entry + 1; | |
368 entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF); | |
369 entry_cp += 0x10000; | |
370 } | |
371 return entry_cp; | |
372 } | |
373 | |
374 static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex) | |
375 { | |
376 const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex]; | |
377 return seqindex_decode_entry(&entry); | |
378 } | |
379 | |
380 static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { | |
381 utf8proc_ssize_t written = 0; | |
382 const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x3FFF]; | |
383 int len = seqindex >> 14; | |
384 if (len >= 3) { | |
385 len = *entry; | |
386 entry++; | |
387 } | |
388 for (; len >= 0; entry++, len--) { | |
389 utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry); | |
390 | |
391 written += utf8proc_decompose_char(entry_cp, dst+written, | |
392 (bufsize > written) ? (bufsize - written) : 0, options, | |
393 last_boundclass); | |
394 if (written < 0) return UTF8PROC_ERROR_OVERFLOW; | |
395 } | |
396 return written; | |
397 } | |
398 | |
399 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) | |
400 { | |
401 utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex; | |
402 return cl != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cl) : c; | |
403 } | |
404 | |
405 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) | |
406 { | |
407 utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex; | |
408 return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; | |
409 } | |
410 | |
411 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c) | |
412 { | |
413 utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex; | |
414 return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; | |
415 } | |
416 | |
417 UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c) | |
418 { | |
419 const utf8proc_property_t *p = utf8proc_get_property(c); | |
420 return p->lowercase_seqindex != p->uppercase_seqindex && p->lowercase_seqindex == UINT16_MAX; | |
421 } | |
422 | |
423 UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c) | |
424 { | |
425 const utf8proc_property_t *p = utf8proc_get_property(c); | |
426 return p->lowercase_seqindex != p->uppercase_seqindex && p->uppercase_seqindex == UINT16_MAX && p->category != UTF8PROC_CATEGORY_LT; | |
427 } | |
428 | |
429 /* return a character width analogous to wcwidth (except portable and | |
430 hopefully less buggy than most system wcwidth functions). */ | |
431 UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) { | |
432 return utf8proc_get_property(c)->charwidth; | |
433 } | |
434 | |
435 UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) { | |
436 return (utf8proc_category_t) utf8proc_get_property(c)->category; | |
437 } | |
438 | |
439 UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { | |
440 static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"}; | |
441 return s[utf8proc_category(c)]; | |
442 } | |
443 | |
444 #define utf8proc_decompose_lump(replacement_uc) \ | |
445 return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ | |
446 options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass) | |
447 | |
448 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { | |
449 const utf8proc_property_t *property; | |
450 utf8proc_propval_t category; | |
451 utf8proc_int32_t hangul_sindex; | |
452 if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED; | |
453 property = unsafe_get_property(uc); | |
454 category = property->category; | |
455 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; | |
456 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { | |
457 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { | |
458 utf8proc_int32_t hangul_tindex; | |
459 if (bufsize >= 1) { | |
460 dst[0] = UTF8PROC_HANGUL_LBASE + | |
461 hangul_sindex / UTF8PROC_HANGUL_NCOUNT; | |
462 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + | |
463 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; | |
464 } | |
465 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; | |
466 if (!hangul_tindex) return 2; | |
467 if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; | |
468 return 3; | |
469 } | |
470 } | |
471 if (options & UTF8PROC_REJECTNA) { | |
472 if (!category) return UTF8PROC_ERROR_NOTASSIGNED; | |
473 } | |
474 if (options & UTF8PROC_IGNORE) { | |
475 if (property->ignorable) return 0; | |
476 } | |
477 if (options & UTF8PROC_STRIPNA) { | |
478 if (!category) return 0; | |
479 } | |
480 if (options & UTF8PROC_LUMP) { | |
481 if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); | |
482 if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) | |
483 utf8proc_decompose_lump(0x0027); | |
484 if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212) | |
485 utf8proc_decompose_lump(0x002D); | |
486 if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F); | |
487 if (uc == 0x2236) utf8proc_decompose_lump(0x003A); | |
488 if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008) | |
489 utf8proc_decompose_lump(0x003C); | |
490 if (uc == 0x203A || uc == 0x232A || uc == 0x3009) | |
491 utf8proc_decompose_lump(0x003E); | |
492 if (uc == 0x2216) utf8proc_decompose_lump(0x005C); | |
493 if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303) | |
494 utf8proc_decompose_lump(0x005E); | |
495 if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD) | |
496 utf8proc_decompose_lump(0x005F); | |
497 if (uc == 0x02CB) utf8proc_decompose_lump(0x0060); | |
498 if (uc == 0x2223) utf8proc_decompose_lump(0x007C); | |
499 if (uc == 0x223C) utf8proc_decompose_lump(0x007E); | |
500 if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) { | |
501 if (category == UTF8PROC_CATEGORY_ZL || | |
502 category == UTF8PROC_CATEGORY_ZP) | |
503 utf8proc_decompose_lump(0x000A); | |
504 } | |
505 } | |
506 if (options & UTF8PROC_STRIPMARK) { | |
507 if (category == UTF8PROC_CATEGORY_MN || | |
508 category == UTF8PROC_CATEGORY_MC || | |
509 category == UTF8PROC_CATEGORY_ME) return 0; | |
510 } | |
511 if (options & UTF8PROC_CASEFOLD) { | |
512 if (property->casefold_seqindex != UINT16_MAX) { | |
513 return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass); | |
514 } | |
515 } | |
516 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { | |
517 if (property->decomp_seqindex != UINT16_MAX && | |
518 (!property->decomp_type || (options & UTF8PROC_COMPAT))) { | |
519 return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass); | |
520 } | |
521 } | |
522 if (options & UTF8PROC_CHARBOUND) { | |
523 utf8proc_bool boundary; | |
524 boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break, | |
525 last_boundclass); | |
526 if (boundary) { | |
527 if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */ | |
528 if (bufsize >= 2) dst[1] = uc; | |
529 return 2; | |
530 } | |
531 } | |
532 if (bufsize >= 1) *dst = uc; | |
533 return 1; | |
534 } | |
535 | |
536 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( | |
537 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, | |
538 utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options | |
539 ) { | |
540 return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL); | |
541 } | |
542 | |
543 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( | |
544 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, | |
545 utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, | |
546 utf8proc_custom_func custom_func, void *custom_data | |
547 ) { | |
548 /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ | |
549 utf8proc_ssize_t wpos = 0; | |
550 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) | |
551 return UTF8PROC_ERROR_INVALIDOPTS; | |
552 if ((options & UTF8PROC_STRIPMARK) && | |
553 !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) | |
554 return UTF8PROC_ERROR_INVALIDOPTS; | |
555 { | |
556 utf8proc_int32_t uc; | |
557 utf8proc_ssize_t rpos = 0; | |
558 utf8proc_ssize_t decomp_result; | |
559 int boundclass = UTF8PROC_BOUNDCLASS_START; | |
560 while (1) { | |
561 if (options & UTF8PROC_NULLTERM) { | |
562 rpos += utf8proc_iterate(str + rpos, -1, &uc); | |
563 /* checking of return value is not necessary, | |
564 as 'uc' is < 0 in case of error */ | |
565 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; | |
566 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; | |
567 if (uc == 0) break; | |
568 } else { | |
569 if (rpos >= strlen) break; | |
570 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); | |
571 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; | |
572 } | |
573 if (custom_func != NULL) { | |
574 uc = custom_func(uc, custom_data); /* user-specified custom mapping */ | |
575 } | |
576 decomp_result = utf8proc_decompose_char( | |
577 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, | |
578 &boundclass | |
579 ); | |
580 if (decomp_result < 0) return decomp_result; | |
581 wpos += decomp_result; | |
582 /* prohibiting integer overflows due to too long strings: */ | |
583 if (wpos < 0 || | |
584 wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2)) | |
585 return UTF8PROC_ERROR_OVERFLOW; | |
586 } | |
587 } | |
588 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) { | |
589 utf8proc_ssize_t pos = 0; | |
590 while (pos < wpos-1) { | |
591 utf8proc_int32_t uc1, uc2; | |
592 const utf8proc_property_t *property1, *property2; | |
593 uc1 = buffer[pos]; | |
594 uc2 = buffer[pos+1]; | |
595 property1 = unsafe_get_property(uc1); | |
596 property2 = unsafe_get_property(uc2); | |
597 if (property1->combining_class > property2->combining_class && | |
598 property2->combining_class > 0) { | |
599 buffer[pos] = uc2; | |
600 buffer[pos+1] = uc1; | |
601 if (pos > 0) pos--; else pos++; | |
602 } else { | |
603 pos++; | |
604 } | |
605 } | |
606 } | |
607 return wpos; | |
608 } | |
609 | |
610 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { | |
611 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */ | |
612 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { | |
613 utf8proc_ssize_t rpos; | |
614 utf8proc_ssize_t wpos = 0; | |
615 utf8proc_int32_t uc; | |
616 for (rpos = 0; rpos < length; rpos++) { | |
617 uc = buffer[rpos]; | |
618 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; | |
619 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 || | |
620 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) { | |
621 if (options & UTF8PROC_NLF2LS) { | |
622 if (options & UTF8PROC_NLF2PS) { | |
623 buffer[wpos++] = 0x000A; | |
624 } else { | |
625 buffer[wpos++] = 0x2028; | |
626 } | |
627 } else { | |
628 if (options & UTF8PROC_NLF2PS) { | |
629 buffer[wpos++] = 0x2029; | |
630 } else { | |
631 buffer[wpos++] = 0x0020; | |
632 } | |
633 } | |
634 } else if ((options & UTF8PROC_STRIPCC) && | |
635 (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) { | |
636 if (uc == 0x0009) buffer[wpos++] = 0x0020; | |
637 } else { | |
638 buffer[wpos++] = uc; | |
639 } | |
640 } | |
641 length = wpos; | |
642 } | |
643 if (options & UTF8PROC_COMPOSE) { | |
644 utf8proc_int32_t *starter = NULL; | |
645 utf8proc_int32_t current_char; | |
646 const utf8proc_property_t *starter_property = NULL, *current_property; | |
647 utf8proc_propval_t max_combining_class = -1; | |
648 utf8proc_ssize_t rpos; | |
649 utf8proc_ssize_t wpos = 0; | |
650 utf8proc_int32_t composition; | |
651 for (rpos = 0; rpos < length; rpos++) { | |
652 current_char = buffer[rpos]; | |
653 current_property = unsafe_get_property(current_char); | |
654 if (starter && current_property->combining_class > max_combining_class) { | |
655 /* combination perhaps possible */ | |
656 utf8proc_int32_t hangul_lindex; | |
657 utf8proc_int32_t hangul_sindex; | |
658 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; | |
659 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { | |
660 utf8proc_int32_t hangul_vindex; | |
661 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; | |
662 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { | |
663 *starter = UTF8PROC_HANGUL_SBASE + | |
664 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) * | |
665 UTF8PROC_HANGUL_TCOUNT; | |
666 starter_property = NULL; | |
667 continue; | |
668 } | |
669 } | |
670 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; | |
671 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && | |
672 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { | |
673 utf8proc_int32_t hangul_tindex; | |
674 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; | |
675 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { | |
676 *starter += hangul_tindex; | |
677 starter_property = NULL; | |
678 continue; | |
679 } | |
680 } | |
681 if (!starter_property) { | |
682 starter_property = unsafe_get_property(*starter); | |
683 } | |
684 if (starter_property->comb_index < 0x8000 && | |
685 current_property->comb_index != UINT16_MAX && | |
686 current_property->comb_index >= 0x8000) { | |
687 int sidx = starter_property->comb_index; | |
688 int idx = current_property->comb_index & 0x3FFF; | |
689 if (idx >= utf8proc_combinations[sidx] && idx <= utf8proc_combinations[sidx + 1] ) { | |
690 idx += sidx + 2 - utf8proc_combinations[sidx]; | |
691 if (current_property->comb_index & 0x4000) { | |
692 composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1]; | |
693 } else | |
694 composition = utf8proc_combinations[idx]; | |
695 | |
696 if (composition > 0 && (!(options & UTF8PROC_STABLE) || | |
697 !(unsafe_get_property(composition)->comp_exclusion))) { | |
698 *starter = composition; | |
699 starter_property = NULL; | |
700 continue; | |
701 } | |
702 } | |
703 } | |
704 } | |
705 buffer[wpos] = current_char; | |
706 if (current_property->combining_class) { | |
707 if (current_property->combining_class > max_combining_class) { | |
708 max_combining_class = current_property->combining_class; | |
709 } | |
710 } else { | |
711 starter = buffer + wpos; | |
712 starter_property = NULL; | |
713 max_combining_class = -1; | |
714 } | |
715 wpos++; | |
716 } | |
717 length = wpos; | |
718 } | |
719 return length; | |
720 } | |
721 | |
722 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { | |
723 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored | |
724 ASSERT: 'buffer' has one spare byte of free space at the end! */ | |
725 length = utf8proc_normalize_utf32(buffer, length, options); | |
726 if (length < 0) return length; | |
727 { | |
728 utf8proc_ssize_t rpos, wpos = 0; | |
729 utf8proc_int32_t uc; | |
730 if (options & UTF8PROC_CHARBOUND) { | |
731 for (rpos = 0; rpos < length; rpos++) { | |
732 uc = buffer[rpos]; | |
733 wpos += charbound_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); | |
734 } | |
735 } else { | |
736 for (rpos = 0; rpos < length; rpos++) { | |
737 uc = buffer[rpos]; | |
738 wpos += utf8proc_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); | |
739 } | |
740 } | |
741 ((utf8proc_uint8_t *)buffer)[wpos] = 0; | |
742 return wpos; | |
743 } | |
744 } | |
745 | |
746 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( | |
747 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options | |
748 ) { | |
749 return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL); | |
750 } | |
751 | |
752 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( | |
753 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, | |
754 utf8proc_custom_func custom_func, void *custom_data | |
755 ) { | |
756 utf8proc_int32_t *buffer; | |
757 utf8proc_ssize_t result; | |
758 *dstptr = NULL; | |
759 result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data); | |
760 if (result < 0) return result; | |
761 buffer = (utf8proc_int32_t *) malloc(((utf8proc_size_t)result) * sizeof(utf8proc_int32_t) + 1); | |
762 if (!buffer) return UTF8PROC_ERROR_NOMEM; | |
763 result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data); | |
764 if (result < 0) { | |
765 free(buffer); | |
766 return result; | |
767 } | |
768 result = utf8proc_reencode(buffer, result, options); | |
769 if (result < 0) { | |
770 free(buffer); | |
771 return result; | |
772 } | |
773 { | |
774 utf8proc_int32_t *newptr; | |
775 newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1); | |
776 if (newptr) buffer = newptr; | |
777 } | |
778 *dstptr = (utf8proc_uint8_t *)buffer; | |
779 return result; | |
780 } | |
781 | |
782 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) { | |
783 utf8proc_uint8_t *retval; | |
784 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | |
785 UTF8PROC_DECOMPOSE); | |
786 return retval; | |
787 } | |
788 | |
789 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) { | |
790 utf8proc_uint8_t *retval; | |
791 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | |
792 UTF8PROC_COMPOSE); | |
793 return retval; | |
794 } | |
795 | |
796 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) { | |
797 utf8proc_uint8_t *retval; | |
798 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | |
799 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); | |
800 return retval; | |
801 } | |
802 | |
803 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) { | |
804 utf8proc_uint8_t *retval; | |
805 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | |
806 UTF8PROC_COMPOSE | UTF8PROC_COMPAT); | |
807 return retval; | |
808 } | |
809 | |
810 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) { | |
811 utf8proc_uint8_t *retval; | |
812 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | |
813 UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE); | |
814 return retval; | |
815 } |