Mercurial > minori
comparison dep/utf8proc/utf8proc.c @ 265:ff0b2052b234
*: add missing utf8proc files
I'm an idiot LOL
| author | Paper <paper@paper.us.eu.org> |
|---|---|
| date | Thu, 11 Apr 2024 10:22:05 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 264:9a04802848c0 | 265:ff0b2052b234 |
|---|---|
| 1 /* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */ | |
| 2 /* | |
| 3 * Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors. | |
| 4 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany | |
| 5 * | |
| 6 * Permission is hereby granted, free of charge, to any person obtaining a | |
| 7 * copy of this software and associated documentation files (the "Software"), | |
| 8 * to deal in the Software without restriction, including without limitation | |
| 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
| 10 * and/or sell copies of the Software, and to permit persons to whom the | |
| 11 * Software is furnished to do so, subject to the following conditions: | |
| 12 * | |
| 13 * The above copyright notice and this permission notice shall be included in | |
| 14 * all copies or substantial portions of the Software. | |
| 15 * | |
| 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
| 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
| 22 * DEALINGS IN THE SOFTWARE. | |
| 23 */ | |
| 24 | |
| 25 /* | |
| 26 * This library contains derived data from a modified version of the | |
| 27 * Unicode data files. | |
| 28 * | |
| 29 * The original data files are available at | |
| 30 * https://www.unicode.org/Public/UNIDATA/ | |
| 31 * | |
| 32 * Please notice the copyright statement in the file "utf8proc_data.c". | |
| 33 */ | |
| 34 | |
| 35 | |
| 36 /* | |
| 37 * File name: utf8proc.c | |
| 38 * | |
| 39 * Description: | |
| 40 * Implementation of libutf8proc. | |
| 41 */ | |
| 42 | |
| 43 | |
| 44 #include "utf8proc.h" | |
| 45 | |
| 46 #ifndef SSIZE_MAX | |
| 47 #define SSIZE_MAX ((size_t)SIZE_MAX/2) | |
| 48 #endif | |
| 49 #ifndef UINT16_MAX | |
| 50 # define UINT16_MAX 65535U | |
| 51 #endif | |
| 52 | |
| 53 #include "utf8proc_data.c" | |
| 54 | |
| 55 | |
| 56 UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = { | |
| 57 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 58 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 59 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 60 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 61 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 62 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 63 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 64 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 69 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
| 70 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
| 71 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | |
| 72 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; | |
| 73 | |
/* Base code points used by the arithmetic Hangul (de)composition below:
   first precomposed syllable, first leading consonant, first vowel, and the
   code point just before the first trailing consonant. */
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7

/* Counts of leading consonants, vowels and trailing slots (slot 0 of the
   trailing count stands for "no trailing consonant"). */
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
/* Syllables per leading consonant: 21 * 28 == 588. */
#define UTF8PROC_HANGUL_NCOUNT (UTF8PROC_HANGUL_VCOUNT * UTF8PROC_HANGUL_TCOUNT)
/* Total precomposed syllables: 19 * 588 == 11172. */
#define UTF8PROC_HANGUL_SCOUNT (UTF8PROC_HANGUL_LCOUNT * UTF8PROC_HANGUL_NCOUNT)

/* Code point ranges; every *_END value is exclusive. */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
| 93 | |
| 94 /* Should follow semantic-versioning rules (semver.org) based on API | |
| 95 compatibility. (Note that the shared-library version number will | |
| 96 be different, being based on ABI compatibility.): */ | |
| 97 #define STRINGIZEx(x) #x | |
| 98 #define STRINGIZE(x) STRINGIZEx(x) | |
| 99 UTF8PROC_DLLEXPORT const char *utf8proc_version(void) { | |
| 100 return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) ""; | |
| 101 } | |
| 102 | |
| 103 UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) { | |
| 104 return "15.1.0"; | |
| 105 } | |
| 106 | |
| 107 UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { | |
| 108 switch (errcode) { | |
| 109 case UTF8PROC_ERROR_NOMEM: | |
| 110 return "Memory for processing UTF-8 data could not be allocated."; | |
| 111 case UTF8PROC_ERROR_OVERFLOW: | |
| 112 return "UTF-8 string is too long to be processed."; | |
| 113 case UTF8PROC_ERROR_INVALIDUTF8: | |
| 114 return "Invalid UTF-8 string"; | |
| 115 case UTF8PROC_ERROR_NOTASSIGNED: | |
| 116 return "Unassigned Unicode code point found in UTF-8 string."; | |
| 117 case UTF8PROC_ERROR_INVALIDOPTS: | |
| 118 return "Invalid options for UTF-8 processing chosen."; | |
| 119 default: | |
| 120 return "An unknown error occurred while processing UTF-8 data."; | |
| 121 } | |
| 122 } | |
| 123 | |
| 124 #define utf_cont(ch) (((ch) & 0xc0) == 0x80) | |
| 125 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( | |
| 126 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst | |
| 127 ) { | |
| 128 utf8proc_int32_t uc; | |
| 129 const utf8proc_uint8_t *end; | |
| 130 | |
| 131 *dst = -1; | |
| 132 if (!strlen) return 0; | |
| 133 end = str + ((strlen < 0) ? 4 : strlen); | |
| 134 uc = *str++; | |
| 135 if (uc < 0x80) { | |
| 136 *dst = uc; | |
| 137 return 1; | |
| 138 } | |
| 139 // Must be between 0xc2 and 0xf4 inclusive to be valid | |
| 140 if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; | |
| 141 if (uc < 0xe0) { // 2-byte sequence | |
| 142 // Must have valid continuation character | |
| 143 if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8; | |
| 144 *dst = ((uc & 0x1f)<<6) | (*str & 0x3f); | |
| 145 return 2; | |
| 146 } | |
| 147 if (uc < 0xf0) { // 3-byte sequence | |
| 148 if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1])) | |
| 149 return UTF8PROC_ERROR_INVALIDUTF8; | |
| 150 // Check for surrogate chars | |
| 151 if (uc == 0xed && *str > 0x9f) | |
| 152 return UTF8PROC_ERROR_INVALIDUTF8; | |
| 153 uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f); | |
| 154 if (uc < 0x800) | |
| 155 return UTF8PROC_ERROR_INVALIDUTF8; | |
| 156 *dst = uc; | |
| 157 return 3; | |
| 158 } | |
| 159 // 4-byte sequence | |
| 160 // Must have 3 valid continuation characters | |
| 161 if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2])) | |
| 162 return UTF8PROC_ERROR_INVALIDUTF8; | |
| 163 // Make sure in correct range (0x10000 - 0x10ffff) | |
| 164 if (uc == 0xf0) { | |
| 165 if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8; | |
| 166 } else if (uc == 0xf4) { | |
| 167 if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8; | |
| 168 } | |
| 169 *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f); | |
| 170 return 4; | |
| 171 } | |
| 172 | |
| 173 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) { | |
| 174 return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000); | |
| 175 } | |
| 176 | |
| 177 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { | |
| 178 if (uc < 0x00) { | |
| 179 return 0; | |
| 180 } else if (uc < 0x80) { | |
| 181 dst[0] = (utf8proc_uint8_t) uc; | |
| 182 return 1; | |
| 183 } else if (uc < 0x800) { | |
| 184 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); | |
| 185 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | |
| 186 return 2; | |
| 187 // Note: we allow encoding 0xd800-0xdfff here, so as not to change | |
| 188 // the API, however, these are actually invalid in UTF-8 | |
| 189 } else if (uc < 0x10000) { | |
| 190 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); | |
| 191 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); | |
| 192 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | |
| 193 return 3; | |
| 194 } else if (uc < 0x110000) { | |
| 195 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); | |
| 196 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); | |
| 197 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); | |
| 198 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | |
| 199 return 4; | |
| 200 } else return 0; | |
| 201 } | |
| 202 | |
| 203 /* internal version used for inserting 0xff bytes between graphemes */ | |
| 204 static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { | |
| 205 if (uc < 0x00) { | |
| 206 if (uc == -1) { /* internal value used for grapheme breaks */ | |
| 207 dst[0] = (utf8proc_uint8_t)0xFF; | |
| 208 return 1; | |
| 209 } | |
| 210 return 0; | |
| 211 } else if (uc < 0x80) { | |
| 212 dst[0] = (utf8proc_uint8_t)uc; | |
| 213 return 1; | |
| 214 } else if (uc < 0x800) { | |
| 215 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); | |
| 216 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | |
| 217 return 2; | |
| 218 } else if (uc < 0x10000) { | |
| 219 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); | |
| 220 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); | |
| 221 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | |
| 222 return 3; | |
| 223 } else if (uc < 0x110000) { | |
| 224 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); | |
| 225 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); | |
| 226 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); | |
| 227 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); | |
| 228 return 4; | |
| 229 } else return 0; | |
| 230 } | |
| 231 | |
| 232 /* internal "unsafe" version that does not check whether uc is in range */ | |
| 233 static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) { | |
| 234 /* ASSERT: uc >= 0 && uc < 0x110000 */ | |
| 235 return utf8proc_properties + ( | |
| 236 utf8proc_stage2table[ | |
| 237 utf8proc_stage1table[uc >> 8] + (uc & 0xFF) | |
| 238 ] | |
| 239 ); | |
| 240 } | |
| 241 | |
| 242 UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) { | |
| 243 return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc); | |
| 244 } | |
| 245 | |
| 246 /* return whether there is a grapheme break between boundclasses lbc and tbc | |
| 247 (according to the definition of extended grapheme clusters) | |
| 248 | |
| 249 Rule numbering refers to TR29 Version 29 (Unicode 9.0.0): | |
| 250 http://www.unicode.org/reports/tr29/tr29-29.html | |
| 251 | |
| 252 CAVEATS: | |
| 253 Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences) | |
| 254 and GB 12/13 (regional indicator code points) require knowledge of previous characters | |
| 255 and are thus not handled by this function. This may result in an incorrect break before | |
| 256 an E_Modifier class codepoint and an incorrectly missing break between two | |
| 257 REGIONAL_INDICATOR class code points if such support does not exist in the caller. | |
| 258 | |
| 259 See the special support in grapheme_break_extended, for required bookkeeping by the caller. | |
| 260 */ | |
| 261 static utf8proc_bool grapheme_break_simple(int lbc, int tbc) { | |
| 262 return | |
| 263 (lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1 | |
| 264 (lbc == UTF8PROC_BOUNDCLASS_CR && // GB3 | |
| 265 tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // --- | |
| 266 (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4 | |
| 267 (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5 | |
| 268 (lbc == UTF8PROC_BOUNDCLASS_L && // GB6 | |
| 269 (tbc == UTF8PROC_BOUNDCLASS_L || // --- | |
| 270 tbc == UTF8PROC_BOUNDCLASS_V || // --- | |
| 271 tbc == UTF8PROC_BOUNDCLASS_LV || // --- | |
| 272 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // --- | |
| 273 ((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7 | |
| 274 lbc == UTF8PROC_BOUNDCLASS_V) && // --- | |
| 275 (tbc == UTF8PROC_BOUNDCLASS_V || // --- | |
| 276 tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // --- | |
| 277 ((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8 | |
| 278 lbc == UTF8PROC_BOUNDCLASS_T) && // --- | |
| 279 tbc == UTF8PROC_BOUNDCLASS_T) ? false : // --- | |
| 280 (tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9 | |
| 281 tbc == UTF8PROC_BOUNDCLASS_ZWJ || // --- | |
| 282 tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a | |
| 283 lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b | |
| 284 (lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below) | |
| 285 tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ---- | |
| 286 (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below) | |
| 287 tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ---- | |
| 288 true; // GB999 | |
| 289 } | |
| 290 | |
| 291 static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state) | |
| 292 { | |
| 293 if (state) { | |
| 294 int state_bc, state_icb; /* boundclass and indic_conjunct_break state */ | |
| 295 if (*state == 0) { /* state initialization */ | |
| 296 state_bc = lbc; | |
| 297 state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE; | |
| 298 } | |
| 299 else { /* lbc and licb are already encoded in *state */ | |
| 300 state_bc = *state & 0xff; // 1st byte of state is bound class | |
| 301 state_icb = *state >> 8; // 2nd byte of state is indic conjunct break | |
| 302 } | |
| 303 | |
| 304 utf8proc_bool break_permitted = grapheme_break_simple(state_bc, tbc) && | |
| 305 !(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER | |
| 306 && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c | |
| 307 | |
| 308 // Special support for GB9c. Don't break between two consonants | |
| 309 // separated 1+ linker characters and 0+ extend characters in any order. | |
| 310 // After a consonant, we enter LINKER state after at least one linker. | |
| 311 if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT | |
| 312 || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT | |
| 313 || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND) | |
| 314 state_icb = ticb; | |
| 315 else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER) | |
| 316 state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ? | |
| 317 UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb; | |
| 318 | |
| 319 // Special support for GB 12/13 made possible by GB999. After two RI | |
| 320 // class codepoints we want to force a break. Do this by resetting the | |
| 321 // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break | |
| 322 // after that character according to GB999 (unless of course such a break is | |
| 323 // forbidden by a different rule such as GB9). | |
| 324 if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) | |
| 325 state_bc = UTF8PROC_BOUNDCLASS_OTHER; | |
| 326 // Special support for GB11 (emoji extend* zwj / emoji) | |
| 327 else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) { | |
| 328 if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji | |
| 329 state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC; | |
| 330 else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ) | |
| 331 state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo | |
| 332 else | |
| 333 state_bc = tbc; | |
| 334 } | |
| 335 else | |
| 336 state_bc = tbc; | |
| 337 | |
| 338 *state = state_bc + (state_icb << 8); | |
| 339 return break_permitted; | |
| 340 } | |
| 341 else | |
| 342 return grapheme_break_simple(lbc, tbc); | |
| 343 } | |
| 344 | |
| 345 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful( | |
| 346 utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) { | |
| 347 | |
| 348 const utf8proc_property_t *p1 = utf8proc_get_property(c1); | |
| 349 const utf8proc_property_t *p2 = utf8proc_get_property(c2); | |
| 350 return grapheme_break_extended(p1->boundclass, | |
| 351 p2->boundclass, | |
| 352 p1->indic_conjunct_break, | |
| 353 p2->indic_conjunct_break, | |
| 354 state); | |
| 355 } | |
| 356 | |
| 357 | |
| 358 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break( | |
| 359 utf8proc_int32_t c1, utf8proc_int32_t c2) { | |
| 360 return utf8proc_grapheme_break_stateful(c1, c2, NULL); | |
| 361 } | |
| 362 | |
| 363 static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry) | |
| 364 { | |
| 365 utf8proc_int32_t entry_cp = **entry; | |
| 366 if ((entry_cp & 0xF800) == 0xD800) { | |
| 367 *entry = *entry + 1; | |
| 368 entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF); | |
| 369 entry_cp += 0x10000; | |
| 370 } | |
| 371 return entry_cp; | |
| 372 } | |
| 373 | |
| 374 static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex) | |
| 375 { | |
| 376 const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex]; | |
| 377 return seqindex_decode_entry(&entry); | |
| 378 } | |
| 379 | |
| 380 static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { | |
| 381 utf8proc_ssize_t written = 0; | |
| 382 const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x3FFF]; | |
| 383 int len = seqindex >> 14; | |
| 384 if (len >= 3) { | |
| 385 len = *entry; | |
| 386 entry++; | |
| 387 } | |
| 388 for (; len >= 0; entry++, len--) { | |
| 389 utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry); | |
| 390 | |
| 391 written += utf8proc_decompose_char(entry_cp, dst+written, | |
| 392 (bufsize > written) ? (bufsize - written) : 0, options, | |
| 393 last_boundclass); | |
| 394 if (written < 0) return UTF8PROC_ERROR_OVERFLOW; | |
| 395 } | |
| 396 return written; | |
| 397 } | |
| 398 | |
| 399 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) | |
| 400 { | |
| 401 utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex; | |
| 402 return cl != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cl) : c; | |
| 403 } | |
| 404 | |
| 405 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) | |
| 406 { | |
| 407 utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex; | |
| 408 return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; | |
| 409 } | |
| 410 | |
| 411 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c) | |
| 412 { | |
| 413 utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex; | |
| 414 return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c; | |
| 415 } | |
| 416 | |
| 417 UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c) | |
| 418 { | |
| 419 const utf8proc_property_t *p = utf8proc_get_property(c); | |
| 420 return p->lowercase_seqindex != p->uppercase_seqindex && p->lowercase_seqindex == UINT16_MAX; | |
| 421 } | |
| 422 | |
| 423 UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c) | |
| 424 { | |
| 425 const utf8proc_property_t *p = utf8proc_get_property(c); | |
| 426 return p->lowercase_seqindex != p->uppercase_seqindex && p->uppercase_seqindex == UINT16_MAX && p->category != UTF8PROC_CATEGORY_LT; | |
| 427 } | |
| 428 | |
| 429 /* return a character width analogous to wcwidth (except portable and | |
| 430 hopefully less buggy than most system wcwidth functions). */ | |
| 431 UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) { | |
| 432 return utf8proc_get_property(c)->charwidth; | |
| 433 } | |
| 434 | |
| 435 UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) { | |
| 436 return (utf8proc_category_t) utf8proc_get_property(c)->category; | |
| 437 } | |
| 438 | |
| 439 UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { | |
| 440 static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"}; | |
| 441 return s[utf8proc_category(c)]; | |
| 442 } | |
| 443 | |
| 444 #define utf8proc_decompose_lump(replacement_uc) \ | |
| 445 return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ | |
| 446 options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass) | |
| 447 | |
| 448 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { | |
| 449 const utf8proc_property_t *property; | |
| 450 utf8proc_propval_t category; | |
| 451 utf8proc_int32_t hangul_sindex; | |
| 452 if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED; | |
| 453 property = unsafe_get_property(uc); | |
| 454 category = property->category; | |
| 455 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; | |
| 456 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { | |
| 457 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { | |
| 458 utf8proc_int32_t hangul_tindex; | |
| 459 if (bufsize >= 1) { | |
| 460 dst[0] = UTF8PROC_HANGUL_LBASE + | |
| 461 hangul_sindex / UTF8PROC_HANGUL_NCOUNT; | |
| 462 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + | |
| 463 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; | |
| 464 } | |
| 465 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; | |
| 466 if (!hangul_tindex) return 2; | |
| 467 if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; | |
| 468 return 3; | |
| 469 } | |
| 470 } | |
| 471 if (options & UTF8PROC_REJECTNA) { | |
| 472 if (!category) return UTF8PROC_ERROR_NOTASSIGNED; | |
| 473 } | |
| 474 if (options & UTF8PROC_IGNORE) { | |
| 475 if (property->ignorable) return 0; | |
| 476 } | |
| 477 if (options & UTF8PROC_STRIPNA) { | |
| 478 if (!category) return 0; | |
| 479 } | |
| 480 if (options & UTF8PROC_LUMP) { | |
| 481 if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); | |
| 482 if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) | |
| 483 utf8proc_decompose_lump(0x0027); | |
| 484 if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212) | |
| 485 utf8proc_decompose_lump(0x002D); | |
| 486 if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F); | |
| 487 if (uc == 0x2236) utf8proc_decompose_lump(0x003A); | |
| 488 if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008) | |
| 489 utf8proc_decompose_lump(0x003C); | |
| 490 if (uc == 0x203A || uc == 0x232A || uc == 0x3009) | |
| 491 utf8proc_decompose_lump(0x003E); | |
| 492 if (uc == 0x2216) utf8proc_decompose_lump(0x005C); | |
| 493 if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303) | |
| 494 utf8proc_decompose_lump(0x005E); | |
| 495 if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD) | |
| 496 utf8proc_decompose_lump(0x005F); | |
| 497 if (uc == 0x02CB) utf8proc_decompose_lump(0x0060); | |
| 498 if (uc == 0x2223) utf8proc_decompose_lump(0x007C); | |
| 499 if (uc == 0x223C) utf8proc_decompose_lump(0x007E); | |
| 500 if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) { | |
| 501 if (category == UTF8PROC_CATEGORY_ZL || | |
| 502 category == UTF8PROC_CATEGORY_ZP) | |
| 503 utf8proc_decompose_lump(0x000A); | |
| 504 } | |
| 505 } | |
| 506 if (options & UTF8PROC_STRIPMARK) { | |
| 507 if (category == UTF8PROC_CATEGORY_MN || | |
| 508 category == UTF8PROC_CATEGORY_MC || | |
| 509 category == UTF8PROC_CATEGORY_ME) return 0; | |
| 510 } | |
| 511 if (options & UTF8PROC_CASEFOLD) { | |
| 512 if (property->casefold_seqindex != UINT16_MAX) { | |
| 513 return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass); | |
| 514 } | |
| 515 } | |
| 516 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { | |
| 517 if (property->decomp_seqindex != UINT16_MAX && | |
| 518 (!property->decomp_type || (options & UTF8PROC_COMPAT))) { | |
| 519 return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass); | |
| 520 } | |
| 521 } | |
| 522 if (options & UTF8PROC_CHARBOUND) { | |
| 523 utf8proc_bool boundary; | |
| 524 boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break, | |
| 525 last_boundclass); | |
| 526 if (boundary) { | |
| 527 if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */ | |
| 528 if (bufsize >= 2) dst[1] = uc; | |
| 529 return 2; | |
| 530 } | |
| 531 } | |
| 532 if (bufsize >= 1) *dst = uc; | |
| 533 return 1; | |
| 534 } | |
| 535 | |
| 536 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( | |
| 537 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, | |
| 538 utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options | |
| 539 ) { | |
| 540 return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL); | |
| 541 } | |
| 542 | |
| 543 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( | |
| 544 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, | |
| 545 utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, | |
| 546 utf8proc_custom_func custom_func, void *custom_data | |
| 547 ) { | |
| 548 /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ | |
| 549 utf8proc_ssize_t wpos = 0; | |
| 550 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) | |
| 551 return UTF8PROC_ERROR_INVALIDOPTS; | |
| 552 if ((options & UTF8PROC_STRIPMARK) && | |
| 553 !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) | |
| 554 return UTF8PROC_ERROR_INVALIDOPTS; | |
| 555 { | |
| 556 utf8proc_int32_t uc; | |
| 557 utf8proc_ssize_t rpos = 0; | |
| 558 utf8proc_ssize_t decomp_result; | |
| 559 int boundclass = UTF8PROC_BOUNDCLASS_START; | |
| 560 while (1) { | |
| 561 if (options & UTF8PROC_NULLTERM) { | |
| 562 rpos += utf8proc_iterate(str + rpos, -1, &uc); | |
| 563 /* checking of return value is not necessary, | |
| 564 as 'uc' is < 0 in case of error */ | |
| 565 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; | |
| 566 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; | |
| 567 if (uc == 0) break; | |
| 568 } else { | |
| 569 if (rpos >= strlen) break; | |
| 570 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); | |
| 571 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; | |
| 572 } | |
| 573 if (custom_func != NULL) { | |
| 574 uc = custom_func(uc, custom_data); /* user-specified custom mapping */ | |
| 575 } | |
| 576 decomp_result = utf8proc_decompose_char( | |
| 577 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, | |
| 578 &boundclass | |
| 579 ); | |
| 580 if (decomp_result < 0) return decomp_result; | |
| 581 wpos += decomp_result; | |
| 582 /* prohibiting integer overflows due to too long strings: */ | |
| 583 if (wpos < 0 || | |
| 584 wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2)) | |
| 585 return UTF8PROC_ERROR_OVERFLOW; | |
| 586 } | |
| 587 } | |
| 588 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) { | |
| 589 utf8proc_ssize_t pos = 0; | |
| 590 while (pos < wpos-1) { | |
| 591 utf8proc_int32_t uc1, uc2; | |
| 592 const utf8proc_property_t *property1, *property2; | |
| 593 uc1 = buffer[pos]; | |
| 594 uc2 = buffer[pos+1]; | |
| 595 property1 = unsafe_get_property(uc1); | |
| 596 property2 = unsafe_get_property(uc2); | |
| 597 if (property1->combining_class > property2->combining_class && | |
| 598 property2->combining_class > 0) { | |
| 599 buffer[pos] = uc2; | |
| 600 buffer[pos+1] = uc1; | |
| 601 if (pos > 0) pos--; else pos++; | |
| 602 } else { | |
| 603 pos++; | |
| 604 } | |
| 605 } | |
| 606 } | |
| 607 return wpos; | |
| 608 } | |
| 609 | |
| 610 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { | |
| 611 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */ | |
| 612 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { | |
| 613 utf8proc_ssize_t rpos; | |
| 614 utf8proc_ssize_t wpos = 0; | |
| 615 utf8proc_int32_t uc; | |
| 616 for (rpos = 0; rpos < length; rpos++) { | |
| 617 uc = buffer[rpos]; | |
| 618 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; | |
| 619 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 || | |
| 620 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) { | |
| 621 if (options & UTF8PROC_NLF2LS) { | |
| 622 if (options & UTF8PROC_NLF2PS) { | |
| 623 buffer[wpos++] = 0x000A; | |
| 624 } else { | |
| 625 buffer[wpos++] = 0x2028; | |
| 626 } | |
| 627 } else { | |
| 628 if (options & UTF8PROC_NLF2PS) { | |
| 629 buffer[wpos++] = 0x2029; | |
| 630 } else { | |
| 631 buffer[wpos++] = 0x0020; | |
| 632 } | |
| 633 } | |
| 634 } else if ((options & UTF8PROC_STRIPCC) && | |
| 635 (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) { | |
| 636 if (uc == 0x0009) buffer[wpos++] = 0x0020; | |
| 637 } else { | |
| 638 buffer[wpos++] = uc; | |
| 639 } | |
| 640 } | |
| 641 length = wpos; | |
| 642 } | |
| 643 if (options & UTF8PROC_COMPOSE) { | |
| 644 utf8proc_int32_t *starter = NULL; | |
| 645 utf8proc_int32_t current_char; | |
| 646 const utf8proc_property_t *starter_property = NULL, *current_property; | |
| 647 utf8proc_propval_t max_combining_class = -1; | |
| 648 utf8proc_ssize_t rpos; | |
| 649 utf8proc_ssize_t wpos = 0; | |
| 650 utf8proc_int32_t composition; | |
| 651 for (rpos = 0; rpos < length; rpos++) { | |
| 652 current_char = buffer[rpos]; | |
| 653 current_property = unsafe_get_property(current_char); | |
| 654 if (starter && current_property->combining_class > max_combining_class) { | |
| 655 /* combination perhaps possible */ | |
| 656 utf8proc_int32_t hangul_lindex; | |
| 657 utf8proc_int32_t hangul_sindex; | |
| 658 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; | |
| 659 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { | |
| 660 utf8proc_int32_t hangul_vindex; | |
| 661 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; | |
| 662 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { | |
| 663 *starter = UTF8PROC_HANGUL_SBASE + | |
| 664 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) * | |
| 665 UTF8PROC_HANGUL_TCOUNT; | |
| 666 starter_property = NULL; | |
| 667 continue; | |
| 668 } | |
| 669 } | |
| 670 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; | |
| 671 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && | |
| 672 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { | |
| 673 utf8proc_int32_t hangul_tindex; | |
| 674 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; | |
| 675 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { | |
| 676 *starter += hangul_tindex; | |
| 677 starter_property = NULL; | |
| 678 continue; | |
| 679 } | |
| 680 } | |
| 681 if (!starter_property) { | |
| 682 starter_property = unsafe_get_property(*starter); | |
| 683 } | |
| 684 if (starter_property->comb_index < 0x8000 && | |
| 685 current_property->comb_index != UINT16_MAX && | |
| 686 current_property->comb_index >= 0x8000) { | |
| 687 int sidx = starter_property->comb_index; | |
| 688 int idx = current_property->comb_index & 0x3FFF; | |
| 689 if (idx >= utf8proc_combinations[sidx] && idx <= utf8proc_combinations[sidx + 1] ) { | |
| 690 idx += sidx + 2 - utf8proc_combinations[sidx]; | |
| 691 if (current_property->comb_index & 0x4000) { | |
| 692 composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1]; | |
| 693 } else | |
| 694 composition = utf8proc_combinations[idx]; | |
| 695 | |
| 696 if (composition > 0 && (!(options & UTF8PROC_STABLE) || | |
| 697 !(unsafe_get_property(composition)->comp_exclusion))) { | |
| 698 *starter = composition; | |
| 699 starter_property = NULL; | |
| 700 continue; | |
| 701 } | |
| 702 } | |
| 703 } | |
| 704 } | |
| 705 buffer[wpos] = current_char; | |
| 706 if (current_property->combining_class) { | |
| 707 if (current_property->combining_class > max_combining_class) { | |
| 708 max_combining_class = current_property->combining_class; | |
| 709 } | |
| 710 } else { | |
| 711 starter = buffer + wpos; | |
| 712 starter_property = NULL; | |
| 713 max_combining_class = -1; | |
| 714 } | |
| 715 wpos++; | |
| 716 } | |
| 717 length = wpos; | |
| 718 } | |
| 719 return length; | |
| 720 } | |
| 721 | |
| 722 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { | |
| 723 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored | |
| 724 ASSERT: 'buffer' has one spare byte of free space at the end! */ | |
| 725 length = utf8proc_normalize_utf32(buffer, length, options); | |
| 726 if (length < 0) return length; | |
| 727 { | |
| 728 utf8proc_ssize_t rpos, wpos = 0; | |
| 729 utf8proc_int32_t uc; | |
| 730 if (options & UTF8PROC_CHARBOUND) { | |
| 731 for (rpos = 0; rpos < length; rpos++) { | |
| 732 uc = buffer[rpos]; | |
| 733 wpos += charbound_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); | |
| 734 } | |
| 735 } else { | |
| 736 for (rpos = 0; rpos < length; rpos++) { | |
| 737 uc = buffer[rpos]; | |
| 738 wpos += utf8proc_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); | |
| 739 } | |
| 740 } | |
| 741 ((utf8proc_uint8_t *)buffer)[wpos] = 0; | |
| 742 return wpos; | |
| 743 } | |
| 744 } | |
| 745 | |
| 746 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( | |
| 747 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options | |
| 748 ) { | |
| 749 return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL); | |
| 750 } | |
| 751 | |
| 752 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( | |
| 753 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, | |
| 754 utf8proc_custom_func custom_func, void *custom_data | |
| 755 ) { | |
| 756 utf8proc_int32_t *buffer; | |
| 757 utf8proc_ssize_t result; | |
| 758 *dstptr = NULL; | |
| 759 result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data); | |
| 760 if (result < 0) return result; | |
| 761 buffer = (utf8proc_int32_t *) malloc(((utf8proc_size_t)result) * sizeof(utf8proc_int32_t) + 1); | |
| 762 if (!buffer) return UTF8PROC_ERROR_NOMEM; | |
| 763 result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data); | |
| 764 if (result < 0) { | |
| 765 free(buffer); | |
| 766 return result; | |
| 767 } | |
| 768 result = utf8proc_reencode(buffer, result, options); | |
| 769 if (result < 0) { | |
| 770 free(buffer); | |
| 771 return result; | |
| 772 } | |
| 773 { | |
| 774 utf8proc_int32_t *newptr; | |
| 775 newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1); | |
| 776 if (newptr) buffer = newptr; | |
| 777 } | |
| 778 *dstptr = (utf8proc_uint8_t *)buffer; | |
| 779 return result; | |
| 780 } | |
| 781 | |
| 782 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) { | |
| 783 utf8proc_uint8_t *retval; | |
| 784 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | |
| 785 UTF8PROC_DECOMPOSE); | |
| 786 return retval; | |
| 787 } | |
| 788 | |
| 789 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) { | |
| 790 utf8proc_uint8_t *retval; | |
| 791 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | |
| 792 UTF8PROC_COMPOSE); | |
| 793 return retval; | |
| 794 } | |
| 795 | |
| 796 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) { | |
| 797 utf8proc_uint8_t *retval; | |
| 798 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | |
| 799 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); | |
| 800 return retval; | |
| 801 } | |
| 802 | |
| 803 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) { | |
| 804 utf8proc_uint8_t *retval; | |
| 805 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | |
| 806 UTF8PROC_COMPOSE | UTF8PROC_COMPAT); | |
| 807 return retval; | |
| 808 } | |
| 809 | |
| 810 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) { | |
| 811 utf8proc_uint8_t *retval; | |
| 812 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | | |
| 813 UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE); | |
| 814 return retval; | |
| 815 } |
