annotate dep/utf8proc/utf8proc.c @ 337:a7d4e5107531

dep/animone: REFACTOR ALL THE THINGS 1: animone now has its own syntax divergent from anisthesia, making different platforms actually have their own sections 2: process names in animone are now called `comm' (this will probably break things). this is what its called in bsd/linux so I'm just going to use it everywhere 3: the X11 code now checks for the existence of a UTF-8 window title and passes it if available 4: ANYTHING THATS NOT LINUX IS 100% UNTESTED AND CAN AND WILL BREAK! I still actually need to test the bsd code. to be honest I'm probably going to move all of the bsds into separate files because they're all essentially different operating systems at this point
author Paper <paper@paper.us.eu.org>
date Wed, 19 Jun 2024 12:51:15 -0400
parents ff0b2052b234
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
265
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
1 /* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
2 /*
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
3 * Copyright (c) 2014-2021 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
4 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
5 *
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
6 * Permission is hereby granted, free of charge, to any person obtaining a
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
7 * copy of this software and associated documentation files (the "Software"),
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
8 * to deal in the Software without restriction, including without limitation
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
10 * and/or sell copies of the Software, and to permit persons to whom the
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
11 * Software is furnished to do so, subject to the following conditions:
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
12 *
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
13 * The above copyright notice and this permission notice shall be included in
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
14 * all copies or substantial portions of the Software.
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
15 *
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
22 * DEALINGS IN THE SOFTWARE.
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
23 */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
24
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
25 /*
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
26 * This library contains derived data from a modified version of the
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
27 * Unicode data files.
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
28 *
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
29 * The original data files are available at
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
30 * https://www.unicode.org/Public/UNIDATA/
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
31 *
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
32 * Please notice the copyright statement in the file "utf8proc_data.c".
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
33 */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
34
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
35
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
36 /*
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
37 * File name: utf8proc.c
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
38 *
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
39 * Description:
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
40 * Implementation of libutf8proc.
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
41 */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
42
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
43
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
44 #include "utf8proc.h"
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
45
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
46 #ifndef SSIZE_MAX
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
47 #define SSIZE_MAX ((size_t)SIZE_MAX/2)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
48 #endif
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
49 #ifndef UINT16_MAX
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
50 # define UINT16_MAX 65535U
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
51 #endif
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
52
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
53 #include "utf8proc_data.c"
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
54
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
55
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
56 UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
57 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
58 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
59 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
60 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
61 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
62 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
63 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
64 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
69 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
70 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
71 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
72 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
73
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
/*
 * Hangul constants.
 * NOTE(review): the names match the base/count parameters of the Unicode
 * Hangul syllable (de)composition algorithm -- confirm against the code
 * that consumes them.
 */

/* Base code points. */
#define UTF8PROC_HANGUL_SBASE    0xAC00
#define UTF8PROC_HANGUL_LBASE    0x1100
#define UTF8PROC_HANGUL_VBASE    0x1161
#define UTF8PROC_HANGUL_TBASE    0x11A7

/* Counts; NCOUNT equals VCOUNT * TCOUNT and SCOUNT equals LCOUNT * NCOUNT. */
#define UTF8PROC_HANGUL_LCOUNT   19
#define UTF8PROC_HANGUL_VCOUNT   21
#define UTF8PROC_HANGUL_TCOUNT   28
#define UTF8PROC_HANGUL_NCOUNT   588
#define UTF8PROC_HANGUL_SCOUNT   11172

/* Code point ranges; every *_END value is exclusive. */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
93
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
94 /* Should follow semantic-versioning rules (semver.org) based on API
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
95 compatibility. (Note that the shared-library version number will
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
96 be different, being based on ABI compatibility.): */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
97 #define STRINGIZEx(x) #x
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
98 #define STRINGIZE(x) STRINGIZEx(x)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
99 UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
100 return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
101 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
102
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
103 UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
104 return "15.1.0";
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
105 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
106
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
107 UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
108 switch (errcode) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
109 case UTF8PROC_ERROR_NOMEM:
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
110 return "Memory for processing UTF-8 data could not be allocated.";
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
111 case UTF8PROC_ERROR_OVERFLOW:
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
112 return "UTF-8 string is too long to be processed.";
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
113 case UTF8PROC_ERROR_INVALIDUTF8:
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
114 return "Invalid UTF-8 string";
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
115 case UTF8PROC_ERROR_NOTASSIGNED:
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
116 return "Unassigned Unicode code point found in UTF-8 string.";
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
117 case UTF8PROC_ERROR_INVALIDOPTS:
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
118 return "Invalid options for UTF-8 processing chosen.";
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
119 default:
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
120 return "An unknown error occurred while processing UTF-8 data.";
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
121 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
122 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
123
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
124 #define utf_cont(ch) (((ch) & 0xc0) == 0x80)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
125 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
126 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
127 ) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
128 utf8proc_int32_t uc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
129 const utf8proc_uint8_t *end;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
130
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
131 *dst = -1;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
132 if (!strlen) return 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
133 end = str + ((strlen < 0) ? 4 : strlen);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
134 uc = *str++;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
135 if (uc < 0x80) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
136 *dst = uc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
137 return 1;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
138 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
139 // Must be between 0xc2 and 0xf4 inclusive to be valid
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
140 if ((utf8proc_uint32_t)(uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
141 if (uc < 0xe0) { // 2-byte sequence
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
142 // Must have valid continuation character
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
143 if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
144 *dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
145 return 2;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
146 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
147 if (uc < 0xf0) { // 3-byte sequence
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
148 if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1]))
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
149 return UTF8PROC_ERROR_INVALIDUTF8;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
150 // Check for surrogate chars
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
151 if (uc == 0xed && *str > 0x9f)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
152 return UTF8PROC_ERROR_INVALIDUTF8;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
153 uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
154 if (uc < 0x800)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
155 return UTF8PROC_ERROR_INVALIDUTF8;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
156 *dst = uc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
157 return 3;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
158 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
159 // 4-byte sequence
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
160 // Must have 3 valid continuation characters
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
161 if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2]))
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
162 return UTF8PROC_ERROR_INVALIDUTF8;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
163 // Make sure in correct range (0x10000 - 0x10ffff)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
164 if (uc == 0xf0) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
165 if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
166 } else if (uc == 0xf4) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
167 if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
168 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
169 *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
170 return 4;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
171 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
172
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
173 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
174 return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
175 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
176
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
177 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
178 if (uc < 0x00) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
179 return 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
180 } else if (uc < 0x80) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
181 dst[0] = (utf8proc_uint8_t) uc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
182 return 1;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
183 } else if (uc < 0x800) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
184 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
185 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
186 return 2;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
187 // Note: we allow encoding 0xd800-0xdfff here, so as not to change
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
188 // the API, however, these are actually invalid in UTF-8
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
189 } else if (uc < 0x10000) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
190 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
191 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
192 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
193 return 3;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
194 } else if (uc < 0x110000) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
195 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
196 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
197 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
198 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
199 return 4;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
200 } else return 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
201 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
202
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
203 /* internal version used for inserting 0xff bytes between graphemes */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
204 static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
205 if (uc < 0x00) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
206 if (uc == -1) { /* internal value used for grapheme breaks */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
207 dst[0] = (utf8proc_uint8_t)0xFF;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
208 return 1;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
209 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
210 return 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
211 } else if (uc < 0x80) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
212 dst[0] = (utf8proc_uint8_t)uc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
213 return 1;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
214 } else if (uc < 0x800) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
215 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
216 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
217 return 2;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
218 } else if (uc < 0x10000) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
219 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
220 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
221 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
222 return 3;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
223 } else if (uc < 0x110000) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
224 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
225 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
226 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
227 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
228 return 4;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
229 } else return 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
230 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
231
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
232 /* internal "unsafe" version that does not check whether uc is in range */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
233 static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
234 /* ASSERT: uc >= 0 && uc < 0x110000 */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
235 return utf8proc_properties + (
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
236 utf8proc_stage2table[
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
237 utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
238 ]
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
239 );
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
240 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
241
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
242 UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
243 return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
244 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
245
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
246 /* return whether there is a grapheme break between boundclasses lbc and tbc
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
247 (according to the definition of extended grapheme clusters)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
248
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
249 Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
250 http://www.unicode.org/reports/tr29/tr29-29.html
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
251
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
252 CAVEATS:
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
253 Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
254 and GB 12/13 (regional indicator code points) require knowledge of previous characters
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
255 and are thus not handled by this function. This may result in an incorrect break before
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
256 an E_Modifier class codepoint and an incorrectly missing break between two
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
257 REGIONAL_INDICATOR class code points if such support does not exist in the caller.
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
258
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
259 See the special support in grapheme_break_extended, for required bookkeeping by the caller.
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
260 */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
261 static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
262 return
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
263 (lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
264 (lbc == UTF8PROC_BOUNDCLASS_CR && // GB3
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
265 tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // ---
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
266 (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
267 (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
268 (lbc == UTF8PROC_BOUNDCLASS_L && // GB6
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
269 (tbc == UTF8PROC_BOUNDCLASS_L || // ---
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
270 tbc == UTF8PROC_BOUNDCLASS_V || // ---
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
271 tbc == UTF8PROC_BOUNDCLASS_LV || // ---
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
272 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // ---
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
273 ((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
274 lbc == UTF8PROC_BOUNDCLASS_V) && // ---
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
275 (tbc == UTF8PROC_BOUNDCLASS_V || // ---
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
276 tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // ---
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
277 ((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
278 lbc == UTF8PROC_BOUNDCLASS_T) && // ---
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
279 tbc == UTF8PROC_BOUNDCLASS_T) ? false : // ---
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
280 (tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
281 tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
282 tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
283 lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
284 (lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
285 tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
286 (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
287 tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
288 true; // GB999
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
289 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
290
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
291 static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
292 {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
293 if (state) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
294 int state_bc, state_icb; /* boundclass and indic_conjunct_break state */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
295 if (*state == 0) { /* state initialization */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
296 state_bc = lbc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
297 state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
298 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
299 else { /* lbc and licb are already encoded in *state */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
300 state_bc = *state & 0xff; // 1st byte of state is bound class
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
301 state_icb = *state >> 8; // 2nd byte of state is indic conjunct break
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
302 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
303
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
304 utf8proc_bool break_permitted = grapheme_break_simple(state_bc, tbc) &&
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
305 !(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
306 && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
307
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
308 // Special support for GB9c. Don't break between two consonants
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
309 // separated 1+ linker characters and 0+ extend characters in any order.
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
310 // After a consonant, we enter LINKER state after at least one linker.
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
311 if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
312 || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
313 || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
314 state_icb = ticb;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
315 else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
316 state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ?
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
317 UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
318
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
319 // Special support for GB 12/13 made possible by GB999. After two RI
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
320 // class codepoints we want to force a break. Do this by resetting the
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
321 // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
322 // after that character according to GB999 (unless of course such a break is
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
323 // forbidden by a different rule such as GB9).
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
324 if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
325 state_bc = UTF8PROC_BOUNDCLASS_OTHER;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
326 // Special support for GB11 (emoji extend* zwj / emoji)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
327 else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
328 if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
329 state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
330 else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
331 state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
332 else
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
333 state_bc = tbc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
334 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
335 else
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
336 state_bc = tbc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
337
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
338 *state = state_bc + (state_icb << 8);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
339 return break_permitted;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
340 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
341 else
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
342 return grapheme_break_simple(lbc, tbc);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
343 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
344
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
345 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
346 utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
347
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
348 const utf8proc_property_t *p1 = utf8proc_get_property(c1);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
349 const utf8proc_property_t *p2 = utf8proc_get_property(c2);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
350 return grapheme_break_extended(p1->boundclass,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
351 p2->boundclass,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
352 p1->indic_conjunct_break,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
353 p2->indic_conjunct_break,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
354 state);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
355 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
356
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
357
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
358 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
359 utf8proc_int32_t c1, utf8proc_int32_t c2) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
360 return utf8proc_grapheme_break_stateful(c1, c2, NULL);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
361 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
362
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
363 static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
364 {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
365 utf8proc_int32_t entry_cp = **entry;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
366 if ((entry_cp & 0xF800) == 0xD800) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
367 *entry = *entry + 1;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
368 entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
369 entry_cp += 0x10000;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
370 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
371 return entry_cp;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
372 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
373
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
374 static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
375 {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
376 const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
377 return seqindex_decode_entry(&entry);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
378 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
379
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
380 static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
381 utf8proc_ssize_t written = 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
382 const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x3FFF];
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
383 int len = seqindex >> 14;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
384 if (len >= 3) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
385 len = *entry;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
386 entry++;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
387 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
388 for (; len >= 0; entry++, len--) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
389 utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
390
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
391 written += utf8proc_decompose_char(entry_cp, dst+written,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
392 (bufsize > written) ? (bufsize - written) : 0, options,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
393 last_boundclass);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
394 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
395 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
396 return written;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
397 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
398
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
399 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
400 {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
401 utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
402 return cl != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cl) : c;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
403 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
404
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
405 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
406 {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
407 utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
408 return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
409 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
410
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
411 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
412 {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
413 utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
414 return cu != UINT16_MAX ? seqindex_decode_index((utf8proc_uint32_t)cu) : c;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
415 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
416
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
417 UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
418 {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
419 const utf8proc_property_t *p = utf8proc_get_property(c);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
420 return p->lowercase_seqindex != p->uppercase_seqindex && p->lowercase_seqindex == UINT16_MAX;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
421 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
422
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
423 UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
424 {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
425 const utf8proc_property_t *p = utf8proc_get_property(c);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
426 return p->lowercase_seqindex != p->uppercase_seqindex && p->uppercase_seqindex == UINT16_MAX && p->category != UTF8PROC_CATEGORY_LT;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
427 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
428
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
429 /* return a character width analogous to wcwidth (except portable and
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
430 hopefully less buggy than most system wcwidth functions). */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
431 UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
432 return utf8proc_get_property(c)->charwidth;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
433 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
434
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
435 UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
436 return (utf8proc_category_t) utf8proc_get_property(c)->category;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
437 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
438
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
439 UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
440 static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
441 return s[utf8proc_category(c)];
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
442 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
443
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
/* Expands to a `return` statement that re-runs utf8proc_decompose_char on
 * replacement_uc with the UTF8PROC_LUMP bit cleared from options. It can only
 * be used inside a function that has dst, bufsize, options and
 * last_boundclass in scope. */
#define utf8proc_decompose_lump(replacement_uc) \
  return utf8proc_decompose_char( \
    (replacement_uc), dst, bufsize, \
    options & ~(unsigned int)UTF8PROC_LUMP, last_boundclass)
447
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
448 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
449 const utf8proc_property_t *property;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
450 utf8proc_propval_t category;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
451 utf8proc_int32_t hangul_sindex;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
452 if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
453 property = unsafe_get_property(uc);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
454 category = property->category;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
455 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
456 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
457 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
458 utf8proc_int32_t hangul_tindex;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
459 if (bufsize >= 1) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
460 dst[0] = UTF8PROC_HANGUL_LBASE +
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
461 hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
462 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
463 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
464 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
465 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
466 if (!hangul_tindex) return 2;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
467 if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
468 return 3;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
469 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
470 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
471 if (options & UTF8PROC_REJECTNA) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
472 if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
473 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
474 if (options & UTF8PROC_IGNORE) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
475 if (property->ignorable) return 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
476 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
477 if (options & UTF8PROC_STRIPNA) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
478 if (!category) return 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
479 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
480 if (options & UTF8PROC_LUMP) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
481 if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
482 if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
483 utf8proc_decompose_lump(0x0027);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
484 if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
485 utf8proc_decompose_lump(0x002D);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
486 if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
487 if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
488 if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
489 utf8proc_decompose_lump(0x003C);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
490 if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
491 utf8proc_decompose_lump(0x003E);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
492 if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
493 if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
494 utf8proc_decompose_lump(0x005E);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
495 if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
496 utf8proc_decompose_lump(0x005F);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
497 if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
498 if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
499 if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
500 if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
501 if (category == UTF8PROC_CATEGORY_ZL ||
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
502 category == UTF8PROC_CATEGORY_ZP)
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
503 utf8proc_decompose_lump(0x000A);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
504 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
505 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
506 if (options & UTF8PROC_STRIPMARK) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
507 if (category == UTF8PROC_CATEGORY_MN ||
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
508 category == UTF8PROC_CATEGORY_MC ||
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
509 category == UTF8PROC_CATEGORY_ME) return 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
510 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
511 if (options & UTF8PROC_CASEFOLD) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
512 if (property->casefold_seqindex != UINT16_MAX) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
513 return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
514 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
515 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
516 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
517 if (property->decomp_seqindex != UINT16_MAX &&
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
518 (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
519 return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
520 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
521 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
522 if (options & UTF8PROC_CHARBOUND) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
523 utf8proc_bool boundary;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
524 boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
525 last_boundclass);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
526 if (boundary) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
527 if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
528 if (bufsize >= 2) dst[1] = uc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
529 return 2;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
530 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
531 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
532 if (bufsize >= 1) *dst = uc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
533 return 1;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
534 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
535
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
536 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
537 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
538 utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
539 ) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
540 return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
541 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
542
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
543 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
544 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
545 utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
546 utf8proc_custom_func custom_func, void *custom_data
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
547 ) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
548 /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
549 utf8proc_ssize_t wpos = 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
550 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
551 return UTF8PROC_ERROR_INVALIDOPTS;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
552 if ((options & UTF8PROC_STRIPMARK) &&
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
553 !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
554 return UTF8PROC_ERROR_INVALIDOPTS;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
555 {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
556 utf8proc_int32_t uc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
557 utf8proc_ssize_t rpos = 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
558 utf8proc_ssize_t decomp_result;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
559 int boundclass = UTF8PROC_BOUNDCLASS_START;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
560 while (1) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
561 if (options & UTF8PROC_NULLTERM) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
562 rpos += utf8proc_iterate(str + rpos, -1, &uc);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
563 /* checking of return value is not necessary,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
564 as 'uc' is < 0 in case of error */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
565 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
566 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
567 if (uc == 0) break;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
568 } else {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
569 if (rpos >= strlen) break;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
570 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
571 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
572 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
573 if (custom_func != NULL) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
574 uc = custom_func(uc, custom_data); /* user-specified custom mapping */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
575 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
576 decomp_result = utf8proc_decompose_char(
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
577 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
578 &boundclass
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
579 );
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
580 if (decomp_result < 0) return decomp_result;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
581 wpos += decomp_result;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
582 /* prohibiting integer overflows due to too long strings: */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
583 if (wpos < 0 ||
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
584 wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2))
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
585 return UTF8PROC_ERROR_OVERFLOW;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
586 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
587 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
588 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
589 utf8proc_ssize_t pos = 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
590 while (pos < wpos-1) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
591 utf8proc_int32_t uc1, uc2;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
592 const utf8proc_property_t *property1, *property2;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
593 uc1 = buffer[pos];
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
594 uc2 = buffer[pos+1];
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
595 property1 = unsafe_get_property(uc1);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
596 property2 = unsafe_get_property(uc2);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
597 if (property1->combining_class > property2->combining_class &&
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
598 property2->combining_class > 0) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
599 buffer[pos] = uc2;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
600 buffer[pos+1] = uc1;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
601 if (pos > 0) pos--; else pos++;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
602 } else {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
603 pos++;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
604 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
605 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
606 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
607 return wpos;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
608 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
609
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
610 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
611 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
612 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
613 utf8proc_ssize_t rpos;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
614 utf8proc_ssize_t wpos = 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
615 utf8proc_int32_t uc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
616 for (rpos = 0; rpos < length; rpos++) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
617 uc = buffer[rpos];
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
618 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
619 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
620 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
621 if (options & UTF8PROC_NLF2LS) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
622 if (options & UTF8PROC_NLF2PS) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
623 buffer[wpos++] = 0x000A;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
624 } else {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
625 buffer[wpos++] = 0x2028;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
626 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
627 } else {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
628 if (options & UTF8PROC_NLF2PS) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
629 buffer[wpos++] = 0x2029;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
630 } else {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
631 buffer[wpos++] = 0x0020;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
632 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
633 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
634 } else if ((options & UTF8PROC_STRIPCC) &&
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
635 (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
636 if (uc == 0x0009) buffer[wpos++] = 0x0020;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
637 } else {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
638 buffer[wpos++] = uc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
639 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
640 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
641 length = wpos;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
642 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
643 if (options & UTF8PROC_COMPOSE) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
644 utf8proc_int32_t *starter = NULL;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
645 utf8proc_int32_t current_char;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
646 const utf8proc_property_t *starter_property = NULL, *current_property;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
647 utf8proc_propval_t max_combining_class = -1;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
648 utf8proc_ssize_t rpos;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
649 utf8proc_ssize_t wpos = 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
650 utf8proc_int32_t composition;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
651 for (rpos = 0; rpos < length; rpos++) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
652 current_char = buffer[rpos];
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
653 current_property = unsafe_get_property(current_char);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
654 if (starter && current_property->combining_class > max_combining_class) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
655 /* combination perhaps possible */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
656 utf8proc_int32_t hangul_lindex;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
657 utf8proc_int32_t hangul_sindex;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
658 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
659 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
660 utf8proc_int32_t hangul_vindex;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
661 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
662 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
663 *starter = UTF8PROC_HANGUL_SBASE +
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
664 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
665 UTF8PROC_HANGUL_TCOUNT;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
666 starter_property = NULL;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
667 continue;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
668 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
669 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
670 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
671 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
672 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
673 utf8proc_int32_t hangul_tindex;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
674 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
675 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
676 *starter += hangul_tindex;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
677 starter_property = NULL;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
678 continue;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
679 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
680 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
681 if (!starter_property) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
682 starter_property = unsafe_get_property(*starter);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
683 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
684 if (starter_property->comb_index < 0x8000 &&
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
685 current_property->comb_index != UINT16_MAX &&
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
686 current_property->comb_index >= 0x8000) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
687 int sidx = starter_property->comb_index;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
688 int idx = current_property->comb_index & 0x3FFF;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
689 if (idx >= utf8proc_combinations[sidx] && idx <= utf8proc_combinations[sidx + 1] ) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
690 idx += sidx + 2 - utf8proc_combinations[sidx];
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
691 if (current_property->comb_index & 0x4000) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
692 composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
693 } else
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
694 composition = utf8proc_combinations[idx];
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
695
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
696 if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
697 !(unsafe_get_property(composition)->comp_exclusion))) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
698 *starter = composition;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
699 starter_property = NULL;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
700 continue;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
701 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
702 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
703 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
704 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
705 buffer[wpos] = current_char;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
706 if (current_property->combining_class) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
707 if (current_property->combining_class > max_combining_class) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
708 max_combining_class = current_property->combining_class;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
709 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
710 } else {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
711 starter = buffer + wpos;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
712 starter_property = NULL;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
713 max_combining_class = -1;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
714 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
715 wpos++;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
716 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
717 length = wpos;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
718 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
719 return length;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
720 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
721
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
722 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
723 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
724 ASSERT: 'buffer' has one spare byte of free space at the end! */
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
725 length = utf8proc_normalize_utf32(buffer, length, options);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
726 if (length < 0) return length;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
727 {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
728 utf8proc_ssize_t rpos, wpos = 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
729 utf8proc_int32_t uc;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
730 if (options & UTF8PROC_CHARBOUND) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
731 for (rpos = 0; rpos < length; rpos++) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
732 uc = buffer[rpos];
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
733 wpos += charbound_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
734 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
735 } else {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
736 for (rpos = 0; rpos < length; rpos++) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
737 uc = buffer[rpos];
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
738 wpos += utf8proc_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
739 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
740 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
741 ((utf8proc_uint8_t *)buffer)[wpos] = 0;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
742 return wpos;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
743 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
744 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
745
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
746 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
747 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
748 ) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
749 return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
750 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
751
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
752 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
753 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
754 utf8proc_custom_func custom_func, void *custom_data
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
755 ) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
756 utf8proc_int32_t *buffer;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
757 utf8proc_ssize_t result;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
758 *dstptr = NULL;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
759 result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
760 if (result < 0) return result;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
761 buffer = (utf8proc_int32_t *) malloc(((utf8proc_size_t)result) * sizeof(utf8proc_int32_t) + 1);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
762 if (!buffer) return UTF8PROC_ERROR_NOMEM;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
763 result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
764 if (result < 0) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
765 free(buffer);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
766 return result;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
767 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
768 result = utf8proc_reencode(buffer, result, options);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
769 if (result < 0) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
770 free(buffer);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
771 return result;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
772 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
773 {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
774 utf8proc_int32_t *newptr;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
775 newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
776 if (newptr) buffer = newptr;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
777 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
778 *dstptr = (utf8proc_uint8_t *)buffer;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
779 return result;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
780 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
781
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
782 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
783 utf8proc_uint8_t *retval;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
784 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
785 UTF8PROC_DECOMPOSE);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
786 return retval;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
787 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
788
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
789 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
790 utf8proc_uint8_t *retval;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
791 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
792 UTF8PROC_COMPOSE);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
793 return retval;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
794 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
795
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
796 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
797 utf8proc_uint8_t *retval;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
798 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
799 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
800 return retval;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
801 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
802
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
803 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
804 utf8proc_uint8_t *retval;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
805 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
806 UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
807 return retval;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
808 }
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
809
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
810 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) {
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
811 utf8proc_uint8_t *retval;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
812 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
813 UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE);
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
814 return retval;
ff0b2052b234 *: add missing utf8proc files
Paper <paper@paper.us.eu.org>
parents:
diff changeset
815 }