Mercurial > minori
annotate dep/utf8proc/test/graphemetest.c @ 350:daa03aa2262d
sys/glib: general cleanup, use C++ principles, add more suffixes
author | Paper <paper@paper.us.eu.org> |
---|---|
date | Sun, 14 Jul 2024 19:12:40 -0400 |
parents | 1faa72660932 |
children |
rev | line source |
---|---|
343
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
1 #include "tests.h" |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
2 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
3 /* check one line in the format of GraphemeBreakTest.txt */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
4 void checkline(const char *_buf, bool verbose) { |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
5 size_t bi = 0, si = 0; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
6 utf8proc_uint8_t src[1024]; /* more than long enough for all of our tests */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
7 const unsigned char *buf = (const unsigned char *) _buf; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
8 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
9 while (buf[bi]) { |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
10 bi = skipspaces(buf, bi); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
11 if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
12 src[si++] = '/'; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
13 bi += 2; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
14 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
15 else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
16 bi += 2; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
17 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
18 else if (buf[bi] == '#') { /* start of comments */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
19 break; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
20 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
21 else if (buf[bi] == '/') { /* for convenience, also accept / as grapheme break */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
22 src[si++] = '/'; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
23 bi += 1; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
24 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
25 else { /* hex-encoded codepoint */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
26 size_t len = encode((unsigned char*) (src + si), buf + bi) - 1; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
27 while (src[si]) ++si; /* advance to NUL termination */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
28 bi += len; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
29 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
30 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
31 if (si && src[si-1] == '/') |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
32 --si; /* no break after final grapheme */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
33 src[si] = 0; /* NUL-terminate */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
34 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
35 if (si) { /* test utf8proc_map */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
36 utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
37 size_t i = 0, j = 0; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
38 utf8proc_ssize_t glen, k; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
39 utf8proc_uint8_t *g; /* utf8proc_map grapheme results */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
40 while (i < si) { |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
41 if (src[i] != '/') |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
42 utf8[j++] = src[i++]; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
43 else |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
44 i++; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
45 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
46 glen = utf8proc_map(utf8, (utf8proc_ssize_t)j, &g, UTF8PROC_CHARBOUND); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
47 if (glen == UTF8PROC_ERROR_INVALIDUTF8) { |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
48 /* the test file contains surrogate codepoints, which are only for UTF-16 */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
49 printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
50 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
51 else { |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
52 check(glen >= 0, "utf8proc_map error = %s", |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
53 utf8proc_errmsg(glen)); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
54 for (k = 0; k <= glen; ++k) |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
55 if (g[k] == 0xff) |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
56 g[k] = '/'; /* easier-to-read output (/ is not in test strings) */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
57 check(!strcmp((char*)g, (char*)src), |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
58 "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
59 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
60 free(g); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
61 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
62 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
63 if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
64 utf8proc_int32_t state = 0, prev_codepoint = 0; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
65 size_t i = 0; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
66 utf8proc_bool expectbreak = false; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
67 do { |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
68 utf8proc_int32_t codepoint; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
69 i += (size_t)utf8proc_iterate(src + i, (utf8proc_ssize_t)(si - i), &codepoint); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
70 check(codepoint >= 0, "invalid UTF-8 data"); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
71 if (codepoint == 0x002F) |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
72 expectbreak = true; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
73 else { |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
74 if (prev_codepoint != 0) { |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
75 check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state), |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
76 "grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
77 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
78 expectbreak = false; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
79 prev_codepoint = codepoint; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
80 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
81 } while (i < si); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
82 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
83 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
84 if (verbose) |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
85 printf("passed grapheme test: \"%s\"\n", (char*) src); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
86 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
87 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
88 int main(int argc, char **argv) |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
89 { |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
90 unsigned char buf[8192]; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
91 FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
92 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
93 check(f != NULL, "error opening GraphemeBreakTest.txt"); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
94 while (simple_getline(buf, f) > 0) { |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
95 if ((++lineno) % 100 == 0) |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
96 printf("checking line %zd...\n", lineno); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
97 if (buf[0] == '#') continue; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
98 checkline((char *) buf, false); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
99 } |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
100 fclose(f); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
101 printf("Passed tests after %zd lines!\n", lineno); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
102 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
103 printf("Performing regression tests...\n"); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
104 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
105 /* issue 144 */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
106 { |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
107 utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
108 utf8proc_uint8_t output[] = {0xff,0xef,0xbf,0xbf,0xff,0xef,0xbf,0xbe,0x00}; /* with 0xff grapheme markers */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
109 utf8proc_ssize_t glen; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
110 utf8proc_uint8_t *g; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
111 glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
112 check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks"); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
113 check(glen != 6, "mishandled u+ffff and u+fffe grapheme breaks"); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
114 free(g); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
115 }; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
116 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
117 /* https://github.com/JuliaLang/julia/issues/37680 */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
118 checkline("/ 1f1f8 1f1ea / 1f1f8 1f1ea /", true); /* Two swedish flags after each other */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
119 checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
120 checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
121 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
122 /* more GB9c tests */ |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
123 checkline("/ 0915 0300 094d 0300 0924 / 0915 /", true); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
124 checkline("/ 0915 0300 094d 0300 094d 0924 / 0915 /", true); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
125 checkline("/ 0915 0300 0300 / 0924 / 0915 /", true); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
126 checkline("/ 0915 0300 094d 0300 / 0078 /", true); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
127 checkline("/ 0300 094d 0300 / 0924 / 0915 /", true); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
128 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
129 check(utf8proc_grapheme_break(0x03b1, 0x03b2), "failed 03b1 / 03b2 test"); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
130 check(!utf8proc_grapheme_break(0x03b1, 0x0302), "failed 03b1 0302 test"); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
131 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
132 printf("Passed regression tests!\n"); |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
133 |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
134 return 0; |
1faa72660932
*: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff
changeset
|
135 } |