annotate dep/utf8proc/test/graphemetest.c @ 343:1faa72660932

*: transfer back to cmake from autotools autotools just made lots of things more complicated than they should have and many things broke (i.e. translations)
author Paper <paper@paper.us.eu.org>
date Thu, 20 Jun 2024 05:56:06 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
343
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
1 #include "tests.h"
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
2
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
3 /* check one line in the format of GraphemeBreakTest.txt */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
4 void checkline(const char *_buf, bool verbose) {
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
5 size_t bi = 0, si = 0;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
6 utf8proc_uint8_t src[1024]; /* more than long enough for all of our tests */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
7 const unsigned char *buf = (const unsigned char *) _buf;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
8
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
9 while (buf[bi]) {
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
10 bi = skipspaces(buf, bi);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
11 if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
12 src[si++] = '/';
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
13 bi += 2;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
14 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
15 else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
16 bi += 2;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
17 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
18 else if (buf[bi] == '#') { /* start of comments */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
19 break;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
20 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
21 else if (buf[bi] == '/') { /* for convenience, also accept / as grapheme break */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
22 src[si++] = '/';
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
23 bi += 1;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
24 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
25 else { /* hex-encoded codepoint */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
26 size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
27 while (src[si]) ++si; /* advance to NUL termination */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
28 bi += len;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
29 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
30 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
31 if (si && src[si-1] == '/')
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
32 --si; /* no break after final grapheme */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
33 src[si] = 0; /* NUL-terminate */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
34
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
35 if (si) { /* test utf8proc_map */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
36 utf8proc_uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
37 size_t i = 0, j = 0;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
38 utf8proc_ssize_t glen, k;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
39 utf8proc_uint8_t *g; /* utf8proc_map grapheme results */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
40 while (i < si) {
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
41 if (src[i] != '/')
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
42 utf8[j++] = src[i++];
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
43 else
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
44 i++;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
45 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
46 glen = utf8proc_map(utf8, (utf8proc_ssize_t)j, &g, UTF8PROC_CHARBOUND);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
47 if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
48 /* the test file contains surrogate codepoints, which are only for UTF-16 */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
49 printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
50 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
51 else {
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
52 check(glen >= 0, "utf8proc_map error = %s",
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
53 utf8proc_errmsg(glen));
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
54 for (k = 0; k <= glen; ++k)
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
55 if (g[k] == 0xff)
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
56 g[k] = '/'; /* easier-to-read output (/ is not in test strings) */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
57 check(!strcmp((char*)g, (char*)src),
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
58 "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
59 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
60 free(g);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
61 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
62
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
63 if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
64 utf8proc_int32_t state = 0, prev_codepoint = 0;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
65 size_t i = 0;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
66 utf8proc_bool expectbreak = false;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
67 do {
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
68 utf8proc_int32_t codepoint;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
69 i += (size_t)utf8proc_iterate(src + i, (utf8proc_ssize_t)(si - i), &codepoint);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
70 check(codepoint >= 0, "invalid UTF-8 data");
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
71 if (codepoint == 0x002F)
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
72 expectbreak = true;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
73 else {
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
74 if (prev_codepoint != 0) {
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
75 check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state),
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
76 "grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
77 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
78 expectbreak = false;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
79 prev_codepoint = codepoint;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
80 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
81 } while (i < si);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
82 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
83
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
84 if (verbose)
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
85 printf("passed grapheme test: \"%s\"\n", (char*) src);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
86 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
87
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
88 int main(int argc, char **argv)
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
89 {
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
90 unsigned char buf[8192];
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
91 FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
92
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
93 check(f != NULL, "error opening GraphemeBreakTest.txt");
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
94 while (simple_getline(buf, f) > 0) {
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
95 if ((++lineno) % 100 == 0)
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
96 printf("checking line %zd...\n", lineno);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
97 if (buf[0] == '#') continue;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
98 checkline((char *) buf, false);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
99 }
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
100 fclose(f);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
101 printf("Passed tests after %zd lines!\n", lineno);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
102
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
103 printf("Performing regression tests...\n");
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
104
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
105 /* issue 144 */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
106 {
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
107 utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
108 utf8proc_uint8_t output[] = {0xff,0xef,0xbf,0xbf,0xff,0xef,0xbf,0xbe,0x00}; /* with 0xff grapheme markers */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
109 utf8proc_ssize_t glen;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
110 utf8proc_uint8_t *g;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
111 glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
112 check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks");
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
113 check(glen != 6, "mishandled u+ffff and u+fffe grapheme breaks");
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
114 free(g);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
115 };
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
116
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
117 /* https://github.com/JuliaLang/julia/issues/37680 */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
118 checkline("/ 1f1f8 1f1ea / 1f1f8 1f1ea /", true); /* Two swedish flags after each other */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
119 checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
120 checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
121
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
122 /* more GB9c tests */
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
123 checkline("/ 0915 0300 094d 0300 0924 / 0915 /", true);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
124 checkline("/ 0915 0300 094d 0300 094d 0924 / 0915 /", true);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
125 checkline("/ 0915 0300 0300 / 0924 / 0915 /", true);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
126 checkline("/ 0915 0300 094d 0300 / 0078 /", true);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
127 checkline("/ 0300 094d 0300 / 0924 / 0915 /", true);
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
128
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
129 check(utf8proc_grapheme_break(0x03b1, 0x03b2), "failed 03b1 / 03b2 test");
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
130 check(!utf8proc_grapheme_break(0x03b1, 0x0302), "failed 03b1 0302 test");
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
131
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
132 printf("Passed regression tests!\n");
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
133
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
134 return 0;
1faa72660932 *: transfer back to cmake from autotools
Paper <paper@paper.us.eu.org>
parents:
diff changeset
135 }