Mercurial > minori
comparison dep/utf8proc/test/graphemetest.c @ 343:1faa72660932
*: transfer back to cmake from autotools
autotools just made lots of things more complicated than
they should have and many things broke (i.e. translations)
| author | Paper <paper@paper.us.eu.org> |
|---|---|
| date | Thu, 20 Jun 2024 05:56:06 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 342:adb79bdde329 | 343:1faa72660932 |
|---|---|
| 1 #include "tests.h" | |
| 2 | |
| 3 /* check one line in the format of GraphemeBreakTest.txt: parse _buf into src (UTF-8 with '/' marking each expected grapheme break), then verify utf8proc_map with UTF8PROC_CHARBOUND and utf8proc_grapheme_break_stateful against it; prints a "passed" message when verbose is true */ | |
| 4 void checkline(const char *_buf, bool verbose) { | |
| 5 size_t bi = 0, si = 0; /* bi: read index into buf; si: write index into src */ | |
| 6 utf8proc_uint8_t src[1024]; /* more than long enough for all of our tests; NOTE(review): si is never checked against this size below */ | |
| 7 const unsigned char *buf = (const unsigned char *) _buf; /* unsigned view so the 0xc3/0xb7/0x97 byte comparisons below do not depend on char signedness */ | |
| 8 | |
| 9 while (buf[bi]) { | |
| 10 bi = skipspaces(buf, bi); /* helper defined outside this view; presumably returns the index of the next non-space byte */ | |
| 11 if (buf[bi] == 0xc3 && buf[bi+1] == 0xb7) { /* U+00f7 = grapheme break (UTF-8 bytes c3 b7) */ | |
| 12 src[si++] = '/'; | |
| 13 bi += 2; | |
| 14 } | |
| 15 else if (buf[bi] == 0xc3 && buf[bi+1] == 0x97) { /* U+00d7 = no break (UTF-8 bytes c3 97); nothing is written to src */ | |
| 16 bi += 2; | |
| 17 } | |
| 18 else if (buf[bi] == '#') { /* start of comments */ | |
| 19 break; | |
| 20 } | |
| 21 else if (buf[bi] == '/') { /* for convenience, also accept / as grapheme break */ | |
| 22 src[si++] = '/'; | |
| 23 bi += 1; | |
| 24 } | |
| 25 else { /* hex-encoded codepoint */ | |
| 26 size_t len = encode((unsigned char*) (src + si), buf + bi) - 1; /* encode is defined outside this view; presumably writes NUL-terminated UTF-8 at src + si and returns input bytes consumed plus one - TODO confirm */ | |
| 27 while (src[si]) ++si; /* advance to NUL termination */ | |
| 28 bi += len; | |
| 29 } | |
| 30 } | |
| 31 if (si && src[si-1] == '/') | |
| 32 --si; /* no break after final grapheme */ | |
| 33 src[si] = 0; /* NUL-terminate */ | |
| 34 | |
| 35 if (si) { /* test utf8proc_map */ | |
| 36 utf8proc_uint8_t utf8[1024]; /* copy of src without the '/' grapheme separators */ | |
| 37 size_t i = 0, j = 0; /* i: read index into src; j: write index into utf8 */ | |
| 38 utf8proc_ssize_t glen, k; | |
| 39 utf8proc_uint8_t *g; /* utf8proc_map grapheme results */ | |
| 40 while (i < si) { | |
| 41 if (src[i] != '/') | |
| 42 utf8[j++] = src[i++]; | |
| 43 else | |
| 44 i++; | |
| 45 } | |
| 46 glen = utf8proc_map(utf8, (utf8proc_ssize_t)j, &g, UTF8PROC_CHARBOUND); /* the result is expected to carry 0xff bytes at grapheme boundaries, which the loop below rewrites to '/' */ | |
| 47 if (glen == UTF8PROC_ERROR_INVALIDUTF8) { | |
| 48 /* the test file contains surrogate codepoints, which are only for UTF-16 */ | |
| 49 printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno); /* lineno is declared outside this view; NOTE(review): %zd is the signed specifier, %zu would match if lineno is size_t - confirm */ | |
| 50 } | |
| 51 else { | |
| 52 check(glen >= 0, "utf8proc_map error = %s", | |
| 53 utf8proc_errmsg(glen)); | |
| 54 for (k = 0; k <= glen; ++k) /* k == glen also visits the byte just past the mapped data (presumably the NUL terminator) */ | |
| 55 if (g[k] == 0xff) | |
| 56 g[k] = '/'; /* easier-to-read output (/ is not in test strings) */ | |
| 57 check(!strcmp((char*)g, (char*)src), /* after the rewrite, g must equal the parsed src byte for byte */ | |
| 58 "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src); | |
| 59 } | |
| 60 free(g); /* reached on both branches; NOTE(review): relies on utf8proc_map storing a freeable value in g even when it returns an error - confirm */ | |
| 61 } | |
| 62 | |
| 63 if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */ | |
| 64 utf8proc_int32_t state = 0, prev_codepoint = 0; /* prev_codepoint == 0 means no previous codepoint has been seen yet */ | |
| 65 size_t i = 0; | |
| 66 utf8proc_bool expectbreak = false; /* set when a '/' marker precedes the next real codepoint */ | |
| 67 do { | |
| 68 utf8proc_int32_t codepoint; | |
| 69 i += (size_t)utf8proc_iterate(src + i, (utf8proc_ssize_t)(si - i), &codepoint); /* advance by the value utf8proc_iterate returns; validity is checked on the next line */ | |
| 70 check(codepoint >= 0, "invalid UTF-8 data"); | |
| 71 if (codepoint == 0x002F) /* 0x2F is '/', the break marker written during parsing */ | |
| 72 expectbreak = true; | |
| 73 else { | |
| 74 if (prev_codepoint != 0) { | |
| 75 check(expectbreak == utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &state), | |
| 76 "grapheme mismatch: between 0x%04x and 0x%04x in \"%s\"", prev_codepoint, codepoint, (char*) src); | |
| 77 } | |
| 78 expectbreak = false; | |
| 79 prev_codepoint = codepoint; | |
| 80 } | |
| 81 } while (i < si); | |
| 82 } | |
| 83 | |
| 84 if (verbose) | |
| 85 printf("passed grapheme test: \"%s\"\n", (char*) src); | |
| 86 } | |
| 87 | |
| 88 int main(int argc, char **argv) /* runs checkline on every non-comment line of the file named by argv[1], then a fixed set of regression checks; returns 0 when execution reaches the end */ | |
| 89 { | |
| 90 unsigned char buf[8192]; /* line buffer filled by simple_getline */ | |
| 91 FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL; /* NULL when no path argument is given, so the check below reports it */ | |
| 92 | |
| 93 check(f != NULL, "error opening GraphemeBreakTest.txt"); /* check is defined outside this view; presumably stops the test when the condition is false */ | |
| 94 while (simple_getline(buf, f) > 0) { /* helper defined outside this view; the loop ends when it returns 0 or less */ | |
| 95 if ((++lineno) % 100 == 0) /* lineno is declared outside this view; progress message every 100 lines */ | |
| 96 printf("checking line %zd...\n", lineno); /* NOTE(review): %zd is the signed specifier, %zu would match if lineno is size_t - confirm */ | |
| 97 if (buf[0] == '#') continue; /* skip lines that start with a comment marker */ | |
| 98 checkline((char *) buf, false); | |
| 99 } | |
| 100 fclose(f); | |
| 101 printf("Passed tests after %zd lines!\n", lineno); | |
| 102 | |
| 103 printf("Performing regression tests...\n"); | |
| 104 | |
| 105 /* issue 144 */ | |
| 106 { | |
| 107 utf8proc_uint8_t input[] = {0xef,0xbf,0xbf,0xef,0xbf,0xbe,0x00}; /* "\uffff\ufffe" */ | |
| 108 utf8proc_uint8_t output[] = {0xff,0xef,0xbf,0xbf,0xff,0xef,0xbf,0xbe,0x00}; /* with 0xff grapheme markers */ | |
| 109 utf8proc_ssize_t glen; | |
| 110 utf8proc_uint8_t *g; | |
| 111 glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND); /* 6 = bytes in input, not counting the trailing 0x00 */ | |
| 112 check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks"); /* NOTE(review): g is read by strcmp before glen is inspected for an error value */ | |
| 113 check(glen != 6, "mishandled u+ffff and u+fffe grapheme breaks"); /* only rejects a result the same length as the input; output above holds 8 bytes before its terminator */ | |
| 114 free(g); | |
| 115 }; /* the ';' after the closing brace is an empty statement */ | |
| 116 | |
| 117 /* https://github.com/JuliaLang/julia/issues/37680 */ | |
| 118 checkline("/ 1f1f8 1f1ea / 1f1f8 1f1ea /", true); /* Two swedish flags after each other */ | |
| 119 checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */ | |
| 120 checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */ | |
| 121 | |
| 122 /* more GB9c tests */ | |
| 123 checkline("/ 0915 0300 094d 0300 0924 / 0915 /", true); | |
| 124 checkline("/ 0915 0300 094d 0300 094d 0924 / 0915 /", true); | |
| 125 checkline("/ 0915 0300 0300 / 0924 / 0915 /", true); | |
| 126 checkline("/ 0915 0300 094d 0300 / 0078 /", true); | |
| 127 checkline("/ 0300 094d 0300 / 0924 / 0915 /", true); | |
| 128 | |
| 129 check(utf8proc_grapheme_break(0x03b1, 0x03b2), "failed 03b1 / 03b2 test"); /* a break is expected between these two codepoints */ | |
| 130 check(!utf8proc_grapheme_break(0x03b1, 0x0302), "failed 03b1 0302 test"); /* no break is expected between these two codepoints */ | |
| 131 | |
| 132 printf("Passed regression tests!\n"); | |
| 133 | |
| 134 return 0; | |
| 135 } |
