Mercurial > minori
comparison dep/fmt/support/printable.py @ 343:1faa72660932
*: transfer back to cmake from autotools
autotools just made lots of things more complicated than
they should have been, and many things broke (e.g. translations)
| author | Paper <paper@paper.us.eu.org> |
|---|---|
| date | Thu, 20 Jun 2024 05:56:06 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 342:adb79bdde329 | 343:1faa72660932 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 # This script is based on | |
| 4 # https://github.com/rust-lang/rust/blob/master/library/core/src/unicode/printable.py | |
| 5 # distributed under https://github.com/rust-lang/rust/blob/master/LICENSE-MIT. | |
| 6 | |
| 7 # This script uses the following Unicode tables: | |
| 8 # - UnicodeData.txt | |
| 9 | |
| 10 | |
| 11 from collections import namedtuple | |
| 12 import csv | |
| 13 import os | |
| 14 import subprocess | |
| 15 | |
# Exclusive upper bound of the code point space handled by this script:
# get_codepoints() pads its output up to (but not including) this value, and
# the generated C++ finishes with `return cp < 0x110000`.
NUM_CODEPOINTS=0x110000
| 17 | |
def to_ranges(iter):
    """Collapse an ascending stream of integers into half-open ranges.

    Yields ``(start, end)`` tuples where ``end`` is exclusive. A value extends
    the open range only when it equals that range's current ``end``; values
    0x10000 and 0x20000 always begin a new range, so no range ever crosses
    either of those two boundaries.
    """
    start = end = None
    for value in iter:
        continues_run = (
            start is not None
            and value == end
            and value not in (0x10000, 0x20000)
        )
        if continues_run:
            end += 1
            continue
        # Close the range that was open (if any) before starting the next one.
        if start is not None:
            yield (start, end)
        start, end = value, value + 1
    if start is not None:
        yield (start, end)
| 29 | |
def get_escaped(codepoints):
    """Yield the values of the code points that must be escaped.

    A code point is escaped when its general category is one of
    Cc, Cf, Cs, Co, Cn, Zl, Zp or Zs. A missing/empty category is treated as
    "Cn". The ASCII space character is exempt even though its category
    matches.

    `codepoints` is an iterable of objects exposing `.value` and `.class_`.
    """
    # Built once instead of re-splitting the category string for every one of
    # the ~1.1 million code points this generator is fed.
    escaped_classes = frozenset(("Cc", "Cf", "Cs", "Co", "Cn", "Zl", "Zp", "Zs"))
    space = ord(' ')
    for c in codepoints:
        if (c.class_ or "Cn") in escaped_classes and c.value != space:
            yield c.value
| 34 | |
def get_file(f):
    """Open the local copy of the file named by URL `f`, downloading if needed.

    The local copy lives in the current working directory under the URL's
    basename. If it does not exist yet, `curl` is invoked to fetch it there
    and the file is then opened.

    Returns an open text file object; the caller is responsible for closing
    it. Raises subprocess.CalledProcessError if the download fails.
    """
    local_name = os.path.basename(f)
    try:
        # newline="" is what the csv module asks for when it is handed a file
        # object; the explicit encoding avoids depending on the platform
        # default.
        return open(local_name, encoding="utf-8", newline="")
    except FileNotFoundError:
        # --fail makes curl exit non-zero on an HTTP error instead of saving
        # the server's error page under local_name (which would then be
        # silently reused on every later run).
        subprocess.run(["curl", "--fail", "-O", f], check=True)
        return open(local_name, encoding="utf-8", newline="")
| 41 | |
# One code point together with its general-category string (None when the
# code point has no entry in the data file).
Codepoint = namedtuple('Codepoint', 'value class_')


def get_codepoints(f):
    """Yield a Codepoint for every code point described by UnicodeData rows.

    `f` is an iterable of text lines in the semicolon-separated UnicodeData
    layout: field 0 is the hexadecimal code point, field 1 the name and
    field 2 the general category.

    Behaviour:
    - Each row yields ``Codepoint(value, category)``.
    - Code points skipped between two consecutive rows are yielded too: with
      the category of the opening row when the rows form a
      ``<..., First>`` / ``<..., Last>`` pair, otherwise with category None.
    - After the last row, every remaining code point below NUM_CODEPOINTS is
      yielded with category None.
    - Blank lines are ignored.

    Raises ValueError if a ``First>`` row is not immediately followed by a
    ``Last>`` row.
    """
    r = csv.reader(f, delimiter=";")
    prev_codepoint = 0
    class_first = None
    for row in r:
        if not row:
            # csv.reader returns [] for an empty line; without this guard a
            # stray blank line would crash on row[0] with IndexError.
            continue

        codepoint = int(row[0], 16)
        name = row[1]
        class_ = row[2]

        if class_first is not None:
            if not name.endswith("Last>"):
                raise ValueError("Missing Last after First")

        # Fill the gap since the previous row. Inside a First/Last pair the
        # gap inherits the pair's category; elsewhere class_first is None.
        for c in range(prev_codepoint + 1, codepoint):
            yield Codepoint(c, class_first)

        class_first = None
        if name.endswith("First>"):
            class_first = class_

        yield Codepoint(codepoint, class_)
        prev_codepoint = codepoint

    if class_first is not None:
        raise ValueError("Missing Last after First")

    # Everything past the last listed code point has no category.
    for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
        yield Codepoint(c, None)
| 72 | |
def compress_singletons(singletons):
    """Split values into a run-length table of high parts plus a low-byte list.

    Each value is divided into ``value >> 8`` (the upper part) and
    ``value & 0xff`` (the lower byte). Consecutive values sharing the same
    upper part are grouped.

    Returns ``(uppers, lowers)`` where `uppers` is a list of
    ``(upper, number_of_lowers)`` tuples and `lowers` is the flat list of
    lower bytes in input order.
    """
    high_runs = []  # (upper, # items in low_bytes)
    low_bytes = []

    for value in singletons:
        high, low = divmod(value, 0x100)
        if high_runs and high_runs[-1][0] == high:
            # Same upper part as the previous value: extend the current run.
            run_high, run_length = high_runs[-1]
            high_runs[-1] = (run_high, run_length + 1)
        else:
            high_runs.append((high, 1))
        low_bytes.append(low)

    return high_runs, low_bytes
| 88 | |
def compress_normal(normal):
    """Encode ``(start, count)`` ranges as alternating gap/length byte lists.

    For every range two lengths are encoded, in this order: the distance from
    the end of the previous range to `start` (the "true" length), then
    `count` (the "false" length).

    Length encoding:
    - lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
    - lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff

    Returns a list with one entry per range, each entry being
    ``[truelen, (truelenaux), falselen, (falselenaux)]``.
    """
    def encode(length):
        # Two bytes with the top bit of the first one set when the length
        # does not fit in seven bits; a single byte otherwise.
        if length > 0x7f:
            return [0x80 | (length >> 8), length & 0xff]
        return [length & 0x7f]

    encoded = []
    cursor = 0  # end of the previously processed range
    for start, count in normal:
        gap = start - cursor
        cursor = start + count

        assert gap < 0x8000 and count < 0x8000
        encoded.append(encode(gap) + encode(count))

    return encoded
| 116 | |
def print_singletons(uppers, lowers, uppersname, lowersname):
    """Print two C++ array definitions for a compressed singleton table.

    `uppers` is printed as a ``singleton`` array named `uppersname`, one
    ``{upper, count}`` initializer per line. `lowers` is printed as an
    ``unsigned char`` array named `lowersname`, eight values per line.
    """
    per_row = 8

    print(" static constexpr singleton {}[] = {{".format(uppersname))
    for high, count in uppers:
        print(" {{{:#04x}, {}}},".format(high, count))
    print(" };")

    print(" static constexpr unsigned char {}[] = {{".format(lowersname))
    offset = 0
    while offset < len(lowers):
        cells = ["{:#04x},".format(low) for low in lowers[offset:offset + per_row]]
        print(" {}".format(" ".join(cells)))
        offset += per_row
    print(" };")
| 126 | |
def print_normal(normal, normalname):
    """Print a C++ ``unsigned char`` array named `normalname`.

    `normal` is a list of byte lists; each inner list becomes one output line
    of comma-terminated hexadecimal values.
    """
    print(" static constexpr unsigned char {}[] = {{".format(normalname))
    for entry in normal:
        cells = ["{:#04x},".format(byte) for byte in entry]
        print(" {}".format(" ".join(cells)))
    print(" };")
| 132 | |
def main():
    """Print the C++ source of ``is_printable(uint32_t)`` to stdout.

    Steps:
    1. Open UnicodeData.txt (downloading it if there is no local copy).
    2. Compute the ranges of code points that must be escaped.
    3. Sort each range into one of five buckets:
       - ranges starting at or above 0x20000 go to `extra` and become explicit
         ``if (lo <= cp && cp < hi) return false;`` lines;
       - ranges of one or two code points go to `singletons0` (below 0x10000)
         or `singletons1` (0x10000..0x1ffff, stored without the 0x10000 bit);
       - longer ranges go to `normal0` / `normal1` by the same split.
    4. Compress the buckets and print the tables and the lookup code.

    NOTE(review): the whitespace inside the emitted C++ text is reproduced
    exactly as it appears in this view of the file, which may have collapsed
    the original indentation - confirm against the repository copy.
    """
    CUTOFF=0x10000
    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    # The code point stream is lazy, so it has to be fully consumed while the
    # data file is still open; the `with` closes the file afterwards.
    with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as file:
        codepoints = get_codepoints(file)

        for a, b in to_ranges(get_escaped(codepoints)):
            if a >= 2 * CUTOFF:
                # `>=`, not `>`: a short range starting exactly at 0x20000
                # has the CUTOFF bit clear and would otherwise be filed under
                # singletons0 with an upper part that does not fit the table.
                extra.append((a, b - a))
            elif a == b - 1:
                if a & CUTOFF:
                    singletons1.append(a & ~CUTOFF)
                else:
                    singletons0.append(a)
            elif a == b - 2:
                if a & CUTOFF:
                    singletons1.append(a & ~CUTOFF)
                    singletons1.append((a + 1) & ~CUTOFF)
                else:
                    singletons0.append(a)
                    singletons0.append(a + 1)
            elif a & CUTOFF:
                normal1.append((a & ~CUTOFF, b - a))
            else:
                normal0.append((a, b - a))

    singletons0u, singletons0l = compress_singletons(singletons0)
    singletons1u, singletons1l = compress_singletons(singletons1)
    normal0 = compress_normal(normal0)
    normal1 = compress_normal(normal1)

    print("""\
FMT_FUNC auto is_printable(uint32_t cp) -> bool {\
""")
    print_singletons(singletons0u, singletons0l, 'singletons0', 'singletons0_lower')
    print_singletons(singletons1u, singletons1l, 'singletons1', 'singletons1_lower')
    print_normal(normal0, 'normal0')
    print_normal(normal1, 'normal1')
    print("""\
auto lower = static_cast<uint16_t>(cp);
if (cp < 0x10000) {
return is_printable(lower, singletons0,
sizeof(singletons0) / sizeof(*singletons0),
singletons0_lower, normal0, sizeof(normal0));
}
if (cp < 0x20000) {
return is_printable(lower, singletons1,
sizeof(singletons1) / sizeof(*singletons1),
singletons1_lower, normal1, sizeof(normal1));
}\
""")
    for a, b in extra:
        print(" if (0x{:x} <= cp && cp < 0x{:x}) return false;".format(a, a + b))
    print("""\
return cp < 0x{:x};
}}\
""".format(NUM_CODEPOINTS))
| 199 | |
# Generate the tables only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()
