Mercurial > minori
comparison dep/fmt/support/printable.py @ 343:1faa72660932
*: transfer back to cmake from autotools
autotools just made lots of things more complicated than
they should have been, and many things broke (e.g. translations)
author | Paper <paper@paper.us.eu.org> |
---|---|
date | Thu, 20 Jun 2024 05:56:06 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
342:adb79bdde329 | 343:1faa72660932 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 # This script is based on | |
4 # https://github.com/rust-lang/rust/blob/master/library/core/src/unicode/printable.py | |
5 # distributed under https://github.com/rust-lang/rust/blob/master/LICENSE-MIT. | |
6 | |
7 # This script uses the following Unicode tables: | |
8 # - UnicodeData.txt | |
9 | |
10 | |
11 from collections import namedtuple | |
12 import csv | |
13 import os | |
14 import subprocess | |
15 | |
# Exclusive upper bound of the code point values handled by this script.
NUM_CODEPOINTS = 0x110000
17 | |
def to_ranges(iter):
    """Group a stream of integers into half-open runs of consecutive values.

    Yields ``(start, end)`` tuples with ``end`` exclusive.  A value extends
    the current run only when it equals the run's ``end``; a new run is
    always started at 0x10000 and at 0x20000, so no run continues across
    either of those values.
    """
    start = end = None
    for value in iter:
        extends_run = (
            start is not None
            and value == end
            and value not in (0x10000, 0x20000)
        )
        if extends_run:
            end += 1
            continue
        # Flush the run collected so far before opening a new one.
        if start is not None:
            yield (start, end)
        start, end = value, value + 1
    if start is not None:
        yield (start, end)
29 | |
def get_escaped(codepoints):
    """Yield the values of the code points that have to be escaped.

    ``codepoints`` is an iterable of objects exposing ``value`` and
    ``class_`` attributes.  A code point is escaped when its class is one
    of Cc, Cf, Cs, Co, Cn, Zl, Zp or Zs; a missing (falsy) class is treated
    as Cn.  The space character (U+0020) is never escaped.
    """
    # Built once up front rather than re-splitting the string for every
    # single code point in the stream.
    escaped_classes = frozenset("Cc Cf Cs Co Cn Zl Zp Zs".split())
    space = ord(' ')
    for c in codepoints:
        if (c.class_ or "Cn") in escaped_classes and c.value != space:
            yield c.value
34 | |
def get_file(f):
    """Open the local copy of the file at URL ``f``, fetching it if absent.

    The file is looked up in the current working directory under the last
    path component of ``f``.  When it is not there, ``curl -O`` is run on
    ``f`` and the open is retried.

    Returns an open text-mode file object; closing it is the caller's job.
    Raises ``subprocess.CalledProcessError`` when curl exits with an error.
    """
    local_name = os.path.basename(f)
    try:
        # UTF-8 is requested explicitly so decoding does not depend on the
        # platform locale; newline="" is what the csv module asks for on
        # files that are handed to csv.reader.
        return open(local_name, encoding="utf-8", newline="")
    except FileNotFoundError:
        subprocess.run(["curl", "-O", f], check=True)
        return open(local_name, encoding="utf-8", newline="")
41 | |
# One code point together with the class string read for it (None when the
# data has no record covering that code point).
Codepoint = namedtuple('Codepoint', 'value class_')


def get_codepoints(f, num_codepoints=None):
    """Yield a ``Codepoint`` for every value from 0 up to the limit.

    ``f`` is an iterable of semicolon-separated lines whose first three
    fields are the code point in hexadecimal, its name and its class.
    Records are expected in ascending code point order.

    A record whose name ends in ``First>`` opens a range that the next
    record, whose name must end in ``Last>``, closes; every code point in
    between gets the class of the ``First>`` record.  Code points that no
    record covers are yielded with class ``None``.  Blank lines are
    skipped.

    ``num_codepoints`` is the exclusive upper bound of the values produced
    and defaults to the module-level ``NUM_CODEPOINTS``.

    Raises ``ValueError`` when a ``First>`` record is not followed by a
    ``Last>`` record.
    """
    limit = NUM_CODEPOINTS if num_codepoints is None else num_codepoints

    # Start one below zero so code point 0 is still reported as a gap when
    # the first record is not U+0000 (or there are no records at all).
    prev_codepoint = -1
    class_first = None
    for row in csv.reader(f, delimiter=";"):
        if not row:
            # csv.reader produces an empty list for a blank line.
            continue
        codepoint = int(row[0], 16)
        name = row[1]
        class_ = row[2]

        if class_first is not None and not name.endswith("Last>"):
            raise ValueError("Missing Last after First")

        # Fill the gap since the previous record: inside a First/Last pair
        # the gap inherits the range's class, otherwise it stays None.
        for c in range(prev_codepoint + 1, codepoint):
            yield Codepoint(c, class_first)

        class_first = class_ if name.endswith("First>") else None

        yield Codepoint(codepoint, class_)
        prev_codepoint = codepoint

    if class_first is not None:
        raise ValueError("Missing Last after First")

    for c in range(prev_codepoint + 1, limit):
        yield Codepoint(c, None)
72 | |
def compress_singletons(singletons):
    """Split values into per-high-part counts and a flat list of low bytes.

    Returns ``(uppers, lowers)``.  ``lowers`` holds ``value & 0xff`` for
    every input value, in input order.  ``uppers`` holds one
    ``(value >> 8, count)`` pair per run of adjacent inputs that share the
    same ``value >> 8``; ``count`` is the number of ``lowers`` entries that
    belong to that run.
    """
    uppers = []
    lowers = []

    for value in singletons:
        high, low = value >> 8, value & 0xff
        lowers.append(low)
        if uppers and uppers[-1][0] == high:
            # Same high part as the previous value: grow that run.
            uppers[-1] = (high, uppers[-1][1] + 1)
        else:
            uppers.append((high, 1))

    return uppers, lowers
88 | |
def _encode_length(length):
    """Return the one- or two-byte encoding of a single run length."""
    # Checked with a real exception instead of an assert: asserts vanish
    # under ``python -O``, and a negative length (overlapping or unsorted
    # ranges) would otherwise be masked into a plausible-looking byte.
    if not 0 <= length < 0x8000:
        raise ValueError(
            "run length {:#x} is outside 0..0x7fff".format(length))
    if length > 0x7f:
        return [0x80 | (length >> 8), length & 0xff]
    return [length]


def compress_normal(normal):
    """Encode ``(start, count)`` ranges as pairs of run lengths.

    For each range, in order, the result holds one list made of the
    encoded distance from the end of the previous range (0 for the first)
    to ``start``, followed by the encoded ``count``.

    Raises ``ValueError`` when either length does not fit the encoding,
    which includes ranges that overlap or are not in ascending order.
    """
    # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
    # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
    compressed = []  # [truelen, (truelenaux), falselen, (falselenaux)]

    prev_end = 0
    for start, count in normal:
        truelen = start - prev_end
        falselen = count
        prev_end = start + count

        compressed.append(_encode_length(truelen) + _encode_length(falselen))

    return compressed
116 | |
def print_singletons(uppers, lowers, uppersname, lowersname):
    """Print one singleton table as two C++ array definitions.

    ``uppers`` is a sequence of ``(upper, count)`` pairs, printed one per
    line into an array of ``singleton`` named ``uppersname``.  ``lowers``
    is a sequence of byte values, printed eight per line into an
    ``unsigned char`` array named ``lowersname``.
    """
    # NOTE(review): the reviewed copy of this file had its whitespace
    # collapsed, so the indentation inside these literals may originally
    # have been wider - confirm against the upstream script.
    print(" static constexpr singleton {}[] = {{".format(uppersname))
    for upper, count in uppers:
        print(" {{{:#04x}, {}}},".format(upper, count))
    print(" };")

    print(" static constexpr unsigned char {}[] = {{".format(lowersname))
    for offset in range(0, len(lowers), 8):
        cells = ["{:#04x},".format(byte) for byte in lowers[offset:offset + 8]]
        print(" {}".format(" ".join(cells)))
    print(" };")
126 | |
def print_normal(normal, normalname):
    """Print an encoded run-length table as a C++ ``unsigned char`` array.

    ``normal`` is a sequence of byte-value sequences; each inner sequence
    is printed on its own line of the array named ``normalname``.
    """
    # NOTE(review): the reviewed copy of this file had its whitespace
    # collapsed, so the indentation inside these literals may originally
    # have been wider - confirm against the upstream script.
    print(" static constexpr unsigned char {}[] = {{".format(normalname))
    for entry in normal:
        cells = ["{:#04x},".format(byte) for byte in entry]
        print(" {}".format(" ".join(cells)))
    print(" };")
132 | |
def main():
    """Print the C++ ``is_printable(uint32_t)`` function to standard output.

    Reads UnicodeData.txt (through ``get_file``), collects the ranges of
    escaped code points and sorts them into three groups:

    * ranges of one or two code points below 0x20000 become singleton
      tables, one for values below 0x10000 and one for 0x10000..0x1ffff;
    * longer ranges below 0x20000 become run-length ("normal") tables,
      split the same way;
    * every range starting at 0x20000 or above becomes an explicit
      ``if`` test in the generated function.
    """
    CUTOFF = 0x10000
    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    # The code point generator reads the file lazily, so the whole loop has
    # to run while the file is still open; the with-block then closes it.
    with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as data:
        for a, b in to_ranges(get_escaped(get_codepoints(data))):
            length = b - a
            # ">=" rather than ">": a short range starting exactly at
            # 2 * CUTOFF has neither CUTOFF bit nor a 16-bit value, so it
            # must not fall through to the singleton tables below.
            if a >= 2 * CUTOFF:
                extra.append((a, length))
                continue

            upper_half = bool(a & CUTOFF)
            if length <= 2:
                target = singletons1 if upper_half else singletons0
                # Clearing the CUTOFF bit is a no-op for values below it.
                target.extend(c & ~CUTOFF for c in range(a, b))
            elif upper_half:
                normal1.append((a & ~CUTOFF, length))
            else:
                normal0.append((a, length))

    singletons0u, singletons0l = compress_singletons(singletons0)
    singletons1u, singletons1l = compress_singletons(singletons1)
    normal0 = compress_normal(normal0)
    normal1 = compress_normal(normal1)

    # NOTE(review): the reviewed copy of this file had its whitespace
    # collapsed, so the indentation inside the emitted C++ text below may
    # originally have been wider - confirm against the upstream script.
    print("""\
FMT_FUNC auto is_printable(uint32_t cp) -> bool {\
""")
    print_singletons(singletons0u, singletons0l, 'singletons0', 'singletons0_lower')
    print_singletons(singletons1u, singletons1l, 'singletons1', 'singletons1_lower')
    print_normal(normal0, 'normal0')
    print_normal(normal1, 'normal1')
    print("""\
auto lower = static_cast<uint16_t>(cp);
if (cp < 0x10000) {
return is_printable(lower, singletons0,
sizeof(singletons0) / sizeof(*singletons0),
singletons0_lower, normal0, sizeof(normal0));
}
if (cp < 0x20000) {
return is_printable(lower, singletons1,
sizeof(singletons1) / sizeof(*singletons1),
singletons1_lower, normal1, sizeof(normal1));
}\
""")
    for a, b in extra:
        print(" if (0x{:x} <= cp && cp < 0x{:x}) return false;".format(a, a + b))
    print("""\
return cp < 0x{:x};
}}\
""".format(NUM_CODEPOINTS))
199 | |
# Generate the tables only when run as a script, not when imported.
if __name__ == '__main__':
    main()