comparison dep/fmt/support/printable.py @ 343:1faa72660932

*: transfer back to cmake from autotools autotools just made lots of things more complicated than they should have and many things broke (i.e. translations)
author Paper <paper@paper.us.eu.org>
date Thu, 20 Jun 2024 05:56:06 -0400
parents
children
comparison
equal deleted inserted replaced
342:adb79bdde329 343:1faa72660932
1 #!/usr/bin/env python3
2
3 # This script is based on
4 # https://github.com/rust-lang/rust/blob/master/library/core/src/unicode/printable.py
5 # distributed under https://github.com/rust-lang/rust/blob/master/LICENSE-MIT.
6
7 # This script uses the following Unicode tables:
8 # - UnicodeData.txt
9
10
11 from collections import namedtuple
12 import csv
13 import os
14 import subprocess
15
# Exclusive upper bound of the codepoint values handled by this script;
# get_codepoints() pads its output with unassigned entries up to this value.
NUM_CODEPOINTS=0x110000
17
def to_ranges(iter):
    """Group a stream of integers into half-open ``(start, end)`` ranges.

    A value extends the range being built only when it equals that range's
    current ``end``; any other value closes the range and opens a new one.
    The values 0x10000 and 0x20000 always open a new range, so no yielded
    range runs across either of those boundaries.

    Args:
        iter: iterable of integers (the name shadows the builtin but is kept
            as-is because it is part of the function's signature).

    Yields:
        ``(start, end)`` tuples with ``end`` exclusive.
    """
    forced_breaks = (0x10000, 0x20000)
    start = end = None
    for value in iter:
        contiguous = start is not None and value == end
        if contiguous and value not in forced_breaks:
            end += 1
            continue
        # Either this is the first value, there is a gap, or a forced break.
        if start is not None:
            yield (start, end)
        start, end = value, value + 1
    if start is not None:
        yield (start, end)
29
def get_escaped(codepoints):
    """Yield the ``value`` of every codepoint whose class is in the escaped set.

    A codepoint with a missing or empty ``class_`` is treated as ``"Cn"``.
    The ASCII space is never yielded even though its class is in the set.

    Args:
        codepoints: iterable of objects exposing ``value`` and ``class_``.

    Yields:
        Integer codepoint values, in input order.
    """
    escaped_classes = ("Cc", "Cf", "Cs", "Co", "Cn", "Zl", "Zp", "Zs")
    space = ord(' ')
    for cp in codepoints:
        category = cp.class_ or "Cn"
        if category not in escaped_classes:
            continue
        if cp.value == space:
            continue
        yield cp.value
34
def get_file(f):
    """Open the local copy of ``f``, downloading it with curl if it is missing.

    The local copy is looked up in the current working directory under the
    last path component of ``f``.

    Args:
        f: URL (or path) of the data file.

    Returns:
        The opened text file object; the caller is responsible for closing it.

    Raises:
        subprocess.CalledProcessError: if the curl download fails.
    """
    local_name = os.path.basename(f)
    try:
        handle = open(local_name)
    except FileNotFoundError:
        # "curl -O" saves under the remote file name, i.e. local_name.
        subprocess.run(["curl", "-O", f], check=True)
        handle = open(local_name)
    return handle
41
# One codepoint record: `value` is the integer codepoint and `class_` is the
# class string read from the data file (None for codepoints with no entry).
Codepoint = namedtuple('Codepoint', 'value class_')
43
def get_codepoints(f):
    """Yield a ``Codepoint`` for every value from 0 to ``NUM_CODEPOINTS - 1``.

    ``f`` is read as ';'-separated rows whose first three fields are the
    hexadecimal codepoint, its name, and its class. Codepoints that have no
    row of their own are filled in:

    * between a row whose name ends in ``First>`` and the following row
      (whose name must end in ``Last>``) they get the class of the First row;
    * everywhere else, including before the first row and after the last
      one, they get ``None``.

    Args:
        f: iterable of text lines (e.g. an open UnicodeData.txt).

    Yields:
        ``Codepoint(value, class_)`` in increasing ``value`` order.

    Raises:
        ValueError: if a ``First>`` row is not directly followed by a
            ``Last>`` row.
    """
    r = csv.reader(f, delimiter=";")
    # Start one below zero so that codepoint 0 itself is gap-filled when the
    # data is empty or does not begin at 0.
    prev_codepoint = -1
    class_first = None
    for row in r:
        if not row:
            # csv yields [] for blank lines; they carry no codepoint.
            continue
        codepoint = int(row[0], 16)
        name = row[1]
        class_ = row[2]

        if class_first is not None and not name.endswith("Last>"):
            raise ValueError("Missing Last after First")

        # Fill the gap since the previous row; class_first is None unless
        # we are inside a First/Last pair.
        for c in range(prev_codepoint + 1, codepoint):
            yield Codepoint(c, class_first)

        class_first = class_ if name.endswith("First>") else None

        yield Codepoint(codepoint, class_)
        prev_codepoint = codepoint

    if class_first is not None:
        raise ValueError("Missing Last after First")

    # Everything after the last row has no entry.
    for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
        yield Codepoint(c, None)
72
def compress_singletons(singletons):
    """Split values into a run-length list of high parts and a list of low bytes.

    Each value contributes its low 8 bits to ``lowers``. Consecutive values
    sharing the same ``value >> 8`` are counted together in ``uppers``.

    Args:
        singletons: iterable of non-negative integers.

    Returns:
        ``(uppers, lowers)`` where ``uppers`` is a list of
        ``(upper, number_of_lowers)`` tuples and ``lowers`` is the flat list
        of low bytes in input order.
    """
    high_runs = []
    low_bytes = []

    for value in singletons:
        high, low = value >> 8, value & 0xff
        low_bytes.append(low)
        if high_runs and high_runs[-1][0] == high:
            # Same high part as the previous value: bump its count.
            high_runs[-1] = (high, high_runs[-1][1] + 1)
        else:
            high_runs.append((high, 1))

    return high_runs, low_bytes
88
def _encode_length(length):
    """Encode one length as a list of one or two byte values.

    lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
    lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff

    Raises:
        ValueError: if ``length`` is negative or does not fit in 15 bits.
    """
    if not 0 <= length < 0x8000:
        raise ValueError(
            "length {:#x} cannot be encoded (must be in 0..0x7fff)".format(length))
    if length > 0x7f:
        # The high bit of the first byte marks the two-byte form.
        return [0x80 | (length >> 8), length & 0xff]
    return [length]


def compress_normal(normal):
    """Encode ``(start, count)`` ranges as alternating gap/run lengths.

    For every range two lengths are emitted: ``truelen``, the distance from
    the end of the previous range (0 for the first one) to ``start``, and
    ``falselen``, the range's own ``count``. Each length takes one or two
    bytes, see ``_encode_length``.

    Args:
        normal: iterable of ``(start, count)`` pairs, ordered by ``start``
            and non-overlapping.

    Returns:
        A list with one entry per range:
        ``[truelen, (truelenaux), falselen, (falselenaux)]``.

    Raises:
        ValueError: if a gap or a count is negative or larger than 0x7fff.
            This is raised explicitly rather than asserted so the check is
            not dropped under ``python -O``.
    """
    compressed = []

    prev_end = 0
    for start, count in normal:
        truelen = start - prev_end
        falselen = count
        prev_end = start + count

        compressed.append(_encode_length(truelen) + _encode_length(falselen))

    return compressed
116
def print_singletons(uppers, lowers, uppersname, lowersname):
    """Print two C++ array definitions for a compressed singleton table.

    The first array, named ``uppersname``, has one ``{upper, count}``
    initializer per element of ``uppers``. The second, named ``lowersname``,
    lists ``lowers`` as hex bytes, eight per output line.

    Args:
        uppers: sequence of ``(upper, count)`` pairs.
        lowers: sequence of integers.
        uppersname: C++ identifier for the first array.
        lowersname: C++ identifier for the second array.
    """
    per_line = 8

    print(" static constexpr singleton {}[] = {{".format(uppersname))
    for upper, count in uppers:
        print(" {{{:#04x}, {}}},".format(upper, count))
    print(" };")

    print(" static constexpr unsigned char {}[] = {{".format(lowersname))
    for offset in range(0, len(lowers), per_line):
        chunk = lowers[offset:offset + per_line]
        cells = ["{:#04x},".format(byte) for byte in chunk]
        print(" {}".format(" ".join(cells)))
    print(" };")
126
def print_normal(normal, normalname):
    """Print a C++ ``unsigned char`` array definition for encoded ranges.

    Each element of ``normal`` becomes one output line of hex bytes.

    Args:
        normal: sequence of integer sequences, e.g. the result of
            ``compress_normal``.
        normalname: C++ identifier for the array.
    """
    print(" static constexpr unsigned char {}[] = {{".format(normalname))
    for entry in normal:
        cells = " ".join("{:#04x},".format(byte) for byte in entry)
        print(" {}".format(cells))
    print(" };")
132
def main():
    """Generate the C++ ``is_printable(uint32_t)`` function on stdout.

    Reads UnicodeData.txt (downloading it if there is no local copy),
    collects the ranges of escaped codepoints and sorts them into:

    * ``singletons0`` / ``singletons1``: runs of one or two codepoints below
      0x10000 / in 0x10000..0x1ffff, stored individually;
    * ``normal0`` / ``normal1``: longer runs in those same two blocks, stored
      as ``(start, length)``;
    * ``extra``: every run starting at 0x20000 or above, emitted as explicit
      range checks in the generated code.

    Values destined for the ``*1`` tables have the 0x10000 bit cleared so
    they fit in 16 bits.
    """
    CUTOFF = 0x10000
    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    # get_codepoints() is lazy, so the file has to stay open until the whole
    # range stream has been consumed; the with-block then closes it.
    with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as file:
        codepoints = get_codepoints(file)

        for a, b in to_ranges(get_escaped(codepoints)):
            length = b - a
            if a >= 2 * CUTOFF:
                # ">=" rather than ">": at exactly 2 * CUTOFF the test
                # "a & CUTOFF" is zero, so a short run starting there would
                # otherwise be filed in the first block's singleton table.
                extra.append((a, length))
            elif length <= 2:
                singletons = singletons1 if a & CUTOFF else singletons0
                for c in range(a, b):
                    singletons.append(c & ~CUTOFF)
            elif a & CUTOFF:
                normal1.append((a & ~CUTOFF, length))
            else:
                normal0.append((a, length))

    singletons0u, singletons0l = compress_singletons(singletons0)
    singletons1u, singletons1l = compress_singletons(singletons1)
    normal0 = compress_normal(normal0)
    normal1 = compress_normal(normal1)

    print("""\
FMT_FUNC auto is_printable(uint32_t cp) -> bool {\
""")
    print_singletons(singletons0u, singletons0l, 'singletons0', 'singletons0_lower')
    print_singletons(singletons1u, singletons1l, 'singletons1', 'singletons1_lower')
    print_normal(normal0, 'normal0')
    print_normal(normal1, 'normal1')
    print("""\
auto lower = static_cast<uint16_t>(cp);
if (cp < 0x10000) {
return is_printable(lower, singletons0,
sizeof(singletons0) / sizeof(*singletons0),
singletons0_lower, normal0, sizeof(normal0));
}
if (cp < 0x20000) {
return is_printable(lower, singletons1,
sizeof(singletons1) / sizeof(*singletons1),
singletons1_lower, normal1, sizeof(normal1));
}\
""")
    for start, length in extra:
        print(" if (0x{:x} <= cp && cp < 0x{:x}) return false;".format(start, start + length))
    print("""\
return cp < 0x{:x};
}}\
""".format(NUM_CODEPOINTS))
199
# Generate the table only when executed as a script, not when imported.
if __name__ == '__main__':
    main()