comparison dep/utf8proc/data/data_generator.rb @ 343:1faa72660932

*: transfer back to cmake from autotools. autotools just made lots of things more complicated than they should have been, and many things broke (e.g. translations)
author Paper <paper@paper.us.eu.org>
date Thu, 20 Jun 2024 05:56:06 -0400
parents
children
comparison
equal deleted inserted replaced
342:adb79bdde329 343:1faa72660932
1 #!/usr/bin/env ruby
2
3 # This file was used to generate the 'unicode_data.c' file by parsing the
4 # Unicode data file 'UnicodeData.txt' of the Unicode Character Database.
5 # It is included for informational purposes only and not intended for
6 # production use.
7
8
9 # Copyright (c) 2018 Steven G. Johnson, Tony Kelman, Keno Fischer,
10 # Benito van der Zander, Michaël Meyer, and other contributors.
11 # Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
12 #
13 # Permission is hereby granted, free of charge, to any person obtaining a
14 # copy of this software and associated documentation files (the "Software"),
15 # to deal in the Software without restriction, including without limitation
16 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 # and/or sell copies of the Software, and to permit persons to whom the
18 # Software is furnished to do so, subject to the following conditions:
19 #
20 # The above copyright notice and this permission notice shall be included in
21 # all copies or substantial portions of the Software.
22 #
23 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 # DEALINGS IN THE SOFTWARE.
30
31
32 # This file contains derived data from a modified version of the
33 # Unicode data files. The following license applies to that data:
34 #
35 # COPYRIGHT AND PERMISSION NOTICE
36 #
37 # Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed
38 # under the Terms of Use in http://www.unicode.org/copyright.html.
39 #
40 # Permission is hereby granted, free of charge, to any person obtaining a
41 # copy of the Unicode data files and any associated documentation (the "Data
42 # Files") or Unicode software and any associated documentation (the
43 # "Software") to deal in the Data Files or Software without restriction,
44 # including without limitation the rights to use, copy, modify, merge,
45 # publish, distribute, and/or sell copies of the Data Files or Software, and
46 # to permit persons to whom the Data Files or Software are furnished to do
47 # so, provided that (a) the above copyright notice(s) and this permission
48 # notice appear with all copies of the Data Files or Software, (b) both the
49 # above copyright notice(s) and this permission notice appear in associated
50 # documentation, and (c) there is clear notice in each modified Data File or
51 # in the Software as well as in the documentation associated with the Data
52 # File(s) or Software that the data or software has been modified.
53 #
54 # THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
55 # KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
56 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
57 # THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
58 # INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
59 # CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
60 # USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
61 # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
62 # PERFORMANCE OF THE DATA FILES OR SOFTWARE.
63 #
64 # Except as contained in this notice, the name of a copyright holder shall
65 # not be used in advertising or otherwise to promote the sale, use or other
66 # dealings in these Data Files or Software without prior written
67 # authorization of the copyright holder.
68
69
# Load the code point sets this script needs from DerivedCoreProperties.txt.
#
# The file is read and decoded once. Each property section is sliced out with
# its own regex (from the section header up to the next "# Total code points:"
# trailer) and scanned for "XXXX" / "XXXX..YYYY" entries at the start of a line.
derived_core_text = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')

# Returns the text of the first section matching +pattern+. Raises instead of
# returning nil, so a missing or renamed section is reported by its pattern
# rather than as a NoMethodError on nil further down.
derived_core_section = lambda do |pattern|
  derived_core_text[pattern] ||
    raise("DerivedCoreProperties.txt: no section matches #{pattern.inspect}")
end

# Returns every code point listed at the start of a line of +section+, in file
# order, with "XXXX..YYYY" ranges expanded. Lines that do not start with hex
# digits (comments, blank lines) contribute nothing.
listed_code_points = lambda do |section|
  section.each_line.flat_map do |entry|
    if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
      ($1.hex..$2.hex).to_a
    elsif entry =~ /^[0-9A-F]+/
      [$&.hex]
    else
      []
    end
  end
end

$ignorable_list = derived_core_section.call(/# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m)
$ignorable = listed_code_points.call($ignorable_list)

$uppercase_list = derived_core_section.call(/# Derived Property: Uppercase.*?# Total code points:/m)
$uppercase = listed_code_points.call($uppercase_list)

$lowercase_list = derived_core_section.call(/# Derived Property: Lowercase.*?# Total code points:/m)
$lowercase = listed_code_points.call($lowercase_list)

# Indic_Conjunct_Break value per code point; anything not listed is NONE.
# The sections are applied in the order Linker, Consonant, Extend, so a later
# section overrides an earlier one for a code point listed in both.
$icb = Hash.new("UTF8PROC_INDIC_CONJUNCT_BREAK_NONE")
$icb_linker_list = derived_core_section.call(/# Indic_Conjunct_Break=Linker.*?# Total code points:/m)
listed_code_points.call($icb_linker_list).each { |cp| $icb[cp] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER" }
$icb_consonant_list = derived_core_section.call(/# Indic_Conjunct_Break=Consonant.*?# Total code points:/m)
listed_code_points.call($icb_consonant_list).each { |cp| $icb[cp] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT" }
$icb_extend_list = derived_core_section.call(/# Indic_Conjunct_Break=Extend.*?# Total code points:/m)
listed_code_points.call($icb_extend_list).each { |cp| $icb[cp] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND" }
125
# Grapheme break class per code point, read from GraphemeBreakProperty.txt.
# Lines have the form "XXXX ; Name" or "XXXX..YYYY ; Name"; the name is
# upper-cased and prefixed to form the C constant. Unlisted code points
# default to UTF8PROC_BOUNDCLASS_OTHER.
$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt", :encoding => 'utf-8')
$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
$grapheme_boundclass_list.each_line do |entry|
  case entry
  when /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
    range_start, range_end, class_name = $1.hex, $2.hex, $3.upcase
    (range_start..range_end).each do |cp|
      $grapheme_boundclass[cp] = "UTF8PROC_BOUNDCLASS_" + class_name
    end
  when /^([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
    $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_" + $2.upcase
  end
end
135
# Overlay emoji properties from emoji-data.txt onto the grapheme break classes:
# Extended_Pictographic code points get their own class, Emoji_Modifier code
# points are classed as EXTEND. The trailing \W keeps longer property names
# that merely start with one of these two from matching.
$emoji_data_list = File.read("emoji-data.txt", :encoding => 'utf-8')
emoji_boundclass = {
  "Extended_Pictographic" => "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC",
  "Emoji_Modifier" => "UTF8PROC_BOUNDCLASS_EXTEND"
}
$emoji_data_list.each_line do |entry|
  listing = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(Extended_Pictographic|Emoji_Modifier)\W/.match(entry)
  next unless listing
  range_start = listing[1].hex
  # A single code point is treated as a one-element range.
  range_end = listing[2] ? listing[2].hex : range_start
  (range_start..range_end).each { |cp| $grapheme_boundclass[cp] = emoji_boundclass[listing[3]] }
end
148
# Display width per code point, read from CharWidths.txt. Lines have the form
# "XXXX; N" or "XXXX..YYYY; N"; unlisted code points default to width 0.
$charwidth_list = File.read("CharWidths.txt", :encoding => 'utf-8')
$charwidth = Hash.new(0)
$charwidth_list.each_line do |entry|
  case entry
  when /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([0-9]+)/
    width = $3.to_i
    ($1.hex..$2.hex).each { |cp| $charwidth[cp] = width }
  when /^([0-9A-F]+)\s*;\s*([0-9]+)/
    $charwidth[$1.hex] = $2.to_i
  end
end
158
# Composition exclusions, read from two sections of CompositionExclusions.txt:
# $exclusions holds section (1) "Script Specifics", $excl_version holds
# section (2) "Post Composition Version precomposed characters".
#
# Only lines that begin with hex digits are data. Calling String#hex on every
# line of the section would turn each comment and blank line into 0 and so
# list U+0000 as an exclusion, so those lines are filtered out first.
exclusions_text = File.read("CompositionExclusions.txt", :encoding => 'utf-8')

# Returns the code points listed in the first section matching
# +section_pattern+, or raises if the file has no such section.
exclusion_code_points = lambda do |section_pattern|
  section = exclusions_text[section_pattern] ||
    raise("CompositionExclusions.txt: no section matches #{section_pattern.inspect}")
  section.each_line.map { |entry| entry[/^[0-9A-F]+/] }.compact.map { |digits| digits.hex }
end

$exclusions = exclusion_code_points.call(/# \(1\) Script Specifics.*?# Total code points:/m)

$excl_version = exclusion_code_points.call(/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m)
164
# Case folding table, read from CaseFolding.txt. Only entries with status C or
# F are kept; each maps a code point to the array of code points it folds to.
$case_folding_string = File.read("CaseFolding.txt", :encoding => 'utf-8')
$case_folding = {}
$case_folding_string.chomp.split("\n").each do |line|
  folding = /([0-9A-F]+); [CF]; ([0-9A-F ]+);/i.match(line)
  next if folding.nil?
  $case_folding[folding[1].hex] = folding[2].split(" ").collect { |digits| digits.hex }
end
171
# Flat pool of integers that pushary appends to; it is later written out as
# the utf8proc_sequences table.
$int_array = Array.new
# Maps an already stored array to the index in $int_array where it starts.
$int_array_indicies = Hash.new
174
# Builds the name of a C constant: "UTF8PROC_<prefix>_<STRING>" with the
# string upper-cased. A nil string yields the literal "0".
def str2c(string, prefix)
  string.nil? ? "0" : "UTF8PROC_#{prefix}_#{string.upcase}"
end
# Stores +array+ in the $int_array pool and returns the index at which it
# starts. An array equal to one stored earlier is not appended again; the
# index recorded for the earlier one is returned instead.
def pushary(array)
  known = $int_array_indicies[array]
  return known if known
  start = $int_array.length
  $int_array_indicies[array] = start
  $int_array.concat(array)
  start
end
# Converts an array of code points into a flat array of UTF-16 code units.
# Code points above 0xFFFF become a high/low surrogate pair; others are kept
# as they are. Raises if a code point up to 0xFFFF lies in the surrogate range
# 0xD800..0xDFFF.
def cpary2utf16encoded(array)
  array.flat_map do |cp|
    if cp > 0xFFFF
      offset = cp - 0x10000
      [0xD800 | (offset >> 10), 0xDC00 | (offset & 0x3FF)]
    elsif (cp & 0xF800) == 0xD800
      raise "utf-16 code: #{cp}"
    else
      cp
    end
  end
end
# Stores a code point sequence in the pool and returns, as a decimal string,
# a 16-bit value that packs the pool index into the low 14 bits and a length
# code into the top 2 bits. Returns "UINT16_MAX" for a nil or empty sequence.
#
# The length code is the number of code points minus one. Only 2 bits are
# available, so for lengths of 4 or more the real value is stored as an extra
# first element of the pooled array and the 2-bit field is set to 3.
#
# Raises if the index does not fit in 14 bits, or if the packed value would be
# 0xFFFF, which is the value of the "UINT16_MAX" no-sequence marker.
def cpary2c(array)
  return "UINT16_MAX" if array.nil? || array.empty?
  lencode = array.length - 1 # no sequence has len 0, so len 1 is encoded as 0, len 2 as 1, ...
  units = cpary2utf16encoded(array)
  if lencode >= 3 # we have only 2 bits for the length
    units = [lencode] + units
    lencode = 3
  end
  idx = pushary(units)
  raise "Array index out of bound" if idx > 0x3FFF
  packed = idx | (lencode << 14)
  # Index 0x3FFF with length code 3 packs to 0xFFFF and would be
  # indistinguishable from the no-sequence marker.
  raise "Array index out of bound" if packed == 0xFFFF
  packed.to_s
end
# Stores a single code point in the pool and returns its pool index as a
# decimal string, or "UINT16_MAX" when +cp+ is nil.
#
# Raises if the index is 0xFFFF or larger: 0xFFFF is the value of the
# "UINT16_MAX" marker returned for nil, so it cannot also be a real index.
def singlecpmap(cp)
  return "UINT16_MAX" if cp.nil?
  idx = pushary(cpary2utf16encoded([cp]))
  raise "Array index out of bound" if idx >= 0xFFFF
  idx.to_s
end
217
# One character parsed from a line of UnicodeData.txt, plus cached values that
# are filled in later by the script (c_entry_index, c_decomp_mapping,
# c_case_folding).
class UnicodeChar
  attr_accessor :code, :name, :category, :combining_class, :bidi_class,
                :decomp_type, :decomp_mapping,
                :bidi_mirrored,
                :uppercase_mapping, :lowercase_mapping, :titlecase_mapping,
                #caches:
                :c_entry_index, :c_decomp_mapping, :c_case_folding

  # Parses one semicolon-separated UnicodeData.txt line.
  # Raises "Could not parse input." if the line (or nil) does not match.
  # Reads the globals $lowercase and $uppercase when a case mapping is empty.
  def initialize(line)
    raise "Could not parse input." unless line =~ /^
      ([0-9A-F]+);        # code
      ([^;]+);            # name
      ([A-Z]+);           # general category
      ([0-9]+);           # canonical combining class
      ([A-Z]+);           # bidi class
      (<([A-Z]*)>)?       # decomposition type
      ((\ ?[0-9A-F]+)*);  # decomposition mapping
      ([0-9]*);           # decimal digit
      ([0-9]*);           # digit
      ([^;]*);            # numeric
      ([YN]*);            # bidi mirrored
      ([^;]*);            # unicode 1.0 name
      ([^;]*);            # iso comment
      ([0-9A-F]*);        # simple uppercase mapping
      ([0-9A-F]*);        # simple lowercase mapping
      ([0-9A-F]*)$/ix     # simple titlecase mapping
    upper, lower, title = $16, $17, $18
    @code = $1.hex
    @name = $2
    @category = $3
    @combining_class = Integer($4)
    @bidi_class = $5
    @decomp_type = $7
    @decomp_mapping = ($8 == '') ? nil : $8.split.collect { |element| element.hex }
    @bidi_mirrored = ($13 == 'Y')
    # issue #130: use nonstandard uppercase ß -> ẞ
    # issue #195: if character is uppercase but has no lowercase mapping,
    #             then make lowercase mapping = itself (vice versa for lowercase)
    @uppercase_mapping =
      if upper != ''
        upper.hex
      elsif code == 0x00df
        0x1e9e
      elsif lower == '' && $lowercase.include?(code)
        code
      end
    @lowercase_mapping =
      if lower != ''
        lower.hex
      elsif upper == '' && $uppercase.include?(code)
        code
      end
    @titlecase_mapping =
      if title != ''
        title.hex
      elsif code == 0x00df
        0x1e9e
      end
  end

  # Case folding for this code point from the global $case_folding table, or
  # nil if there is none.
  def case_folding
    $case_folding[code]
  end

  # Returns this character's row of the utf8proc_properties C table as one
  # line of text. +comb_indicies+ maps code points to their combination index;
  # a code point without one gets UINT16_MAX.
  def c_entry(comb_indicies)
    composition_excluded = $exclusions.include?(code) || $excl_version.include?(code)
    # Categories Zl, Zp, Cc and Cf are control boundaries, except the two code
    # points U+200C and U+200D. The exception must be tested against the code
    # point: testing the category string against this list never matches.
    control_boundary = %W[Zl Zp Cc Cf].include?(category) && ![0x200C, 0x200D].include?(code)
    fields = [
      "{#{str2c category, 'CATEGORY'}",
      combining_class,
      str2c(bidi_class, 'BIDI_CLASS'),
      str2c(decomp_type, 'DECOMP_TYPE'),
      c_decomp_mapping,
      c_case_folding,
      singlecpmap(uppercase_mapping),
      singlecpmap(lowercase_mapping),
      singlecpmap(titlecase_mapping),
      comb_indicies[code] || 'UINT16_MAX',
      bidi_mirrored,
      composition_excluded,
      $ignorable.include?(code),
      control_boundary,
      $charwidth[code],
      0,
      $grapheme_boundclass[code],
      "#{$icb[code]}},\n"
    ]
    " " + fields.join(", ")
  end
end
282
# Read UnicodeData.txt lines from ARGF (files named on the command line, or
# standard input). `chars` keeps the characters in input order, `char_hash`
# indexes them by code point.
#
# A "<Name, First>" line followed by a "<Name, Last>" line describes a whole
# range: every code point in it gets a copy of the properties parsed from the
# "Last" line, named "<Name>".
chars = []
char_hash = {}

while (line = gets)
  range_first = line[/^([0-9A-F]+);<[^;>,]+, First>;/i, 1]
  if range_first
    last_line = gets
    template = UnicodeChar.new(last_line)
    range_last = /^([0-9A-F]+);<([^;>,]+), Last>;/i.match(last_line)
    raise "No last character of sequence found." unless range_last
    range_name = "<#{range_last[2]}>"
    (range_first.hex..range_last[1].hex).each do |cp|
      member = template.clone
      member.code = cp
      member.name = range_name
      char_hash[cp] = member
      chars << member
    end
  else
    single = UnicodeChar.new(line)
    char_hash[single.code] = single
    chars << single
  end
end
308
# Build the composition table from the two-element decomposition mappings.
#
# comb1st_indicies / comb2nd_indicies number the first and second code points
# of those mappings in order of first appearance. comb_array[first][second]
# holds the composed code point. comb2nd_indicies_nonbasic marks second code
# points that take part in at least one composition above 0xFFFF.
#
# The same pass also fills each character's cached decomposition and case
# folding sequence codes.
comb1st_indicies = {}
comb2nd_indicies = {}
comb2nd_indicies_sorted_keys = []
comb2nd_indicies_nonbasic = {}
comb_array = []

chars.each do |char|
  pair_mapping = !char.nil? && char.decomp_type.nil? && char.decomp_mapping &&
                 char.decomp_mapping.length == 2
  first_char = pair_mapping ? char_hash[char.decomp_mapping[0]] : nil
  # Used only when the mapping has no decomposition type, its first code point
  # is a known character with combining class 0, and the character is not in
  # $exclusions.
  if first_char && first_char.combining_class == 0 && !$exclusions.include?(char.code)
    dm0, dm1 = char.decomp_mapping
    comb1st_indicies[dm0] ||= comb1st_indicies.keys.length
    unless comb2nd_indicies[dm1]
      comb2nd_indicies_sorted_keys << dm1
      comb2nd_indicies[dm1] = comb2nd_indicies.keys.length
    end
    row = (comb_array[comb1st_indicies[dm0]] ||= [])
    column = comb2nd_indicies[dm1]
    raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if row[column]
    row[column] = char.code

    comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF
  end
  char.c_decomp_mapping = cpary2c(char.decomp_mapping)
  char.c_case_folding = cpary2c(char.case_folding)
end
339
# For every first code point, find the span of slots its row of the
# composition table actually uses, and give it a start offset in the flattened
# table.
#
# Slots are counted over the second code points in order of first appearance;
# a second code point marked nonbasic occupies two slots, any other one slot.
# Each row takes (last - first + 1) slots plus 2 for its first/last header.
comb_indicies = {}
cumoffset = 0
comb1st_indicies_lastoffsets = []
comb1st_indicies_firstoffsets = []
comb1st_indicies.each do |dm0, index|
  row = comb_array[index]
  lowest = nil
  highest = nil
  slot = 0
  comb2nd_indicies_sorted_keys.each_with_index do |dm1, column|
    width = comb2nd_indicies_nonbasic[dm1] ? 2 : 1
    if row[column]
      lowest ||= slot
      highest = slot + width - 1
    end
    slot += width
  end
  comb1st_indicies_firstoffsets[index] = lowest
  comb1st_indicies_lastoffsets[index] = highest
  raise "double index" if comb_indicies[dm0]
  comb_indicies[dm0] = cumoffset
  cumoffset += highest - lowest + 1 + 2
end
363
# Give every second code point its combination index: bit 0x8000 marks it as a
# second code point, bit 0x4000 marks it as nonbasic, and the low 14 bits hold
# its slot number. Each nonbasic code point shifts all later slots by one.
#
# The slot number must fit below the 0x4000 flag bit, so anything above 0x3FFF
# is rejected. A flagged value of 0xFFFF is rejected too, because c_entry
# writes UINT16_MAX for characters that have no combination index.
offset = 0
comb2nd_indicies_sorted_keys.each do |dm1|
  raise "double index" if comb_indicies[dm1]
  slot = comb2nd_indicies[dm1] + offset
  raise "too large comb index" if slot > 0x3FFF
  comb_indicies[dm1] = 0x8000 | slot
  if comb2nd_indicies_nonbasic[dm1]
    comb_indicies[dm1] = comb_indicies[dm1] | 0x4000
    raise "too large comb index" if comb_indicies[dm1] == 0xFFFF
    offset += 1
  end
end
374
# Collapse identical property rows: `properties` keeps each distinct row once,
# in order of first appearance, and every character records the index of its
# row in c_entry_index.
properties_indicies = {}
properties = []
chars.each do |char|
  row = char.c_entry(comb_indicies)
  row_index = properties_indicies[row]
  if row_index.nil?
    row_index = properties.length
    properties_indicies[row] = row_index
    properties << row
  end
  char.c_entry_index = row_index
end
386
# Build the two-stage lookup table over 0...0x110000 in pages of 0x100 code
# points. A page lists, per code point, its property row index plus one, or 0
# when the code point is not in char_hash. Identical pages are stored once in
# stage2; stage1 holds, per page, the start position of its page in the
# flattened stage2.
stage1 = []
stage2 = []
(0...0x110000).step(0x100) do |page_start|
  page = (page_start...(page_start + 0x100)).map do |cp|
    known = char_hash[cp]
    known ? known.c_entry_index + 1 : 0
  end
  page_index = stage2.index(page)
  if page_index.nil?
    page_index = stage2.length
    stage2 << page
  end
  stage1 << (page_index * 0x100)
end
407
# Write the sequence pool and the two stage tables to standard output as C
# arrays of utf8proc_uint16_t. A line break is inserted before every eighth
# entry, so the first line carries seven values and the following lines eight.
emit_uint16_table = lambda do |table_name, values|
  $stdout << "static const utf8proc_uint16_t #{table_name}[] = {\n "
  values.each_with_index do |entry, position|
    $stdout << "\n " if (position + 1) % 8 == 0
    $stdout << entry << ", "
  end
  $stdout << "};\n\n"
end

emit_uint16_table.call("utf8proc_sequences", $int_array)
emit_uint16_table.call("utf8proc_stage1table", stage1)
emit_uint16_table.call("utf8proc_stage2table", stage2.flatten)
443
# Write the property table to standard output. The first row is a fixed
# all-defaults entry; the de-duplicated rows follow. The stage2 values are
# row index + 1, which leaves position 0 for this fixed row.
$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n" <<
  " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},\n"
properties.each { |row| $stdout << row }
$stdout << "};\n\n"
450
451
452
# Write the flattened composition table to standard output. Each first code
# point contributes its first and last used slot, followed by the composed
# code points for the slots in that span (0 where no composition exists).
# Second code points marked nonbasic take two values: the high 16 bits, then
# the low 16 bits. The counter `i` runs across all rows and breaks the line
# before every eighth value.
$stdout << "static const utf8proc_uint16_t utf8proc_combinations[] = {\n "
i = 0
comb1st_indicies.size.times do |row_index|
  lowest = comb1st_indicies_firstoffsets[row_index]
  highest = comb1st_indicies_lastoffsets[row_index]
  row = comb_array[row_index]
  $stdout << lowest << ", " << highest << ", "
  slot = 0
  comb2nd_indicies_sorted_keys.each_with_index do |dm1, column|
    break if slot > highest
    wide = comb2nd_indicies_nonbasic[dm1]
    if slot >= lowest
      i += 1
      if i == 8
        i = 0
        $stdout << "\n "
      end
      composed = row[column] || 0
      $stdout << ((composed & 0xFFFF0000) >> 16) << ", " if wide
      $stdout << (composed & 0xFFFF) << ", "
    end
    slot += wide ? 2 : 1
  end
  $stdout << "\n"
end
$stdout << "};\n\n"