Mercurial > minori
comparison dep/utf8proc/data/data_generator.rb @ 343:1faa72660932
*: transfer back to cmake from autotools
autotools just made lots of things more complicated than
they should have and many things broke (i.e. translations)
| author | Paper <paper@paper.us.eu.org> |
|---|---|
| date | Thu, 20 Jun 2024 05:56:06 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 342:adb79bdde329 | 343:1faa72660932 |
|---|---|
| 1 #!/usr/bin/env ruby | |
| 2 | |
| 3 # This file was used to generate the 'unicode_data.c' file by parsing the | |
| 4 # Unicode data file 'UnicodeData.txt' of the Unicode Character Database. | |
| 5 # It is included for informational purposes only and not intended for | |
| 6 # production use. | |
| 7 | |
| 8 | |
| 9 # Copyright (c) 2018 Steven G. Johnson, Tony Kelman, Keno Fischer, | |
| 10 # Benito van der Zander, Michaël Meyer, and other contributors. | |
| 11 # Copyright (c) 2009 Public Software Group e. V., Berlin, Germany | |
| 12 # | |
| 13 # Permission is hereby granted, free of charge, to any person obtaining a | |
| 14 # copy of this software and associated documentation files (the "Software"), | |
| 15 # to deal in the Software without restriction, including without limitation | |
| 16 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
| 17 # and/or sell copies of the Software, and to permit persons to whom the | |
| 18 # Software is furnished to do so, subject to the following conditions: | |
| 19 # | |
| 20 # The above copyright notice and this permission notice shall be included in | |
| 21 # all copies or substantial portions of the Software. | |
| 22 # | |
| 23 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 24 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 25 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 26 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 27 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
| 28 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
| 29 # DEALINGS IN THE SOFTWARE. | |
| 30 | |
| 31 | |
| 32 # This file contains derived data from a modified version of the | |
| 33 # Unicode data files. The following license applies to that data: | |
| 34 # | |
| 35 # COPYRIGHT AND PERMISSION NOTICE | |
| 36 # | |
| 37 # Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed | |
| 38 # under the Terms of Use in http://www.unicode.org/copyright.html. | |
| 39 # | |
| 40 # Permission is hereby granted, free of charge, to any person obtaining a | |
| 41 # copy of the Unicode data files and any associated documentation (the "Data | |
| 42 # Files") or Unicode software and any associated documentation (the | |
| 43 # "Software") to deal in the Data Files or Software without restriction, | |
| 44 # including without limitation the rights to use, copy, modify, merge, | |
| 45 # publish, distribute, and/or sell copies of the Data Files or Software, and | |
| 46 # to permit persons to whom the Data Files or Software are furnished to do | |
| 47 # so, provided that (a) the above copyright notice(s) and this permission | |
| 48 # notice appear with all copies of the Data Files or Software, (b) both the | |
| 49 # above copyright notice(s) and this permission notice appear in associated | |
| 50 # documentation, and (c) there is clear notice in each modified Data File or | |
| 51 # in the Software as well as in the documentation associated with the Data | |
| 52 # File(s) or Software that the data or software has been modified. | |
| 53 # | |
| 54 # THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY | |
| 55 # KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
| 56 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF | |
| 57 # THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS | |
| 58 # INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR | |
| 59 # CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF | |
| 60 # USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER | |
| 61 # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR | |
| 62 # PERFORMANCE OF THE DATA FILES OR SOFTWARE. | |
| 63 # | |
| 64 # Except as contained in this notice, the name of a copyright holder shall | |
| 65 # not be used in advertising or otherwise to promote the sale, use or other | |
| 66 # dealings in these Data Files or Software without prior written | |
| 67 # authorization of the copyright holder. | |
| 68 | |
| 69 | |
# Loads the code point sets derived from DerivedCoreProperties.txt:
#   $ignorable - Default_Ignorable_Code_Point
#   $uppercase - Uppercase
#   $lowercase - Lowercase
#   $icb       - Indic_Conjunct_Break value per code point (default NONE)
# The file is read once; each table is a section cut out of that one string.
derived_core_properties = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')

# Returns the first substring of the file matched by +pattern+.
# Raises a RuntimeError naming +label+ when the section is missing, instead of
# failing later with a NoMethodError on nil.
derived_section = lambda do |pattern, label|
  derived_core_properties[pattern] ||
    raise("DerivedCoreProperties.txt: no section found for #{label}")
end

# Expands the entries of +section+ into a flat array of integer code points,
# in file order. A line starting with "XXXX..YYYY" contributes the whole
# inclusive range, a line starting with "XXXX" contributes one code point, and
# every other line (comments, blanks) contributes nothing.
expand_code_points = lambda do |section|
  section.each_line.flat_map do |entry|
    if (range = entry.match(/^([0-9A-F]+)\.\.([0-9A-F]+)/))
      (range[1].hex..range[2].hex).to_a
    elsif (single = entry.match(/^[0-9A-F]+/))
      [single[0].hex]
    else
      []
    end
  end
end

$ignorable_list = derived_section.call(/# Derived Property: Default_Ignorable_Code_Point.*?# Total code points:/m, "Default_Ignorable_Code_Point")
$ignorable = expand_code_points.call($ignorable_list)

$uppercase_list = derived_section.call(/# Derived Property: Uppercase.*?# Total code points:/m, "Uppercase")
$uppercase = expand_code_points.call($uppercase_list)

$lowercase_list = derived_section.call(/# Derived Property: Lowercase.*?# Total code points:/m, "Lowercase")
$lowercase = expand_code_points.call($lowercase_list)

$icb_linker_list = derived_section.call(/# Indic_Conjunct_Break=Linker.*?# Total code points:/m, "Indic_Conjunct_Break=Linker")
$icb_consonant_list = derived_section.call(/# Indic_Conjunct_Break=Consonant.*?# Total code points:/m, "Indic_Conjunct_Break=Consonant")
$icb_extend_list = derived_section.call(/# Indic_Conjunct_Break=Extend.*?# Total code points:/m, "Indic_Conjunct_Break=Extend")

# Code points absent from all three sections read back as ..._NONE.
$icb = Hash.new("UTF8PROC_INDIC_CONJUNCT_BREAK_NONE")
# Applied in this order so that, should a code point appear in more than one
# section, the later section wins (Linker, then Consonant, then Extend).
[
  [$icb_linker_list, "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER"],
  [$icb_consonant_list, "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT"],
  [$icb_extend_list, "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND"]
].each do |section, value|
  expand_code_points.call(section).each { |cp| $icb[cp] = value }
end
| 125 | |
# Grapheme cluster boundary class per code point, from GraphemeBreakProperty.txt.
# Each data line is "XXXX[..YYYY] ; Class"; the class name is upper-cased and
# prefixed with UTF8PROC_BOUNDCLASS_. Unlisted code points read back as OTHER.
$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt", :encoding => 'utf-8')
$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
$grapheme_boundclass_list.each_line do |entry|
  found = entry.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/)
  next unless found
  first_cp = found[1].hex
  last_cp = found[2] ? found[2].hex : first_cp
  boundclass = "UTF8PROC_BOUNDCLASS_" + found[3].upcase
  (first_cp..last_cp).each { |cp| $grapheme_boundclass[cp] = boundclass }
end

# emoji-data.txt overrides the classes set above: Extended_Pictographic code
# points become EXTENDED_PICTOGRAPHIC and Emoji_Modifier code points become
# EXTEND. The trailing \W keeps longer property names that merely start with
# one of these (such as Emoji_Modifier_Base) from matching.
$emoji_data_list = File.read("emoji-data.txt", :encoding => 'utf-8')
emoji_boundclass = {
  "Extended_Pictographic" => "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC",
  "Emoji_Modifier" => "UTF8PROC_BOUNDCLASS_EXTEND"
}
$emoji_data_list.each_line do |entry|
  found = entry.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(Extended_Pictographic|Emoji_Modifier)\W/)
  next unless found
  first_cp = found[1].hex
  last_cp = found[2] ? found[2].hex : first_cp
  boundclass = emoji_boundclass[found[3]]
  (first_cp..last_cp).each { |cp| $grapheme_boundclass[cp] = boundclass }
end

# Display width per code point, from CharWidths.txt ("XXXX[..YYYY] ; N").
# Unlisted code points read back as width 0.
$charwidth_list = File.read("CharWidths.txt", :encoding => 'utf-8')
$charwidth = Hash.new(0)
$charwidth_list.each_line do |entry|
  found = entry.match(/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([0-9]+)/)
  next unless found
  first_cp = found[1].hex
  last_cp = found[2] ? found[2].hex : first_cp
  width = found[3].to_i
  (first_cp..last_cp).each { |cp| $charwidth[cp] = width }
end
| 158 | |
# Composition exclusions, from sections (1) and (2) of CompositionExclusions.txt.
# $exclusions and $excl_version each end up as an array of integer code points.
composition_exclusions = File.read("CompositionExclusions.txt", :encoding => 'utf-8')

# Returns the code points listed at the start of a line inside the section
# matched by +pattern+. Only lines that begin with a hex code point count:
# converting every line with String#hex turns each comment or blank line into
# 0, which wrongly records U+0000 as excluded.
# NOTE(review): with this fix U+0000 is no longer flagged as a composition
# exclusion, so its generated property entry may differ from older output.
exclusion_code_points = lambda do |pattern, label|
  section = composition_exclusions[pattern] ||
            raise("CompositionExclusions.txt: no section found for #{label}")
  section.scan(/^[0-9A-F]+/).map(&:hex)
end

$exclusions = exclusion_code_points.call(/# \(1\) Script Specifics.*?# Total code points:/m, "(1) Script Specifics")

$excl_version = exclusion_code_points.call(/# \(2\) Post Composition Version precomposed characters.*?# Total code points:/m, "(2) Post Composition Version precomposed characters")
| 164 | |
# Case folding table, from CaseFolding.txt: maps a code point to the array of
# code points it folds to. Only status C (common) and F (full) lines are used;
# any line that does not match the pattern is skipped.
$case_folding_string = File.read("CaseFolding.txt", :encoding => 'utf-8')
$case_folding = {}
$case_folding_string.each_line do |line|
  found = line.match(/([0-9A-F]+); [CF]; ([0-9A-F ]+);/i)
  next unless found
  $case_folding[found[1].hex] = found[2].split(" ").map(&:hex)
end
| 171 | |
# Flat pool of 16-bit values that pushary appends to; it is written out as the
# utf8proc_sequences table.
$int_array = []
# Maps an already-stored sequence (an Array) to its start offset in $int_array,
# so that identical sequences are stored only once.
$int_array_indicies = {}
| 174 | |
# Builds the C enum constant name for a property value.
#
# string - the property value (e.g. "Lu"), or nil when the property is absent.
# prefix - the constant family (e.g. 'CATEGORY').
#
# Returns "0" for nil, otherwise "UTF8PROC_<prefix>_<STRING upper-cased>".
def str2c(string, prefix)
  string.nil? ? "0" : "UTF8PROC_#{prefix}_#{string.upcase}"
end
# Stores +array+ in the shared $int_array pool, unless an equal array was
# stored before, and returns the offset at which it starts.
#
# array - Array of integers to append.
#
# Returns the Integer start offset of the sequence inside $int_array.
def pushary(array)
  $int_array_indicies.fetch(array) do
    start = $int_array.length
    $int_array_indicies[array] = start
    $int_array.concat(array)
    start
  end
end
# Converts an array of code points into UTF-16 code units.
#
# array - Array of Integer code points.
#
# Code points up to 0xFFFF are passed through; larger ones become a
# high/low surrogate pair.
#
# Returns a flat Array of 16-bit Integers.
# Raises RuntimeError when a code point is itself a surrogate (0xD800..0xDFFF)
# or lies outside 0..0x10FFFF. Without the range check an out-of-range value
# would be packed into units that are not a valid surrogate pair.
def cpary2utf16encoded(array)
  array.flat_map do |cp|
    raise "code point out of range: #{cp}" unless (0..0x10FFFF).cover?(cp)
    if cp <= 0xFFFF
      raise "utf-16 code: #{cp}" if cp & 0b1111100000000000 == 0b1101100000000000
      cp
    else
      offset = cp - 0x10000
      [(offset >> 10) | 0b1101100000000000, (offset & 0b0000001111111111) | 0b1101110000000000]
    end
  end
end
# Encodes a code point sequence as a 16-bit reference into the sequence pool.
#
# array - Array of Integer code points, or nil.
#
# The result packs the pool offset into the low 14 bits and a length code into
# the top 2 bits. The length code is (number of code points - 1); when that is
# 3 or more, the real value is stored as an extra first element of the pooled
# sequence and the length code is set to 3.
#
# Returns "UINT16_MAX" for nil or an empty array, otherwise the packed value as
# a decimal String.
# Raises RuntimeError when the offset does not fit in 14 bits, or when the
# packed value would equal 0xFFFF and so be indistinguishable from the
# UINT16_MAX "no sequence" marker.
def cpary2c(array)
  return "UINT16_MAX" if array.nil? || array.empty?
  lencode = array.length - 1 # no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
  units = cpary2utf16encoded(array)
  if lencode >= 3 # we have only 2 bits for the length
    units = [lencode] + units
    lencode = 3
  end
  idx = pushary(units)
  raise "Array index out of bound" if idx > 0x3FFF
  packed = idx | (lencode << 14)
  raise "Array index out of bound" if packed == 0xFFFF
  packed.to_s
end
# Encodes a single code point mapping as an offset into the sequence pool.
#
# cp - Integer code point, or nil when there is no mapping.
#
# Returns "UINT16_MAX" for nil, otherwise the pool offset as a decimal String.
# Raises RuntimeError when the offset is 0xFFFF or larger: 0xFFFF itself is
# rejected because it would read back as the UINT16_MAX "no mapping" marker.
def singlecpmap(cp)
  return "UINT16_MAX" if cp.nil?
  idx = pushary(cpary2utf16encoded([cp]))
  raise "Array index out of bound" if idx >= 0xFFFF
  idx.to_s
end
| 217 | |
# One record of UnicodeData.txt, plus the cached C-table values derived from it.
#
# Reads the globals $lowercase and $uppercase while parsing, and $case_folding,
# $exclusions, $excl_version, $ignorable, $charwidth, $grapheme_boundclass and
# $icb when rendering; it also calls the top-level helpers str2c and singlecpmap.
class UnicodeChar
  attr_accessor :code, :name, :category, :combining_class, :bidi_class,
                :decomp_type, :decomp_mapping,
                :bidi_mirrored,
                :uppercase_mapping, :lowercase_mapping, :titlecase_mapping,
                # caches:
                :c_entry_index, :c_decomp_mapping, :c_case_folding

  # Parses one semicolon-separated UnicodeData.txt line.
  #
  # line - String holding the record (nil is treated as unparsable).
  #
  # Raises RuntimeError ("Could not parse input.") when the line does not
  # have the expected shape.
  def initialize(line)
    raise "Could not parse input." unless line =~ /^
      ([0-9A-F]+);        # code
      ([^;]+);            # name
      ([A-Z]+);           # general category
      ([0-9]+);           # canonical combining class
      ([A-Z]+);           # bidi class
      (<([A-Z]*)>)?       # decomposition type
      ((\ ?[0-9A-F]+)*);  # decomposition mapping
      ([0-9]*);           # decimal digit
      ([0-9]*);           # digit
      ([^;]*);            # numeric
      ([YN]*);            # bidi mirrored
      ([^;]*);            # unicode 1.0 name
      ([^;]*);            # iso comment
      ([0-9A-F]*);        # simple uppercase mapping
      ([0-9A-F]*);        # simple lowercase mapping
      ([0-9A-F]*)$/ix     # simple titlecase mapping
    fields = $~
    @code = fields[1].hex
    @name = fields[2]
    @category = fields[3]
    @combining_class = Integer(fields[4])
    @bidi_class = fields[5]
    @decomp_type = fields[7]
    @decomp_mapping = fields[8].empty? ? nil : fields[8].split.map(&:hex)
    @bidi_mirrored = fields[13] == 'Y'

    upper = fields[16]
    lower = fields[17]
    title = fields[18]
    # issue #130: use nonstandard uppercase ß -> ẞ
    # issue #195: if character is uppercase but has no lowercase mapping,
    #             then make lowercase mapping = itself (vice versa for lowercase)
    @uppercase_mapping =
      if !upper.empty?
        upper.hex
      elsif @code == 0x00df
        0x1e9e
      elsif lower.empty? && $lowercase.include?(@code)
        @code
      end
    @lowercase_mapping =
      if !lower.empty?
        lower.hex
      elsif upper.empty? && $uppercase.include?(@code)
        @code
      end
    @titlecase_mapping =
      if !title.empty?
        title.hex
      elsif @code == 0x00df
        0x1e9e
      end
  end

  # Returns the case folding of this code point (Array of code points), or nil
  # when $case_folding has no entry for it.
  def case_folding
    $case_folding[code]
  end

  # Renders this character as one initializer line of the C properties table.
  #
  # comb_indicies - Hash mapping a code point to its composition index; code
  #                 points without an entry are rendered as UINT16_MAX.
  #
  # c_decomp_mapping and c_case_folding must have been assigned beforehand.
  #
  # Returns the line as a String ending in ",\n".
  def c_entry(comb_indicies)
    comp_exclusion = $exclusions.include?(code) || $excl_version.include?(code)
    # ZWNJ (U+200C) and ZWJ (U+200D) have category Cf but are exempted here.
    # The exemption has to test the code point: testing +category+ (a String)
    # against these integers is always false, so the exemption never applied.
    control_boundary = %W[Zl Zp Cc Cf].include?(category) &&
                       ![0x200C, 0x200D].include?(code)
    fields = [
      str2c(category, 'CATEGORY'),
      combining_class,
      str2c(bidi_class, 'BIDI_CLASS'),
      str2c(decomp_type, 'DECOMP_TYPE'),
      c_decomp_mapping,
      c_case_folding,
      singlecpmap(uppercase_mapping),
      singlecpmap(lowercase_mapping),
      singlecpmap(titlecase_mapping),
      comb_indicies[code] || 'UINT16_MAX',
      bidi_mirrored,
      comp_exclusion,
      $ignorable.include?(code),
      control_boundary,
      $charwidth[code],
      0,
      $grapheme_boundclass[code],
      $icb[code]
    ]
    " {#{fields.join(', ')}},\n"
  end
end
| 282 | |
# Reads UnicodeData.txt records from the script's input (Kernel#gets) into:
#   chars     - every UnicodeChar, in input order
#   char_hash - the same objects keyed by code point
# A "<Name, First>" record followed by a "<Name, Last>" record stands for a
# whole range: the Last record is parsed once and copied for every code point
# from first to last, with the name set to "<Name>".
chars = []
char_hash = {}

while (line = gets)
  if line =~ /^([0-9A-F]+);<[^;>,]+, First>;/i
    range_start = $1.hex
    line = gets
    # Checked before parsing so that a missing or misplaced Last record is
    # reported as such rather than as a generic parse failure.
    raise "No last character of sequence found." unless
      line =~ /^([0-9A-F]+);<([^;>,]+), Last>;/i
    range_end = $1.hex
    range_name = "<#{$2}>"
    template = UnicodeChar.new(line)
    (range_start..range_end).each do |cp|
      member = template.clone
      member.code = cp
      member.name = range_name
      char_hash[cp] = member
      chars << member
    end
  else
    char = UnicodeChar.new(line)
    char_hash[char.code] = char
    chars << char
  end
end
| 308 | |
# Builds the canonical composition table and caches each character's encoded
# decomposition and case folding.
#   comb1st_indicies             - first code point of a pair  => row number
#   comb2nd_indicies             - second code point of a pair => column number
#   comb2nd_indicies_sorted_keys - second code points in column order
#   comb2nd_indicies_nonbasic    - second code points that have at least one
#                                  composition above 0xFFFF
#   comb_array[row][column]      - the composed code point
comb1st_indicies = {}
comb2nd_indicies = {}
comb2nd_indicies_sorted_keys = []
comb2nd_indicies_nonbasic = {}
comb_array = []

chars.each do |char|
  mapping = char.decomp_mapping
  starter = mapping && char_hash[mapping[0]]
  # A character takes part in composition when it has an untyped two-element
  # decomposition whose first element is a known character with combining
  # class 0, and it is not listed in $exclusions.
  if char.decomp_type.nil? && mapping && mapping.length == 2 &&
     starter && starter.combining_class == 0 &&
     !$exclusions.include?(char.code)

    dm0, dm1 = mapping
    # Rows and columns are numbered in order of first appearance.
    comb1st_indicies[dm0] ||= comb1st_indicies.size
    unless comb2nd_indicies[dm1]
      comb2nd_indicies_sorted_keys << dm1
      comb2nd_indicies[dm1] = comb2nd_indicies.size
    end
    row = (comb_array[comb1st_indicies[dm0]] ||= [])
    column = comb2nd_indicies[dm1]
    raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if row[column]
    row[column] = char.code

    comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF
  end
  char.c_decomp_mapping = cpary2c(mapping)
  char.c_case_folding = cpary2c(char.case_folding)
end
| 339 | |
# Assigns each code point that takes part in composition its 16-bit index.
#   comb_indicies[first]  - start of that row's slice in the combinations
#                           table; kept below 0x8000
#   comb_indicies[second] - 0x8000 | column offset (14 bits), with 0x4000 also
#                           set when the column holds two 16-bit units per entry
# Each row stores its first and last used offset followed by the entries in
# between, hence the "+ 2" when advancing cumoffset.
comb_indicies = {}
cumoffset = 0
comb1st_indicies_lastoffsets = []
comb1st_indicies_firstoffsets = []
comb1st_indicies.each do |dm0, index|
  first_offset = nil
  last_offset = nil
  offset = 0
  comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
    # Columns flagged as nonbasic occupy two 16-bit units, the others one.
    units = comb2nd_indicies_nonbasic[dm1] ? 2 : 1
    if comb_array[index][b]
      first_offset ||= offset
      last_offset = offset + units - 1
    end
    offset += units
  end
  comb1st_indicies_firstoffsets[index] = first_offset
  comb1st_indicies_lastoffsets[index] = last_offset
  raise "double index" if comb_indicies[dm0]
  # A row start of 0x8000 or more would carry the flag bit that marks a
  # second-character index.
  raise "too large comb index" if cumoffset >= 0x8000
  comb_indicies[dm0] = cumoffset
  cumoffset += last_offset - first_offset + 1 + 2
end

offset = 0
comb2nd_indicies_sorted_keys.each do |dm1|
  raise "double index" if comb_indicies[dm1]
  position = comb2nd_indicies[dm1] + offset
  # The offset shares the word with the 0x8000 and 0x4000 flags, so it must fit
  # in 14 bits; 0x4000 itself is already too large.
  raise "too large comb index" if position > 0x3FFF
  packed = 0x8000 | position
  if comb2nd_indicies_nonbasic[dm1]
    packed |= 0x4000
    offset += 1
  end
  # 0xFFFF is what c_entry emits (as UINT16_MAX) for "no composition index".
  raise "too large comb index" if packed == 0xFFFF
  comb_indicies[dm1] = packed
end
| 374 | |
# De-duplicates the rendered property entries.
#   properties          - the distinct C initializer lines, in order of first use
#   properties_indicies - rendered line => its position in properties
# Each character remembers the position of its line in c_entry_index.
properties_indicies = {}
properties = []
chars.each do |char|
  c_entry = char.c_entry(comb_indicies)
  position = properties_indicies[c_entry]
  if position.nil?
    position = properties.length
    properties_indicies[c_entry] = position
    properties << c_entry
  end
  char.c_entry_index = position
end
| 386 | |
# Builds the two-stage lookup table over the range 0...0x110000.
#   stage2 - the distinct blocks of 0x100 property indices; an index is the
#            character's c_entry_index + 1, with 0 meaning "no character"
#   stage1 - for each block of 0x100 code points, the start offset of its
#            block within the flattened stage2
stage1 = []
stage2 = []
# Block contents => position in stage2, so identical blocks are found without
# rescanning stage2 each time.
stage2_positions = {}
(0...0x110000).step(0x100) do |block_start|
  stage2_entry = (block_start...(block_start + 0x100)).map do |cp|
    known = char_hash[cp]
    known ? known.c_entry_index + 1 : 0
  end
  position = stage2_positions[stage2_entry]
  if position.nil?
    position = stage2.length
    stage2_positions[stage2_entry] = position
    stage2 << stage2_entry
  end
  stage1 << (position * 0x100)
end
| 407 | |
# Writes one C array of 16-bit values to $stdout. A line break is inserted
# before every 8th value (so the first line carries 7 values and later lines
# carry 8), and every value is followed by ", ".
emit_uint16_table = lambda do |name, values|
  $stdout << "static const utf8proc_uint16_t #{name}[] = {\n "
  values.each_with_index do |value, position|
    $stdout << "\n " if (position + 1) % 8 == 0
    $stdout << value << ", "
  end
  $stdout << "};\n\n"
end

emit_uint16_table.call("utf8proc_sequences", $int_array)
emit_uint16_table.call("utf8proc_stage1table", stage1)
emit_uint16_table.call("utf8proc_stage2table", stage2.flatten)

# Entry 0 of the properties table is a fixed default entry; stage2 uses index 0
# for code points that have no character record.
$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},\n"
properties.each { |line| $stdout << line }
$stdout << "};\n\n"
| 450 | |
| 451 | |
| 452 | |
# Writes the combinations table: for each first-character row, its first and
# last used offset, then the composed code points for the columns between those
# offsets (0 where no composition exists). Columns flagged as nonbasic are
# written as two 16-bit values (high half, then low half).
$stdout << "static const utf8proc_uint16_t utf8proc_combinations[] = {\n "
# Counts entries across all rows; a line break is inserted before every 8th.
i = 0
comb1st_indicies.size.times do |a|
  offset = 0
  $stdout << comb1st_indicies_firstoffsets[a] << ", " << comb1st_indicies_lastoffsets[a] << ", "
  comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
    break if offset > comb1st_indicies_lastoffsets[a]
    wide = comb2nd_indicies_nonbasic[dm1]
    if offset >= comb1st_indicies_firstoffsets[a]
      i += 1
      if i == 8
        i = 0
        $stdout << "\n "
      end
      v = comb_array[a][b] || 0
      $stdout << ((v & 0xFFFF0000) >> 16) << ", " if wide
      $stdout << (v & 0xFFFF) << ", "
    end
    offset += wide ? 2 : 1
  end
  $stdout << "\n"
end
$stdout << "};\n\n"
