Mercurial > minori
view dep/anitomy/dep/srell/unicode/updataout3.cpp @ 347:a0aa8c8c4307
dep/anitomy: port to use UCS-4 rather than wide strings
rationale: wide strings are not the same on every platform, and
might not even be Unicode. (while they usually are, it's possible
that they are not)
I was *going* to change StringToInt to use a string stream, but
outputting to an integer doesn't seem to work at all with UCS-4,
even though it ought to, so I just rolled my own that uses the
arabic digits only.
author | Paper <paper@paper.us.eu.org> |
---|---|
date | Sun, 23 Jun 2024 10:32:09 -0400 |
parents | |
children |
line wrap: on
line source
//
//  updataout.cpp: version 3.002 (2023/12/29).
//
//  This is a program that generates srell_updata3.h from:
//    DerivedCoreProperties.txt
//    DerivedNormalizationProps.txt
//    PropList.txt
//    PropertyValueAliases.txt
//    ScriptExtensions.txt
//    Scripts.txt
//    UnicodeData.txt
//    emoji-data.txt
//    emoji-sequences.txt
//    emoji-zwj-sequences.txt
//  provided by the Unicode Consortium. The latest versions of them are
//  available at:
//    emoji-data.txt: http://www.unicode.org/Public/UNIDATA/emoji/
//    emoji-sequences.txt and emoji-zwj-sequences.txt:
//      http://www.unicode.org/Public/emoji/
//    others: http://www.unicode.org/Public/UNIDATA/
//

#include <cstdio>
#include <cstdlib>
#include <cstdarg>
#include <string>
#include <vector>
#include <map>
#include <stdexcept>
#include <algorithm>	//  For std::swap in C++98/03
#include <utility>	//  For std::swap in C++11-

#define SRELL_NO_UNICODE_DATA
#include "../srell.hpp"

#if defined(_MSC_VER) && _MSC_VER >= 1400
#pragma warning(disable:4996)
#endif

//  Name tables. Every entry is a ':'-separated list of names for one
//  property or property value; the code that consumes these tables treats
//  the first name as the canonical one. Each table is terminated by an
//  empty string, which the readers use as the end marker.
namespace updata
{
//  Non-binary property names.
static const char *const property_names[] = {	//  3
	"General_Category:gc",
	"Script:sc",
	"Script_Extensions:scx",
	""
};

//  Binary property names; the trailing comment on each entry names the
//  data file the property is read from.
static const char *const binary_property_names[] = {	//  53 (52+1)
	//  *1: http://unicode.org/reports/tr18/#General_Category_Property
	//  *2: 9th field in UnicodeData.txt
	"ASCII",								//  *1
	"ASCII_Hex_Digit:AHex",					//  PropList.txt
	"Alphabetic:Alpha",						//  DerivedCoreProperties.txt
	"Any",									//  *1
	"Assigned",								//  *1
	"Bidi_Control:Bidi_C",					//  PropList.txt
	"Bidi_Mirrored:Bidi_M",					//  *2
	"Case_Ignorable:CI",					//  DerivedCoreProperties.txt
	"Cased",								//  DerivedCoreProperties.txt
	"Changes_When_Casefolded:CWCF",			//  DerivedCoreProperties.txt
	"Changes_When_Casemapped:CWCM",			//  DerivedCoreProperties.txt
	"Changes_When_Lowercased:CWL",			//  DerivedCoreProperties.txt
	"Changes_When_NFKC_Casefolded:CWKCF",	//  DerivedNormalizationProps.txt
	"Changes_When_Titlecased:CWT",			//  DerivedCoreProperties.txt
	"Changes_When_Uppercased:CWU",			//  DerivedCoreProperties.txt
	"Dash",									//  PropList.txt
	"Default_Ignorable_Code_Point:DI",		//  DerivedCoreProperties.txt
	"Deprecated:Dep",						//  PropList.txt
	"Diacritic:Dia",						//  PropList.txt
	"Emoji",								//  emoji-data.txt
	"Emoji_Component:EComp",				//  emoji-data.txt
	"Emoji_Modifier:EMod",					//  emoji-data.txt
	"Emoji_Modifier_Base:EBase",			//  emoji-data.txt
	"Emoji_Presentation:EPres",				//  emoji-data.txt
	"Extended_Pictographic:ExtPict",		//  emoji-data.txt
	"Extender:Ext",							//  PropList.txt
	"Grapheme_Base:Gr_Base",				//  DerivedCoreProperties.txt
	"Grapheme_Extend:Gr_Ext",				//  DerivedCoreProperties.txt
	"Hex_Digit:Hex",						//  PropList.txt
	"IDS_Binary_Operator:IDSB",				//  PropList.txt
	"IDS_Trinary_Operator:IDST",			//  PropList.txt
	"ID_Continue:IDC",						//  DerivedCoreProperties.txt
	"ID_Start:IDS",							//  DerivedCoreProperties.txt
	"Ideographic:Ideo",						//  PropList.txt
	"Join_Control:Join_C",					//  PropList.txt
	"Logical_Order_Exception:LOE",			//  PropList.txt
	"Lowercase:Lower",						//  DerivedCoreProperties.txt
	"Math",									//  DerivedCoreProperties.txt
	"Noncharacter_Code_Point:NChar",		//  PropList.txt
	"Pattern_Syntax:Pat_Syn",				//  PropList.txt
	"Pattern_White_Space:Pat_WS",			//  PropList.txt
	"Quotation_Mark:QMark",					//  PropList.txt
	"Radical",								//  PropList.txt
	"Regional_Indicator:RI",				//  PropList.txt
	"Sentence_Terminal:STerm",				//  PropList.txt
	"Soft_Dotted:SD",						//  PropList.txt
	"Terminal_Punctuation:Term",			//  PropList.txt
	"Unified_Ideograph:UIdeo",				//  PropList.txt
	"Uppercase:Upper",						//  DerivedCoreProperties.txt
	"Variation_Selector:VS",				//  PropList.txt
	"White_Space:space",					//  PropList.txt
	"XID_Continue:XIDC",					//  DerivedCoreProperties.txt
	"XID_Start:XIDS",						//  DerivedCoreProperties.txt
	//  ECMAScript 2019/Unicode 11:
	//    "Extended_Pictographic:ExtPict",
	//  ECMAScript 2021/Unicode 13:
	//    Aliases: EComp, EMod, EBase, EPres, and ExtPict
	""
};

//  Properties of emoji sequences. "RGI_Emoji" has to stay the first entry:
//  combine_pos() stores the composite class into element [0] of its output.
static const char *const emoseq_property_names[] = {
	"RGI_Emoji",
	"Basic_Emoji",					//  emoji-sequences.txt
	"Emoji_Keycap_Sequence",		//  emoji-sequences.txt
	"RGI_Emoji_Modifier_Sequence",	//  emoji-sequences.txt
	"RGI_Emoji_Flag_Sequence",		//  emoji-sequences.txt
"RGI_Emoji_Tag_Sequence", // emoji-sequences.txt "RGI_Emoji_ZWJ_Sequence", // emoji-zwj-sequences.txt "" }; static const char *const gc_values[] = { // 38 "Other:C", "Control:Cc:cntrl", "Format:Cf", "Unassigned:Cn", "Private_Use:Co", "Surrogate:Cs", "Letter:L", "Cased_Letter:LC", "Lowercase_Letter:Ll", "Titlecase_Letter:Lt", "Uppercase_Letter:Lu", "Modifier_Letter:Lm", "Other_Letter:Lo", "Mark:M:Combining_Mark", "Spacing_Mark:Mc", "Enclosing_Mark:Me", "Nonspacing_Mark:Mn", "Number:N", "Decimal_Number:Nd:digit", "Letter_Number:Nl", "Other_Number:No", "Punctuation:P:punct", "Connector_Punctuation:Pc", "Dash_Punctuation:Pd", "Close_Punctuation:Pe", "Final_Punctuation:Pf", "Initial_Punctuation:Pi", "Other_Punctuation:Po", "Open_Punctuation:Ps", "Symbol:S", "Currency_Symbol:Sc", "Modifier_Symbol:Sk", "Math_Symbol:Sm", "Other_Symbol:So", "Separator:Z", "Line_Separator:Zl", "Paragraph_Separator:Zp", "Space_Separator:Zs", "" }; } // namespace updata namespace unishared { template <typename T> std::string to_string(T value, int radix = 10, const int precision = 1) { std::string num; if (radix >= 2 && radix <= 16) { typedef typename std::string::size_type size_type; const bool minus = value < 0 ? (value = 0 - value, true) : false; for (; value; value /= radix) num.push_back("0123456789ABCDEF"[value % radix]); if (precision > 0 && num.size() < static_cast<size_type>(precision)) num.append(static_cast<size_type>(precision) - num.size(), static_cast<char>('0')); if (minus) num.push_back(static_cast<char>('-')); const size_type mid = num.size() / 2; for (size_type i = 0; i < mid; ++i) std::swap(num[i], num[num.size() - i - 1]); } return num; } void throw_error(const char *const s, ...) { char buffer[256]; va_list va; va_start(va, s); std::vsprintf(buffer, s, va); va_end(va); throw std::runtime_error(buffer); } void read_file(std::string &str, const char *const filename, const char *const dir) { const std::string path(std::string(dir ? 
dir : "") + filename); FILE *const fp = std::fopen(path.c_str(), "r"); std::fprintf(stdout, "Reading '%s'... ", path.c_str()); if (fp) { static const std::size_t bufsize = 4096; char *const buffer = static_cast<char *>(std::malloc(bufsize)); if (buffer) { for (;;) { const std::size_t size = std::fread(buffer, 1, bufsize, fp); if (!size) break; str.append(buffer, size); } std::fclose(fp); std::fputs("done.\n", stdout); std::free(buffer); return; } } std::fputs("failed...", stdout); throw_error("could not open!"); } bool write_file(const char *const filename, const std::string &str) { FILE *const fp = std::fopen(filename, "wb"); std::fprintf(stdout, "Writing '%s'... ", filename); if (fp) { const bool success = std::fwrite(str.c_str(), 1, str.size(), fp) == str.size(); std::fclose(fp); if (success) { std::fputs("done.\n", stdout); return true; } } std::fputs("failed...\n", stdout); return false; } } // namespace unishared struct up_options { const char *outfilename; const char *indir; int version; int errorno; up_options(const int argc, const char *const *const argv) : outfilename("srell_updata3.h") , indir("") , version(301) , errorno(0) { for (int index = 1; index < argc; ++index) { const char firstchar = argv[index][0]; if (firstchar == '-' || firstchar == '/') { const char *const option = argv[index] + 1; if (std::strcmp(option, "o") == 0) { if (index + 1 >= argc) goto NO_ARGUMENT; outfilename = argv[++index]; } else if (std::strcmp(option, "v") == 0) { if (index + 1 >= argc) goto NO_ARGUMENT; version = static_cast<int>(std::strtod(argv[++index], NULL) * 100.0 + 0.5); } else if (std::strcmp(option, "i") == 0 || std::strcmp(option, "id") == 0) { if (index + 1 >= argc) goto NO_ARGUMENT; indir = argv[++index]; } else if (std::strcmp(option, "?") == 0 || std::strcmp(option, "h") == 0) { std::fputs("Usage: updataout2 [options]\nOptions:\n", stdout); std::fputs(" -i <DIRECTORY>\tSame as -id.\n", stdout); std::fputs(" -id <DIRECTORY>\tAssume that input files exist in 
<DIRECTORY>.\n\t\t\t<DIRECTORY> must ends with '/' or '\\'.\n", stdout); std::fputs(" -o <FILE>\t\tOutput to <FILE>.\n", stdout); // std::fputs(" -v <VERNO>\t\tOutput in the version VERNO format.\n", stdout); errorno = 1; return; } else goto UNKNOWN_OPTION; continue; NO_ARGUMENT: std::fprintf(stdout, "[Error] no argument for \"%s\" specified.\n", argv[index]); errorno = -2; } else { UNKNOWN_OPTION: std::fprintf(stdout, "[Error] unknown option \"%s\" found.\n", argv[index]); errorno = -1; } } } }; // struct up_options class unicode_property { public: unicode_property() : re_colon_(":") { } int create_updata(std::string &outdata, const up_options &opts) { int errorno = opts.errorno; const char *const unidatafilename = "UnicodeData.txt"; const char *const propdatafiles[] = { "PropList.txt", "DerivedCoreProperties.txt", "emoji-data.txt", "DerivedNormalizationProps.txt", "" }; const char *const emodatafiles[] = { "emoji-sequences.txt", "emoji-zwj-sequences.txt", "" }; const char *const scfilename = "Scripts.txt"; const char *const scxfilename = "ScriptExtensions.txt"; const char *const pvafilename = "PropertyValueAliases.txt"; canonicalname_mapper scriptname_maps; strings_type scriptname_aliases; std::string licensetext; rangeholder general_category_values; rangeholder binary_properties; seqholder emoseq_properties; rangeholder scripts; rangeholder scriptextensions; sortedrangeholder combined_properties; sortedseqholder combined_pos; // scriptnameholder ucs_to_scriptname; // codepoint->scriptname. if (errorno) return errorno; try { licensetext = "// "; licensetext += unidatafilename; licensetext += "\n//\n"; read_unidata(general_category_values, binary_properties, unidatafilename, opts.indir); set_additionalbinprops(binary_properties, general_category_values); // for ASCII, Any, Cn. create_compositecategories(general_category_values); // This needs "Cn". 
read_binprops(binary_properties, licensetext, propdatafiles, opts.indir); #if !defined(SRELL_NO_VMODE) read_emoseq(emoseq_properties, licensetext, emodatafiles, opts.indir); #endif read_scriptnames(scriptname_maps, scriptname_aliases, licensetext, scfilename, pvafilename, opts); read_scripts(scripts, licensetext, scfilename, opts.indir); scriptextensions = scripts; modify_for_scx(scriptextensions, scriptname_maps, licensetext, scxfilename, opts.indir); combine_properties(combined_properties, general_category_values, "gc", updata::gc_values); combine_properties(combined_properties, binary_properties, "bp", updata::binary_property_names); combine_properties(combined_properties, scripts, "sc", scriptname_aliases); combine_properties(combined_properties, scriptextensions, "scx", scriptname_aliases); #if !defined(SRELL_NO_VMODE) combine_pos(combined_pos, emoseq_properties, "bp", updata::emoseq_property_names); #endif do_formatting(outdata, combined_properties, combined_pos, opts.version); licensetext.append(1, '\n'); outdata.insert(0, licensetext); } catch (srell::regex_error &e) { std::printf("\nError: %s,%d\n", e.what(), e.code()); errorno = 1; } catch (std::runtime_error &e) { std::printf("\nError: %s\n", e.what()); errorno = 2; } return errorno; } private: typedef srell::re_detail::ui_l32 ui_l32; typedef srell::re_detail::range_pairs ucprange_array; typedef srell::re_detail::range_pair u32pair; typedef u32pair ucprange; typedef srell::re_detail::range_pair_helper u32rp_helper; typedef u32rp_helper ucprange_helper; typedef std::map<std::string, ucprange_array> rangeholder; typedef srell::re_detail::simple_array<ui_l32> u32array; typedef std::map<std::string, u32array> seqholder; typedef std::vector<std::string> strings_type; typedef std::vector<srell::csub_match> matchranges_type; typedef std::map<ui_l32, std::string> scriptnameholder; typedef std::map<std::string, std::string> name_mapper; typedef std::map<std::string, ui_l32> namenumber_mapper; typedef name_mapper 
canonicalname_mapper; static const ui_l32 invalid_u32value = srell::re_detail::constants::invalid_u32value; static const ui_l32 compositeclass = invalid_u32value; struct sorted_name_and_ranges { std::string ptype; std::string canonicalname; std::string namealiases; ucprange_array ucpranges; }; typedef std::vector<sorted_name_and_ranges> sortedrangeholder; struct sorted_name_and_seqs { std::string ptype; std::string canonicalname; std::string namealiases; u32array ucpseqs; }; typedef std::vector<sorted_name_and_seqs> sortedseqholder; void split2(matchranges_type &parts, const std::string &data, const char splitter) { std::string::size_type readpos = 0; srell::csub_match csm; csm.matched = true; for (;;) { std::string::size_type lineend = data.find(splitter, readpos); csm.first = data.data() + readpos; if (lineend == std::string::npos) { csm.second = data.data() + data.size(); parts.push_back(csm); break; } csm.second = data.data() + lineend; parts.push_back(csm); ++lineend; readpos = lineend; } } std::string join(const char c, const strings_type &parts, const bool add_final_also = false) { std::string out; for (strings_type::size_type i = 0; i < parts.size(); ++i) out.append(parts[i] + c); if (!add_final_also && out.size()) out.resize(out.size() - 1); return out; } void read_unidata(rangeholder &gc, rangeholder &bp, const char *const unidatafilename, const char *const indir) { const srell::regex re_dataline("^([0-9A-F]+);([^;]*);(([^;]*);(?:[^;]*;){6}([^;]*)(?:;[^;]*){5})$"); const srell::regex re_rangefirst("^<(.*), First>$"); const std::string stringY("Y"); const std::string stringN("N"); ui_l32 prevucp = invalid_u32value; std::string data; matchranges_type lines; srell::cmatch cmatch; // matchranges_type parts; std::string rangename; std::string rangefirstproperty; ui_l32 rangefirstcp = 0; ucprange range; ucprange_array bidi_mirrored_ranges; unishared::read_file(data, unidatafilename, indir); split2(lines, data, '\n'); for (matchranges_type::size_type i = 0; i < 
lines.size(); ++i) { const srell::csub_match &line = lines[i]; if (srell::regex_match(line.first, line.second, cmatch, re_dataline)) { const srell::cmatch::value_type &codepoint = cmatch[1]; const srell::cmatch::value_type &name = cmatch[2]; const std::string name_string(name.str()); const std::string property(cmatch[3].str()); range.first = range.second = static_cast<ui_l32>(std::strtol(codepoint.first, NULL, 16)); if (prevucp >= range.first && prevucp != invalid_u32value) unishared::throw_error("Out of order: %.4lX >= %.4lX", prevucp, range.first); // parts.clear(); // split2(parts, property, ';'); // if (parts.size() != 13) // unishared::throw_error("number of fields is not 13, but %u\n\t[%s]", parts.size(), line.str().c_str()); // const std::string &general_category = parts[0]; // const std::string &bidi_mirrored = parts[7]; const std::string general_category(cmatch[4].str()); const std::string bidi_mirrored(cmatch[5].str()); prevucp = range.first; if (rangename.size()) { if (name_string.compare("<" + rangename + ", Last>") != 0) unishared::throw_error("<%s, Last> does not follow its First line.\n\t%s follows insteadly.", rangename.c_str(), name_string.c_str()); if (property != rangefirstproperty) { unishared::throw_error("\"%s\": properties of First and Last are different.\n\tFirst: %s\n\tLast: %s", rangename.c_str(), rangefirstproperty.c_str(), property.c_str()); } range.first = rangefirstcp; rangename.clear(); } else if (srell::regex_match(name.first, name.second, cmatch, re_rangefirst)) { rangename = cmatch[1]; rangefirstproperty = property; rangefirstcp = range.first; continue; } // Registers "general_category" value. gc[general_category].join(range); // Registers "bidi_mirrored" value. 
if (bidi_mirrored == stringY) { bidi_mirrored_ranges.join(range); } else if (bidi_mirrored != stringN) unishared::throw_error("Unknown Bidi_Mirrored value [%s] in %s.", bidi_mirrored.c_str(), line.str().c_str()); } else if (line.first != line.second) unishared::throw_error("Unknown format [%s]", line.str().c_str()); } bp["Bidi_Mirrored"] = bidi_mirrored_ranges; } void read_scriptnames(canonicalname_mapper &sn_maps, strings_type &sn_aliases, std::string &licensetext, const char *const scfilename, const char *const pvafilename, const up_options &opts) { const srell::regex re_scline("^[0-9A-Fa-f.]+\\s*;\\s*(\\S+)"); const srell::regex re_pvaline("scx?\\s*;\\s*(\\S.*)\\r?\\n?"); const srell::regex re_split("[ ;]+"); ui_l32 count = 0; std::string data; matchranges_type lines; srell::cmatch cmatch; namenumber_mapper seennames; unishared::read_file(data, scfilename, opts.indir); lines.clear(); split2(lines, data, '\n'); for (matchranges_type::size_type i = 0; i < lines.size(); ++i) { const srell::csub_match &line = lines[i]; if (srell::regex_search(line.first, line.second, cmatch, re_scline, srell::regex_constants::match_continuous)) { const std::string scname(cmatch.str(1)); if (!seennames.count(scname)) { seennames[scname] = count++; } } } if (opts.version >= 300) { seennames["Unknown"] = count++; sn_aliases.resize(count); } typedef std::vector<srell::csub_match> scnames_type; canonicalname_mapper aliases_tmp; scnames_type scnames; data.clear(); unishared::read_file(data, pvafilename, opts.indir); lines.clear(); split2(lines, data, '\n'); matchranges_type::size_type i = read_license(licensetext, lines, 0); for (; i < lines.size(); ++i) { const srell::csub_match &line = lines[i]; if (srell::regex_match(line.first, line.second, cmatch, re_pvaline, srell::regex_constants::match_continuous)) { scnames.clear(); re_split.split(scnames, cmatch[1].first, cmatch[1].second); if (scnames.size() >= 2) { const std::string canonicalname(scnames[1]); if 
(seennames.count(canonicalname)) { std::string aliases(canonicalname); for (scnames_type::size_type i = 0; i < scnames.size(); ++i) { const std::string scname(scnames[i].str()); sn_maps[scname] = canonicalname; if ((opts.version < 300 && i != 1) || (opts.version >= 300 && scname != canonicalname)) { aliases += ':'; aliases += scname; } } if (opts.version >= 300) sn_aliases[seennames[canonicalname]] = aliases; else aliases_tmp[canonicalname] = aliases; } } } } if (opts.version < 300) { for (canonicalname_mapper::const_iterator it = aliases_tmp.begin(); it != aliases_tmp.end(); ++it) sn_aliases.push_back(it->second); } } matchranges_type::size_type read_license(std::string &licensetext, const matchranges_type &lines, matchranges_type::size_type pos) { static const srell::regex re_license("^#[ \\t]*(\\S.*)?$"); srell::cmatch cm; for (; pos < lines.size(); ++pos) { const srell::csub_match &line = lines[pos]; if (srell::regex_search(line.first, line.second, cm, re_license, srell::regex_constants::match_continuous)) { const std::string comment(cm[1].str()); if (comment.size()) licensetext += "// " + comment + '\n'; else { licensetext += "//\n"; break; } } } return pos; } // binary properties created from UnicodeData.txt. void set_additionalbinprops(rangeholder &bp, rangeholder &gc) { ucprange_array assigned_ranges; for (rangeholder::iterator it = gc.begin(); it != gc.end(); ++it) assigned_ranges.merge(it->second); bp["Any"].join(ucprange_helper(0x0000, 0x10ffff)); bp["ASCII"].join(ucprange_helper(0x0000, 0x007f)); bp["Assigned"]; // Only creates. No data. 
// bp["Assigned"] = assigned_ranges; assigned_ranges.negation(); gc["Cn"] = assigned_ranges; } void create_compositecategory(rangeholder &gc, const char *const newname, const char *const *categories) { ucprange_array array; ui_l32 total = 0; array.append_newpair(ucprange_helper(compositeclass, 0)); for (; **categories; ++categories) { const char *const c = *categories; const ui_l32 count = static_cast<ui_l32>(gc[*categories].size()); array.append_newpair(ucprange_helper(c[0], c[1])); array.append_newpair(ucprange_helper(count, 0)); total += count; } array[0].second = total; gc[newname] = array; } void create_compositecategories(rangeholder &gc) { const char *const categoryLC[] = { "Ll", "Lt", "Lu", "" }; const char *const categoryL[] = { "Ll", "Lt", "Lu", "Lm", "Lo", "" }; const char *const categoryM[] = { "Mc", "Me", "Mn", "" }; const char *const categoryN[] = { "Nd", "Nl", "No", "" }; const char *const categoryC[] = { "Cc", "Cf", "Cn", "Co", "Cs", "" }; const char *const categoryP[] = { "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps", "" }; const char *const categoryZ[] = { "Zl", "Zp", "Zs", "" }; const char *const categoryS[] = { "Sc", "Sk", "Sm", "So", "" }; create_compositecategory(gc, "LC", categoryLC); create_compositecategory(gc, "L", categoryL); create_compositecategory(gc, "M", categoryM); create_compositecategory(gc, "N", categoryN); create_compositecategory(gc, "C", categoryC); create_compositecategory(gc, "P", categoryP); create_compositecategory(gc, "Z", categoryZ); create_compositecategory(gc, "S", categoryS); } void read_binprops(rangeholder &bp, std::string &licensetext, const char *const *propdatafiles, const char *const indir) { static const srell::regex re_propfmt("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,}))?\\s*;\\s*([^\\s;#]+)\\s*"); // (#.*)?$"); ucprange range; std::string data; matchranges_type lines; srell::cmatch cmatch; for (; **propdatafiles; ++propdatafiles) { data.clear(); unishared::read_file(data, *propdatafiles, indir); 
lines.clear(); split2(lines, data, '\n'); matchranges_type::size_type i = read_license(licensetext, lines, 0); for (; i < lines.size(); ++i) { const srell::csub_match &line = lines[i]; if (srell::regex_search(line.first, line.second, cmatch, re_propfmt, srell::regex_constants::match_continuous)) { const srell::cmatch::value_type &begin = cmatch[1]; const srell::cmatch::value_type &end = cmatch[2]; const srell::cmatch::value_type &property = cmatch[3]; // const srell::cmatch::value_type &comment = cmatch[4]; range.first = static_cast<ui_l32>(std::strtol(begin.first, NULL, 16)); range.second = end.matched ? static_cast<ui_l32>(std::strtol(end.first, NULL, 16)) : range.first; bp[property.str()].join(range); } } } } void read_emoseq(seqholder &emsq, std::string &licensetext, const char *const *emodatafiles, const char *const indir) { const srell::regex re_emsqfmt("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,})|((?:\\s+[0-9A-Fa-f]{4,})+))?\\s*;\\s*([^\\s;#]+)\\s*"); // (?:\\s*;[^#]*)(#.*)?$"); const srell::regex re_emsq2fmt("\\s*([0-9A-Fa-f]{4,})"); std::string data; matchranges_type lines; srell::cmatch cmatch; for (; **emodatafiles; ++emodatafiles) { data.clear(); unishared::read_file(data, *emodatafiles, indir); lines.clear(); split2(lines, data, '\n'); matchranges_type::size_type i = read_license(licensetext, lines, 0); for (; i < lines.size(); ++i) { const srell::csub_match &line = lines[i]; if (srell::regex_search(line.first, line.second, cmatch, re_emsqfmt, srell::regex_constants::match_continuous)) { const srell::cmatch::value_type &begin = cmatch[1]; const srell::cmatch::value_type &end = cmatch[2]; const srell::cmatch::value_type &seqs = cmatch[3]; const std::string seqname = cmatch[4].str(); // const srell::cmatch::value_type &comment = cmatch[5]; const ui_l32 first = static_cast<ui_l32>(std::strtol(begin.first, NULL, 16)); if (seqs.matched) { const u32array::size_type orgsize = emsq[seqname].size(); srell::cregex_iterator2 it(seqs.first, seqs.second, 
re_emsq2fmt, srell::regex_constants::match_continuous); ui_l32 count = 2; emsq[seqname].push_backncr(0); // Number of code points. emsq[seqname].push_back(first); for (; !it.done(); ++it, ++count) { const srell::cmatch::value_type &ucp = (*it)[1]; emsq[seqname].push_back(static_cast<ui_l32>(std::strtol(ucp.first, NULL, 16))); } emsq[seqname][orgsize] = count; } else { if (end.matched) { emsq[seqname].push_backncr(1); // Range. emsq[seqname].push_back(first); emsq[seqname].push_back(static_cast<ui_l32>(std::strtol(end.first, NULL, 16))); } else { emsq[seqname].push_backncr(2); // Single code point. emsq[seqname].push_back(first); } } } } } for (seqholder::iterator it = emsq.begin(); it != emsq.end(); ++it) { if (it->second.size() & 1) { std::printf("[Info] Padding added to \"%s\" (%u).\n", it->first.c_str(), static_cast<unsigned int>(it->second.size())); it->second.push_backncr(0); } } emsq["RGI_Emoji"].push_backncr(compositeclass); // Dummy data. } void read_scripts(rangeholder &sc, std::string &licensetext, const char *const filename, const char *const indir) { const srell::regex re_scriptdata("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,}))?\\s*;\\s*([^\\s;#]+)\\s*"); // (#.*)?$"); ucprange range; std::string data; matchranges_type lines; srell::cmatch cmatch; ucprange_array assigned_ranges; data.clear(); unishared::read_file(data, filename, indir); lines.clear(); split2(lines, data, '\n'); matchranges_type::size_type i = read_license(licensetext, lines, 0); for (; i < lines.size(); ++i) { const srell::csub_match &line = lines[i]; if (srell::regex_search(line.first, line.second, cmatch, re_scriptdata, srell::regex_constants::match_continuous)) { const srell::cmatch::value_type &begin = cmatch[1]; const srell::cmatch::value_type &end = cmatch[2]; const srell::cmatch::value_type &scriptname = cmatch[3]; // const srell::cmatch::value_type &comment = cmatch[4]; range.first = static_cast<ui_l32>(std::strtol(begin.first, NULL, 16)); range.second = end.matched ? 
static_cast<ui_l32>(std::strtol(end.first, NULL, 16)) : range.first; sc[scriptname].join(range); assigned_ranges.join(range); } } assigned_ranges.negation(); sc["Unknown"] = assigned_ranges; } canonicalname_mapper load_canonicalnames(const char *const *names) { canonicalname_mapper canonicalnames; matchranges_type parts; for (; **names; ++names) { parts.clear(); split2(parts, *names, ':'); const std::string canonicalname(parts[0].str()); for (matchranges_type::size_type i = 0; i < parts.size(); ++i) { canonicalnames[parts[i].str()] = canonicalname; } } return canonicalnames; } void modify_for_scx(rangeholder &scx, const canonicalname_mapper &canonicalnames, std::string &licensetext, const char *const filename, const char *const indir) { const srell::regex re_scxdata("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,}))?\\s*;\\s*([^\\s;#][^;#]*[^\\s;#])\\s*", srell::regex::multiline); // (#.*)?$"); const srell::regex re_space(" "); const std::string name_common("Common"); const std::string name_inherited("Inherited"); ucprange_array common = scx[name_common]; ucprange_array inherited = scx[name_inherited]; ucprange range; std::map<std::string, bool> warning_out; std::string data; matchranges_type lines; srell::cmatch cmatch; unishared::read_file(data, filename, indir); lines.clear(); split2(lines, data, '\n'); matchranges_type::size_type i = read_license(licensetext, lines, 0); for (; i < lines.size(); ++i) { const srell::csub_match &line = lines[i]; if (srell::regex_search(line.first, line.second, cmatch, re_scxdata, srell::regex_constants::match_continuous)) { const srell::cmatch::value_type &begin = cmatch[1]; const srell::cmatch::value_type &end = cmatch[2]; const srell::cmatch::value_type &scxnames = cmatch[3]; // const srell::cmatch::value_type &comment = cmatch[4]; range.first = static_cast<ui_l32>(std::strtol(begin.str().c_str(), NULL, 16)); range.second = end.matched ? 
static_cast<ui_l32>(std::strtol(end.str().c_str(), NULL, 16)) : range.first; common.remove_range(range); inherited.remove_range(range); srell::cregex_iterator2 rei2s(scxnames.first, scxnames.second, re_space); for (rei2s.split_begin();; rei2s.split_next()) { const std::string scriptname(!rei2s.done() ? rei2s.split_range() : rei2s.split_remainder()); if (scriptname.size()) { const canonicalname_mapper::const_iterator it = canonicalnames.find(scriptname); if (it != canonicalnames.end()) scx[it->second].join(range); else { // unishared::throw_error("Canonical name for \"%s\" is not found.", scriptname.c_str()); if (!warning_out.count(scriptname)) { std::printf("[Info] Canonical name for \"%s\" is not found. New script?\n", scriptname.c_str()); warning_out[scriptname] = true; } } } if (rei2s.done()) break; } } } scx[name_common] = common; scx[name_inherited] = inherited; } void combine_properties(sortedrangeholder &base, const rangeholder &addition, const char *const ptype, const char *const *aliasnames) { strings_type aliases; for (; **aliasnames; ++aliasnames) aliases.push_back(std::string(*aliasnames)); return combine_properties(base, addition, ptype, aliases); } void combine_properties(sortedrangeholder &base, const rangeholder &addition, const char *const ptype, const strings_type &aliasnames) { sorted_name_and_ranges elem; matchranges_type names; for (strings_type::size_type i = 0; i < aliasnames.size(); ++i) { const std::string &aliases = aliasnames[i]; bool pdata_found = false; names.clear(); split2(names, aliases, ':'); const std::string canonicalname(names[0].str()); for (matchranges_type::size_type j = 0; j < names.size(); ++j) { const rangeholder::const_iterator it = addition.find(names[j].str()); if (it != addition.end()) { elem.ucpranges = it->second; pdata_found = true; break; } } if (!pdata_found) unishared::throw_error("No property value for \"%s\" found.", aliases.c_str()); elem.ptype = ptype; elem.canonicalname = canonicalname; elem.namealiases = 
aliases; base.push_back(elem); } } #if !defined(SRELL_NO_VMODE) void combine_pos(sortedseqholder &base, const seqholder &addition, const char *const ptype, const char *const *aliasnames) { ui_l32 total = 0; sorted_name_and_seqs elem; matchranges_type names; u32array compclass; // Composite class. compclass.push_backncr(compositeclass); compclass.push_backncr(0); elem.ptype = ptype; for (; **aliasnames; ++aliasnames) { const std::string aliases(*aliasnames); bool pdata_found = false; names.clear(); split2(names, aliases, ':'); const std::string canonicalname(names[0].str()); for (strings_type::size_type i = 0; i < names.size(); ++i) { const seqholder::const_iterator it = addition.find(names[i].str()); if (it != addition.end()) { elem.ucpseqs = it->second; pdata_found = true; if (elem.ucpseqs.size() != 1 || elem.ucpseqs[0] != compositeclass) { compclass.push_back(elem.ucpseqs.size()); total += static_cast<ui_l32>(elem.ucpseqs.size()); } break; } } if (!pdata_found) unishared::throw_error("No property value for \"%s\" found.", aliases.c_str()); elem.canonicalname = canonicalname; elem.namealiases = aliases; base.push_back(elem); } // Composite class. compclass[1] = total; base[0].ucpseqs = compclass; // [0] = RGI_Emoji. } #endif // !defined(SRELL_NO_VMODE) name_mapper create_ptype_mappings() { name_mapper categories; categories["gc"] = "general_category"; categories["bp"] = "binary"; categories["sc"] = "script"; categories["scx"] = "script_extensions"; return categories; } std::string create_ptypes(const name_mapper &ptypes, const int version) { std::string ptypedef(version >= 300 ? "" : (version >= 201 ? "\tuptype_unknown = 0,\n" : "\tstruct ptype\n\t{\n\t\tstatic const T2 unknown = 0;\n")); const char *names[] = { "bp", "gc", "sc", "scx", "" }; const std::string t2head = version >= 201 ? "\t" : "\t\tstatic const T2 "; const std::string t2tail = version >= 201 ? "," : ";"; const std::string t2finaltail = version >= 201 ? 
"" : ";"; const std::string t2prefix = version >= 201 ? "uptype_" : ""; for (unsigned int i = 0; *names[i];) { const char *const name = names[i]; const name_mapper::const_iterator it = ptypes.find(name); if (it == ptypes.end()) unishared::throw_error("Name for ptype \"%s\" is not found.", name); ptypedef += t2head + t2prefix + (version >= 300 ? name : it->second) + " = " + unishared::to_string(++i) + t2tail + "\n"; } if (version >= 300) { } else if (version >= 201) { drop_finalcomma(ptypedef); } else ptypedef += "\t};\n"; return ptypedef; } std::string ranges_to_string(const ucprange_array &array, const std::string &indent, const bool compositeclass) { std::string rangestring(indent); if (compositeclass) { rangestring += "// "; for (ucprange_array::size_type i = 1; i < array.size(); ++i) { const ucprange &range = array[i]; if (i > 1) rangestring += " + "; rangestring += static_cast<char>(range.first); rangestring += static_cast<char>(range.second); rangestring += ':' + unishared::to_string(array[++i].first); } } else { unsigned count = 0; for (ucprange_array::size_type i = 0; i < array.size(); ++i) { const ucprange &range = array[i]; if (count == 4) { count = 0; rangestring += '\n' + indent; } else if (count) { rangestring += ' '; } rangestring += "0x" + unishared::to_string(range.first, 16, 4) + ", 0x" + unishared::to_string(range.second, 16, 4) + ','; ++count; } } return rangestring; } #if !defined(SRELL_NO_VMODE) std::string seqs_to_string(const u32array &array, const std::string &indent) { std::string seqstring; if (array.size() == 1 && array[0] == compositeclass) { } else { for (u32array::size_type i = 0; i < array.size();) { const ui_l32 num = array[i]; if (num == compositeclass) { break; } if (num == 0) // Padding. 
{ seqstring += indent + "0,\t// Padding.\n"; break; } if (++i == array.size()) unishared::throw_error("[InternalError] No data follows %u.", num); seqstring += indent + unishared::to_string(num); seqstring += ", 0x" + unishared::to_string(array[i++], 16, 4); if (num == 1) // Range. { if (i == array.size()) unishared::throw_error("[InternalError] No pair for %.4lX.", array[i - 1]); seqstring += ", 0x" + unishared::to_string(array[i++], 16, 4); } else { for (ui_l32 j = 2; j < num; ++j) { if (i == array.size()) unishared::throw_error("[InternalError] Broken after %.4lX.", array[i - 1]); seqstring += ", 0x" + unishared::to_string(array[i++], 16, 4); } } seqstring += ",\n"; } if (seqstring.size()) seqstring.resize(seqstring.size() - 1); } return seqstring; } #endif // !defined(SRELL_NO_VMODE) void drop_finalcomma(std::string &data) { std::string::size_type commapos = data.rfind(','); if (commapos != std::string::npos) data.erase(commapos, 1); } std::string create_pnametable(ui_l32 &count, const int version, const std::string &indent) { const char *const *pnames = updata::property_names; std::string out; if (version >= 300) { namenumber_mapper categories; count = 0u; for (unsigned int i = 2; **pnames; ++pnames, ++i) { const std::string names(*pnames); srell::sregex_iterator2 rei2(names, re_colon_); for (rei2.split_begin();; rei2.split_next()) { const std::string name(!rei2.done() ? 
rei2.split_range() : rei2.split_remainder()); categories[name] = i; ++count; if (rei2.done()) break; } } out.assign(indent + "{ \"\", " + unishared::to_string(count) + " },\n"); for (namenumber_mapper::const_iterator it = categories.begin(); it != categories.end(); ++it) { out.append(indent); out.append("{ \""); #if !defined(NO_LITERAL_ESCAPING) out.append(escape_string(it->first)); #else out.append(it->first); #endif out.append("\", " + unishared::to_string(it->second) + " },\n"); } } else { out.append(indent + "\"*\",\t// #0:unknown\n"); out.append(indent + "\"*\",\t// #1:binary\n"); for (unsigned int i = 2; **pnames; ++pnames, ++i) { out.append(indent); out.append(1, '"'); out.append(*pnames); out.append("\",\t// #" + unishared::to_string(i) + '\n'); } out.append(indent + "\"\"\n"); } return out; } std::string join_dropcomma_append(const strings_type &s, const std::string &return_table) { std::string tmp(join('\n', s, true)); drop_finalcomma(tmp); tmp.append(return_table); return tmp; } void do_formatting(std::string &out, const sortedrangeholder &alldata, const sortedseqholder &emsq, const int version) { const std::size_t numofproperties = sizeof (updata::property_names) / sizeof (updata::property_names[0]) + 1; const std::string template1(version >= 300 ? "template <typename T3, typename T4, typename T5>\n" : (version >= 201 ? "template <typename T3, typename T4, typename T5, typename T6>\n" : "template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>\n")); const std::string template2(version >= 300 ? "unicode_property_data<T3, T4, T5>::" : (version >= 201 ? "unicode_property_data<T3, T4, T5, T6>::" : "unicode_property_data<T1, T2, T3, T4, T5, T6>::")); const std::string return_table(version == 100 ? "\t\t};\n\t\treturn table;\n\t}\n" : "};\n"); const std::string indent(version == 100 ? 
"\t\t\t" : "\t"); name_mapper ptype_mappings(create_ptype_mappings()); const std::string ptypes(create_ptypes(ptype_mappings, version)); // T2, property types. const std::string t1head = version >= 201 ? "\t" : "\tstatic const T1 "; const std::string t1tail = version >= 201 ? "," : ";"; const std::string t1finaltail = version >= 201 ? "" : ";"; const std::string t1prefix = version >= 201 ? "upid_" : ""; const std::string t2scope = version >= 201 ? "{ uptype_" : "{ ptype::"; const std::string maxorlast = version >= 200 ? "max" : "last"; const ui_l32 pno_base = version >= 300 ? numofproperties : 1u; ui_l32 offset = 0u; ui_l32 property_number = pno_base; ui_l32 property_id_number = pno_base; std::string pnumbers(t1head + t1prefix + "unknown = 0" + t1tail + "\n"); // T1, property numbers. strings_type rangetable; strings_type lookup_ranges; std::string lookup_numbers; namenumber_mapper rangeno_map; if (version >= 300) { pnumbers += t1head + t1prefix + "invalid = 0" + t1tail + "\n"; pnumbers += t1head + t1prefix + "error = 0" + t1tail + "\n"; pnumbers += ptypes; } do_formatting2(rangeno_map, lookup_numbers, lookup_ranges, rangetable, pnumbers, property_id_number, property_number, offset, pno_base, maxorlast, t2scope, t1prefix, t1finaltail, t1tail, t1head, ptype_mappings, indent, alldata, emsq, version); ui_l32 basepos = 0u; std::string pnames(create_pnametable(basepos, version, indent)); if (version >= 300) { u32pair posinfo[numofproperties]; sort_rangeno_table(posinfo, basepos, lookup_numbers, rangeno_map, indent); lookup_numbers.append(return_table); merge_posinfo(lookup_ranges, posinfo, numofproperties, indent); } else if (version >= 200) { lookup_numbers.append(indent + t2scope + "unknown, 0, \"\" }\n"); lookup_numbers.append(return_table); lookup_numbers.insert(0, template1 + "const T5 " + template2 + "rangenumbertable[] =\n{\n\t" + t2scope + "unknown, 0, \"*\" },\t// #0\n"); } else { lookup_numbers.append(indent + t2scope + "unknown, \"\", 0 }\n"); 
lookup_numbers.append(return_table); lookup_numbers.insert(0, version == 100 ? "\tstatic const T5 *rangenumber_table()\n\t{\n\t\tstatic const T5 table[] =\n\t\t{\n\t\t\t" + t2scope + "unknown, \"*\", 0 },\t// #0\n" : template1 + "const T5 " + template2 + "rangenumbertable[] =\n{\n\t" + t2scope + "unknown, \"*\", 0 },\t// #0\n"); } pnames.insert(0, version == 100 ? "\tstatic const T3 *propertyname_table()\n\t{\n\t\tstatic const T3 table[] =\n\t\t{\n" : template1 + "const T3 " + template2 + (version >= 300 ? "propertynumbertable" : "propertynametable") + "[] =\n{\n"); if (version < 300) pnames.append(return_table); if (version >= 201) { out.append("enum upid_type\n{\n"); out.append(pnumbers); // T1 out.append("};\n\n"); if (version < 300) { out.append("enum up_type\n{\n"); out.append(ptypes); out.append("};\n\n"); } out.append(template1 + "struct unicode_property_data\n{\n"); } else { out.append(template1 + "struct unicode_property_data\n{\n"); out.append(pnumbers); out.append(ptypes); } if (version == 100) { out.append(pnames); out.append(std::string("\tstatic const T4 *ranges()\n\t{\n\t\tstatic const T4 table[] =\n\t\t{\n")); out.append(join_dropcomma_append(rangetable, return_table)); out.append(lookup_numbers); out.append(std::string("\tstatic const T6 *position_table()\n\t{\n\t\tstatic const T6 table[] =\n\t\t{\n\t\t\t{ 0, 0 },\t// #0 unknown\n")); out.append(join_dropcomma_append(lookup_ranges, return_table)); out.append("};\n"); } else { if (version >= 300) { out.append("\tstatic const T3 propertynumbertable[];\n"); out.append("\tstatic const T4 positiontable[];\n"); out.append("\tstatic const T5 rangetable[];\n"); } else { out.append("\tstatic const T3 propertynametable[];\n"); out.append("\tstatic const T4 rangetable[];\n"); out.append("\tstatic const T5 rangenumbertable[];\n"); out.append("\tstatic const T6 positiontable[];\n"); } if (version <= 200) { out.append("\n\tstatic const T3 *propertyname_table()\n\t{\n\t\treturn propertynametable;\n\t}\n"); 
out.append("\tstatic const T4 *ranges()\n\t{\n\t\treturn rangetable;\n\t}\n"); out.append("\tstatic const T5 *rangenumber_table()\n\t{\n\t\treturn rangenumbertable;\n\t}\n"); out.append("\tstatic const T6 *position_table()\n\t{\n\t\treturn positiontable;\n\t}\n"); } out.append("};\n\n"); out.append(pnames); // T3 if (version < 300) { out.append("\n"); out.append(template1 + "const T4 " + template2 + "rangetable[] =\n{\n"); out.append(join_dropcomma_append(rangetable, return_table)); // T4 out.append("\n"); } out.append(lookup_numbers); // T5 out.append("\n"); out.append(template1 + (version >= 300 ? "const T4 " : "const T6 ") + template2 + "positiontable[] =\n{\n\t{ 0, 0 },\t// #0 unknown\n"); out.append(join_dropcomma_append(lookup_ranges, return_table)); // T6 if (version >= 300) { out.append("\n"); out.append(template1 + "const T5 " + template2 + "rangetable[] =\n{\n"); out.append(join_dropcomma_append(rangetable, return_table)); // T4 } } if (version > 100) out.append("#define SRELL_UPDATA_VERSION " + unishared::to_string(static_cast<unsigned int>(version)) + "\n"); } void do_formatting2( namenumber_mapper &rangeno_map, std::string &lookup_numbers, strings_type &lookup_ranges, strings_type &rangetable, std::string &pnumbers, ui_l32 &property_id_number, ui_l32 &property_number, ui_l32 &offset, const ui_l32 pno_base, const std::string &maxorlast, const std::string &t2scope, const std::string &t1prefix, const std::string &t1finaltail, const std::string &t1tail, const std::string &t1head, name_mapper &ptype_mappings, const std::string &indent, const sortedrangeholder &alldata, const sortedseqholder &emsq, const int version) { namenumber_mapper registered; srell::re_detail::simple_array<ucprange> rangepos; srell::sregex_iterator2 rei2; for (sortedrangeholder::size_type i = 0; i < alldata.size(); ++i) { const sorted_name_and_ranges &elem = alldata[i]; const std::string ptype = elem.ptype; const std::string name = elem.canonicalname; const std::string aliases = 
elem.namealiases; const ucprange_array &array = elem.ucpranges; const std::string pnumber_keyname(ptype + '_' + name); const std::string position_comment(' ' + ptype + '=' + aliases); const bool compositeclass_found = array.size() && array[0].first == compositeclass; std::string rangestring(ranges_to_string(array, indent, compositeclass_found)); ui_l32 numofranges = static_cast<ui_l32 >(array.size()); ui_l32 pno = property_number; const namenumber_mapper::const_iterator rit = registered.find(rangestring); if (rit != registered.end()) { pno = rit->second; lookup_ranges[pno - pno_base] += position_comment; rangetable[(pno - pno_base) * 2] += position_comment; if (version >= 300) { rei2.assign(aliases, re_colon_); for (rei2.split_begin();; rei2.split_next()) { const std::string alias(!rei2.done() ? rei2.split_range() : rei2.split_remainder()); rangeno_map[ptype + ':' + alias] = pno; if (rei2.done()) break; } } else if (version >= 200) { lookup_numbers.append(indent + t2scope + ptype_mappings[ptype] + ", " + unishared::to_string(pno) + ", \"" + aliases + "\" },\t// #" + unishared::to_string(property_id_number) + "\n"); } else lookup_numbers.append(indent + t2scope + ptype_mappings[ptype] + ", \"" + aliases + "\", " + unishared::to_string(pno) + " },\t// #" + unishared::to_string(property_id_number) + "\n"); } else { // ucpranges of "Assigned" is empty. if (compositeclass_found) { std::printf("[Info] Composite property \"%s\" found.\n", aliases.c_str()); numofranges = array[0].second; } else { registered[rangestring] = property_number; } if (version >= 300) { rei2.assign(aliases, re_colon_); for (rei2.split_begin();; rei2.split_next()) { const std::string alias(!rei2.done() ? 
rei2.split_range() : rei2.split_remainder()); rangeno_map[ptype + ':' + alias] = property_number; if (rei2.done()) break; } } else if (version >= 200) { lookup_numbers.append(indent + t2scope + ptype_mappings[ptype] + ", " + unishared::to_string(property_number) + ", \"" + aliases + "\" },\t// #" + unishared::to_string(property_id_number) + "\n"); } else lookup_numbers.append(indent + t2scope + ptype_mappings[ptype] + ", \"" + aliases + "\", " + unishared::to_string(property_number) + " },\t// #" + unishared::to_string(property_id_number) + "\n"); lookup_ranges.push_back(indent + "{ " + unishared::to_string(offset) + ", " + unishared::to_string(numofranges) + " },\t// #" + unishared::to_string(pno) + position_comment); rangetable.push_back(indent + "// #" + unishared::to_string(pno) + " (" + unishared::to_string(offset) + '+' + unishared::to_string(numofranges) + "):" + position_comment); rangetable.push_back(rangestring); rangepos.push_back(ucprange_helper(offset, numofranges)); if (!compositeclass_found) offset += numofranges; ++property_number; } if (version >= 300) pnumbers.append(t1head + pnumber_keyname + " = " + unishared::to_string(pno) + t1tail + (pno != property_id_number ? 
("\t// #" + unishared::to_string(property_id_number)) : "") + '\n'); else pnumbers.append(t1head + pnumber_keyname + " = " + unishared::to_string(property_id_number) + t1tail + "\t// #" + unishared::to_string(pno) + '\n'); ++property_id_number; } pnumbers.append(t1head + t1prefix + maxorlast + "_property_number = " + unishared::to_string(property_number - 1) + t1tail + "\n"); #if !defined(SRELL_NO_VMODE) if (rangetable.size()) drop_finalcomma(rangetable[rangetable.size() - 1]); rangetable.push_back("#if !defined(SRELL_NO_UNICODE_POS)\n" + indent + ","); if (version < 300) lookup_numbers.append("#if !defined(SRELL_NO_UNICODE_POS)\n"); for (sortedseqholder::size_type i = 0; i < emsq.size(); ++i) { const sorted_name_and_seqs &elem = emsq[i]; const std::string ptype = elem.ptype; const std::string name = elem.canonicalname; const std::string aliases = elem.namealiases; const u32array &array = elem.ucpseqs; const bool compositeclass_found = array.size() && array[0] == compositeclass; const std::string pnumber_keyname(ptype + '_' + name); const std::string position_comment(' ' + ptype + '=' + aliases); ui_l32 numofseqs = static_cast<ui_l32>(array.size()); std::string seqstring; if (compositeclass_found) { std::printf("[Info] Composite property \"%s\" found.\n", aliases.c_str()); numofseqs = array[1]; seqstring = indent + "// "; for (u32array::size_type i = 2; i < array.size(); ++i) { if (i > 2) seqstring += " + "; seqstring += unishared::to_string(array[i]) + "/2"; } } else { seqstring = seqs_to_string(array, indent); } const ui_l32 numofranges = numofseqs / 2; if (version >= 300) pnumbers.append(t1head + pnumber_keyname + " = " + unishared::to_string(property_number) + t1tail + "\t// #" + unishared::to_string(property_id_number) + '\n'); else pnumbers.append(t1head + pnumber_keyname + " = " + unishared::to_string(property_id_number) + t1tail + "\t// #" + unishared::to_string(property_number) + '\n'); if (version >= 300) { rei2.assign(aliases, re_colon_); for 
(rei2.split_begin();; rei2.split_next()) { const std::string alias(!rei2.done() ? rei2.split_range() : rei2.split_remainder()); rangeno_map[ptype + ':' + aliases] = property_number; if (rei2.done()) break; } } else if (version >= 200) { lookup_numbers.append(indent + t2scope + ptype_mappings[ptype] + ", " + unishared::to_string(property_number) + ", \"" + aliases + "\" },\t// #" + unishared::to_string(property_id_number) + "\n"); } else lookup_numbers.append(indent + t2scope + ptype_mappings[ptype] + ", \"" + aliases + "\", " + unishared::to_string(property_number) + " },\t// #" + unishared::to_string(property_id_number) + "\n"); lookup_ranges.push_back(indent + "{ " + unishared::to_string(offset) + ", " + unishared::to_string(numofranges) + " },\t// #" + unishared::to_string(property_number) + position_comment); rangetable.push_back(indent + "// #" + unishared::to_string(property_number) + " (" + unishared::to_string(offset) + '+' + unishared::to_string(numofseqs) + "/2):" + position_comment); rangetable.push_back(seqstring); ++property_number; ++property_id_number; if (!compositeclass_found) offset += numofranges; } pnumbers.append(t1head + t1prefix + maxorlast + "_pos_number = " + unishared::to_string(--property_number) + t1finaltail + "\n"); rangetable.push_back("#endif\t// !defined(SRELL_NO_UNICODE_POS)"); if (version < 300) lookup_numbers.append("#endif\t// !defined(SRELL_NO_UNICODE_POS)\n"); #endif // !defined(SRELL_NO_VMODE) } void sort_rangeno_table(u32pair *const posinfo, ui_l32 offset, std::string &lookup_numbers, const namenumber_mapper &rangeno_map, const std::string &indent) { typedef std::vector<srell::ssub_match> names_type; names_type names; name_mapper pvalues; namenumber_mapper pcounts; for (namenumber_mapper::const_iterator it = rangeno_map.begin(); it != rangeno_map.end(); ++it) { names.clear(); re_colon_.split(names, it->first, 2); if (names.size() == 2) { const std::string pname(names[0].str()); const std::string pvalue(names[1].str()); #if 
!defined(NO_LITERAL_ESCAPING) pvalues[pname] += indent + "{ \"" + escape_string(pvalue) + "\", " + unishared::to_string(it->second) + " },\n"; #else pvalues[pname] += indent + "{ \"" + pvalue + "\", " + unishared::to_string(it->second) + " },\n"; #endif ++pcounts[pname]; } } offset += set_pvalue_and_count(lookup_numbers, posinfo[2], "gc", offset, pcounts, pvalues, indent); offset += set_pvalue_and_count(lookup_numbers, posinfo[1], "bp", offset, pcounts, pvalues, indent); offset += set_pvalue_and_count(lookup_numbers, posinfo[3], "sc", offset, pcounts, pvalues, indent); offset += set_pvalue_and_count(lookup_numbers, posinfo[4], "scx", offset, pcounts, pvalues, indent); drop_finalcomma(lookup_numbers); } ui_l32 set_pvalue_and_count(std::string &lookup_numbers, u32pair &posinfo, const std::string category, const ui_l32 offset, namenumber_mapper &pcounts, name_mapper &pvalues, const std::string &indent) { lookup_numbers.append(indent + "// " + category + ": " + unishared::to_string(pcounts[category]) + "\n" + pvalues[category]); posinfo.set(offset, pcounts[category]); return posinfo.second; } void merge_posinfo(strings_type &lookup_ranges, const u32pair *const posinfo, const std::size_t numofproperties, const std::string &indent) { for (std::size_t i = 1; i < numofproperties; ++i) { const u32pair &pair = posinfo[i]; const std::string line(indent + "{ " + unishared::to_string(pair.first) + ", " + unishared::to_string(pair.second) + " },\t// #" + unishared::to_string(i) + ' ' + (i == 1 ? 
"binary" : updata::property_names[i - 2])); lookup_ranges.insert(lookup_ranges.begin() + i - 1, line); } } std::string escape_string(const std::string &s) { static const char hex[] = "0123456789ABCDEF"; std::string out; for (std::string::size_type i = 0; i < s.size(); ++i) { out.append("\\x"); out.append(1, hex[(s[i] >> 4) & 15]); out.append(1, hex[s[i] & 15]); } return out; } srell::regex re_colon_; }; // class unicode_property int main(const int argc, const char *const *const argv) { up_options upopts(argc, argv); std::string outdata; unicode_property up; int errorno = up.create_updata(outdata, upopts); if (errorno == 0) { if (!unishared::write_file(upopts.outfilename, outdata)) errorno = 2; } return errorno; }