Mercurial > minori
diff dep/anitomy/dep/srell/unicode/updataout3.cpp @ 347:a0aa8c8c4307
dep/anitomy: port to use UCS-4 rather than wide strings
rationale: wide strings are not the same on every platform, and
might not even be Unicode. (while they usually are, its possible
that they are not)
I was *going* to change StringToInt to use a string stream, but
outputting to an integer doesn't seem to work at all with UCS-4,
even though it ought to, so I just rolled my own that uses the
arabic digits only.
author | Paper <paper@paper.us.eu.org> |
---|---|
date | Sun, 23 Jun 2024 10:32:09 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dep/anitomy/dep/srell/unicode/updataout3.cpp Sun Jun 23 10:32:09 2024 -0400 @@ -0,0 +1,1689 @@ +// +// updataout.cpp: version 3.002 (2023/12/29). +// +// This is a program that generates srell_updata3.h from: +// DerivedCoreProperties.txt +// DerivedNormalizationProps.txt +// PropList.txt +// PropertyValueAliases.txt +// ScriptExtensions.txt +// Scripts.txt +// UnicodeData.txt +// emoji-data.txt +// emoji-sequences.txt +// emoji-zwj-sequences.txt +// provided by the Unicode Consortium. The latese versions of them are +// available at: +// emoji-data.txt: http://www.unicode.org/Public/UNIDATA/emoji/ +// emoji-sequences.txt and emoji-zwj-sequences.txt: +// http://www.unicode.org/Public/emoji/ +// others: http://www.unicode.org/Public/UNIDATA/ +// + +#include <cstdio> +#include <cstdlib> +#include <cstdarg> +#include <string> +#include <vector> +#include <map> +#include <stdexcept> +#include <algorithm> // For std::swap in C++98/03 +#include <utility> // For std::swap in C++11- +#define SRELL_NO_UNICODE_DATA +#include "../srell.hpp" + +#if defined(_MSC_VER) && _MSC_VER >= 1400 +#pragma warning(disable:4996) +#endif + +namespace updata +{ +static const char *const property_names[] = { // 3 + "General_Category:gc", "Script:sc", "Script_Extensions:scx", "" +}; +static const char *const binary_property_names[] = { // 53 (52+1) + // *1: http://unicode.org/reports/tr18/#General_Category_Property + // *2: 9th field in UnicodeData.txt + "ASCII", // *1 + "ASCII_Hex_Digit:AHex", // PropList.txt + "Alphabetic:Alpha", // DerivedCoreProperties.txt + "Any", // *1 + "Assigned", // *1 + "Bidi_Control:Bidi_C", // PropList.txt + "Bidi_Mirrored:Bidi_M", // *2 + "Case_Ignorable:CI", // DerivedCoreProperties.txt + "Cased", // DerivedCoreProperties.txt + "Changes_When_Casefolded:CWCF", // DerivedCoreProperties.txt + "Changes_When_Casemapped:CWCM", // DerivedCoreProperties.txt + "Changes_When_Lowercased:CWL", // DerivedCoreProperties.txt + "Changes_When_NFKC_Casefolded:CWKCF", // DerivedNormalizationProps.txt + "Changes_When_Titlecased:CWT", // DerivedCoreProperties.txt + "Changes_When_Uppercased:CWU", // DerivedCoreProperties.txt + "Dash", // PropList.txt + "Default_Ignorable_Code_Point:DI", // DerivedCoreProperties.txt + "Deprecated:Dep", // PropList.txt + "Diacritic:Dia", // PropList.txt + "Emoji", // emoji-data.txt + "Emoji_Component:EComp", // emoji-data.txt + "Emoji_Modifier:EMod", // emoji-data.txt + "Emoji_Modifier_Base:EBase", // emoji-data.txt + "Emoji_Presentation:EPres", // emoji-data.txt + "Extended_Pictographic:ExtPict", // emoji-data.txt + "Extender:Ext", // PropList.txt + "Grapheme_Base:Gr_Base", // DerivedCoreProperties.txt + "Grapheme_Extend:Gr_Ext", // DerivedCoreProperties.txt + "Hex_Digit:Hex", // PropList.txt + "IDS_Binary_Operator:IDSB", // PropList.txt + "IDS_Trinary_Operator:IDST", // PropList.txt + "ID_Continue:IDC", // DerivedCoreProperties.txt + "ID_Start:IDS", // DerivedCoreProperties.txt + "Ideographic:Ideo", // PropList.txt + "Join_Control:Join_C", // PropList.txt + "Logical_Order_Exception:LOE", // PropList.txt + "Lowercase:Lower", // DerivedCoreProperties.txt + "Math", // DerivedCoreProperties.txt + "Noncharacter_Code_Point:NChar", // PropList.txt + "Pattern_Syntax:Pat_Syn", // PropList.txt + "Pattern_White_Space:Pat_WS", // PropList.txt + "Quotation_Mark:QMark", // PropList.txt + "Radical", // PropList.txt + "Regional_Indicator:RI", // PropList.txt + "Sentence_Terminal:STerm", // PropList.txt + "Soft_Dotted:SD", // PropList.txt + "Terminal_Punctuation:Term", // PropList.txt + "Unified_Ideograph:UIdeo", // PropList.txt + "Uppercase:Upper", // DerivedCoreProperties.txt + "Variation_Selector:VS", // PropList.txt + "White_Space:space", // PropList.txt + "XID_Continue:XIDC", // DerivedCoreProperties.txt + "XID_Start:XIDS", // DerivedCoreProperties.txt + // ECMAScript 2019/Unicode 11: + // "Extended_Pictographic:ExtPict", + // ECMAScript 2021/Unicode 13: + // Aliases: EComp, EMod, EBase, EPres, and ExtPict + "" +}; +static const char *const emoseq_property_names[] = { + "RGI_Emoji", + "Basic_Emoji", // emoji-sequences.txt + "Emoji_Keycap_Sequence", // emoji-sequences.txt + "RGI_Emoji_Modifier_Sequence", // emoji-sequences.txt + "RGI_Emoji_Flag_Sequence", // emoji-sequences.txt + "RGI_Emoji_Tag_Sequence", // emoji-sequences.txt + "RGI_Emoji_ZWJ_Sequence", // emoji-zwj-sequences.txt + "" +}; +static const char *const gc_values[] = { // 38 + "Other:C", "Control:Cc:cntrl", "Format:Cf", "Unassigned:Cn", + "Private_Use:Co", "Surrogate:Cs", "Letter:L", "Cased_Letter:LC", + "Lowercase_Letter:Ll", "Titlecase_Letter:Lt", "Uppercase_Letter:Lu", "Modifier_Letter:Lm", + "Other_Letter:Lo", "Mark:M:Combining_Mark", "Spacing_Mark:Mc", "Enclosing_Mark:Me", + "Nonspacing_Mark:Mn", "Number:N", "Decimal_Number:Nd:digit", "Letter_Number:Nl", + "Other_Number:No", "Punctuation:P:punct", "Connector_Punctuation:Pc", "Dash_Punctuation:Pd", + "Close_Punctuation:Pe", "Final_Punctuation:Pf", "Initial_Punctuation:Pi", "Other_Punctuation:Po", + "Open_Punctuation:Ps", "Symbol:S", "Currency_Symbol:Sc", "Modifier_Symbol:Sk", + "Math_Symbol:Sm", "Other_Symbol:So", "Separator:Z", "Line_Separator:Zl", + "Paragraph_Separator:Zp", "Space_Separator:Zs", "" +}; +} // namespace updata + +namespace unishared +{ +template <typename T> +std::string to_string(T value, int radix = 10, const int precision = 1) +{ + std::string num; + + if (radix >= 2 && radix <= 16) + { + typedef typename std::string::size_type size_type; + const bool minus = value < 0 ? (value = 0 - value, true) : false; + + for (; value; value /= radix) + num.push_back("0123456789ABCDEF"[value % radix]); + + if (precision > 0 && num.size() < static_cast<size_type>(precision)) + num.append(static_cast<size_type>(precision) - num.size(), static_cast<char>('0')); + + if (minus) + num.push_back(static_cast<char>('-')); + + const size_type mid = num.size() / 2; + + for (size_type i = 0; i < mid; ++i) + std::swap(num[i], num[num.size() - i - 1]); + } + return num; +} + +void throw_error(const char *const s, ...) +{ + char buffer[256]; + + va_list va; + va_start(va, s); + std::vsprintf(buffer, s, va); + va_end(va); + throw std::runtime_error(buffer); +} + +void read_file(std::string &str, const char *const filename, const char *const dir) +{ + const std::string path(std::string(dir ? dir : "") + filename); + FILE *const fp = std::fopen(path.c_str(), "r"); + + std::fprintf(stdout, "Reading '%s'... ", path.c_str()); + + if (fp) + { + static const std::size_t bufsize = 4096; + char *const buffer = static_cast<char *>(std::malloc(bufsize)); + + if (buffer) + { + for (;;) + { + const std::size_t size = std::fread(buffer, 1, bufsize, fp); + + if (!size) + break; + + str.append(buffer, size); + } + std::fclose(fp); + std::fputs("done.\n", stdout); + std::free(buffer); + return; + } + } + std::fputs("failed...", stdout); + throw_error("could not open!"); +} + +bool write_file(const char *const filename, const std::string &str) +{ + FILE *const fp = std::fopen(filename, "wb"); + + std::fprintf(stdout, "Writing '%s'... ", filename); + + if (fp) + { + const bool success = std::fwrite(str.c_str(), 1, str.size(), fp) == str.size(); + std::fclose(fp); + if (success) + { + std::fputs("done.\n", stdout); + return true; + } + } + std::fputs("failed...\n", stdout); + return false; +} +} // namespace unishared + +struct up_options +{ + const char *outfilename; + const char *indir; + int version; + int errorno; + + up_options(const int argc, const char *const *const argv) + : outfilename("srell_updata3.h") + , indir("") + , version(301) + , errorno(0) + { + for (int index = 1; index < argc; ++index) + { + const char firstchar = argv[index][0]; + + if (firstchar == '-' || firstchar == '/') + { + const char *const option = argv[index] + 1; + + if (std::strcmp(option, "o") == 0) + { + if (index + 1 >= argc) + goto NO_ARGUMENT; + outfilename = argv[++index]; + } + else if (std::strcmp(option, "v") == 0) + { + if (index + 1 >= argc) + goto NO_ARGUMENT; + version = static_cast<int>(std::strtod(argv[++index], NULL) * 100.0 + 0.5); + } + else if (std::strcmp(option, "i") == 0 || std::strcmp(option, "id") == 0) + { + if (index + 1 >= argc) + goto NO_ARGUMENT; + indir = argv[++index]; + } + else if (std::strcmp(option, "?") == 0 || std::strcmp(option, "h") == 0) + { + std::fputs("Usage: updataout2 [options]\nOptions:\n", stdout); + std::fputs(" -i <DIRECTORY>\tSame as -id.\n", stdout); + std::fputs(" -id <DIRECTORY>\tAssume that input files exist in <DIRECTORY>.\n\t\t\t<DIRECTORY> must ends with '/' or '\\'.\n", stdout); + std::fputs(" -o <FILE>\t\tOutput to <FILE>.\n", stdout); +// std::fputs(" -v <VERNO>\t\tOutput in the version VERNO format.\n", stdout); + errorno = 1; + return; + } + else + goto UNKNOWN_OPTION; + + continue; + + NO_ARGUMENT: + std::fprintf(stdout, "[Error] no argument for \"%s\" specified.\n", argv[index]); + errorno = -2; + } + else + { + UNKNOWN_OPTION: + std::fprintf(stdout, "[Error] unknown option \"%s\" found.\n", argv[index]); + errorno = -1; + } + } + } +}; +// struct up_options + +class unicode_property +{ +public: + + unicode_property() + : re_colon_(":") + { + } + + int create_updata(std::string &outdata, const up_options &opts) + { + int errorno = opts.errorno; + const char *const unidatafilename = "UnicodeData.txt"; + const char *const propdatafiles[] = { "PropList.txt", "DerivedCoreProperties.txt", "emoji-data.txt", "DerivedNormalizationProps.txt", "" }; + const char *const emodatafiles[] = { "emoji-sequences.txt", "emoji-zwj-sequences.txt", "" }; + const char *const scfilename = "Scripts.txt"; + const char *const scxfilename = "ScriptExtensions.txt"; + const char *const pvafilename = "PropertyValueAliases.txt"; + canonicalname_mapper scriptname_maps; + strings_type scriptname_aliases; + std::string licensetext; + rangeholder general_category_values; + rangeholder binary_properties; + seqholder emoseq_properties; + rangeholder scripts; + rangeholder scriptextensions; + sortedrangeholder combined_properties; + sortedseqholder combined_pos; +// scriptnameholder ucs_to_scriptname; // codepoint->scriptname. + + if (errorno) + return errorno; + + try + { + licensetext = "// "; + licensetext += unidatafilename; + licensetext += "\n//\n"; + + read_unidata(general_category_values, binary_properties, unidatafilename, opts.indir); + set_additionalbinprops(binary_properties, general_category_values); // for ASCII, Any, Cn. + create_compositecategories(general_category_values); // This needs "Cn". + + read_binprops(binary_properties, licensetext, propdatafiles, opts.indir); +#if !defined(SRELL_NO_VMODE) + read_emoseq(emoseq_properties, licensetext, emodatafiles, opts.indir); +#endif + + read_scriptnames(scriptname_maps, scriptname_aliases, licensetext, scfilename, pvafilename, opts); + + read_scripts(scripts, licensetext, scfilename, opts.indir); + + scriptextensions = scripts; + modify_for_scx(scriptextensions, scriptname_maps, licensetext, scxfilename, opts.indir); + + combine_properties(combined_properties, general_category_values, "gc", updata::gc_values); + combine_properties(combined_properties, binary_properties, "bp", updata::binary_property_names); + combine_properties(combined_properties, scripts, "sc", scriptname_aliases); + combine_properties(combined_properties, scriptextensions, "scx", scriptname_aliases); +#if !defined(SRELL_NO_VMODE) + combine_pos(combined_pos, emoseq_properties, "bp", updata::emoseq_property_names); +#endif + + do_formatting(outdata, combined_properties, combined_pos, opts.version); + + licensetext.append(1, '\n'); + outdata.insert(0, licensetext); + } + catch (srell::regex_error &e) + { + std::printf("\nError: %s,%d\n", e.what(), e.code()); + errorno = 1; + } + catch (std::runtime_error &e) + { + std::printf("\nError: %s\n", e.what()); + errorno = 2; + } + return errorno; + } + +private: + + typedef srell::re_detail::ui_l32 ui_l32; + typedef srell::re_detail::range_pairs ucprange_array; + typedef srell::re_detail::range_pair u32pair; + typedef u32pair ucprange; + typedef srell::re_detail::range_pair_helper u32rp_helper; + typedef u32rp_helper ucprange_helper; + typedef std::map<std::string, ucprange_array> rangeholder; + typedef srell::re_detail::simple_array<ui_l32> u32array; + typedef std::map<std::string, u32array> seqholder; + typedef std::vector<std::string> strings_type; + typedef std::vector<srell::csub_match> matchranges_type; + typedef std::map<ui_l32, std::string> scriptnameholder; + typedef std::map<std::string, std::string> name_mapper; + typedef std::map<std::string, ui_l32> namenumber_mapper; + typedef name_mapper canonicalname_mapper; + static const ui_l32 invalid_u32value = srell::re_detail::constants::invalid_u32value; + static const ui_l32 compositeclass = invalid_u32value; + + struct sorted_name_and_ranges + { + std::string ptype; + std::string canonicalname; + std::string namealiases; + ucprange_array ucpranges; + }; + typedef std::vector<sorted_name_and_ranges> sortedrangeholder; + + struct sorted_name_and_seqs + { + std::string ptype; + std::string canonicalname; + std::string namealiases; + u32array ucpseqs; + }; + typedef std::vector<sorted_name_and_seqs> sortedseqholder; + + void split2(matchranges_type &parts, const std::string &data, const char splitter) + { + std::string::size_type readpos = 0; + srell::csub_match csm; + + csm.matched = true; + for (;;) + { + std::string::size_type lineend = data.find(splitter, readpos); + + csm.first = data.data() + readpos; + if (lineend == std::string::npos) + { + csm.second = data.data() + data.size(); + parts.push_back(csm); + break; + } + + csm.second = data.data() + lineend; + parts.push_back(csm); + ++lineend; + readpos = lineend; + } + } + + std::string join(const char c, const strings_type &parts, const bool add_final_also = false) + { + std::string out; + + for (strings_type::size_type i = 0; i < parts.size(); ++i) + out.append(parts[i] + c); + + if (!add_final_also && out.size()) + out.resize(out.size() - 1); + + return out; + } + + void read_unidata(rangeholder &gc, rangeholder &bp, const char *const unidatafilename, const char *const indir) + { + const srell::regex re_dataline("^([0-9A-F]+);([^;]*);(([^;]*);(?:[^;]*;){6}([^;]*)(?:;[^;]*){5})$"); + const srell::regex re_rangefirst("^<(.*), First>$"); + + const std::string stringY("Y"); + const std::string stringN("N"); + ui_l32 prevucp = invalid_u32value; + std::string data; + matchranges_type lines; + srell::cmatch cmatch; +// matchranges_type parts; + std::string rangename; + std::string rangefirstproperty; + ui_l32 rangefirstcp = 0; + ucprange range; + ucprange_array bidi_mirrored_ranges; + + unishared::read_file(data, unidatafilename, indir); + split2(lines, data, '\n'); + + for (matchranges_type::size_type i = 0; i < lines.size(); ++i) + { + const srell::csub_match &line = lines[i]; + + if (srell::regex_match(line.first, line.second, cmatch, re_dataline)) + { + const srell::cmatch::value_type &codepoint = cmatch[1]; + const srell::cmatch::value_type &name = cmatch[2]; + const std::string name_string(name.str()); + const std::string property(cmatch[3].str()); + + range.first = range.second = static_cast<ui_l32>(std::strtol(codepoint.first, NULL, 16)); + + if (prevucp >= range.first && prevucp != invalid_u32value) + unishared::throw_error("Out of order: %.4lX >= %.4lX", prevucp, range.first); + +// parts.clear(); +// split2(parts, property, ';'); +// if (parts.size() != 13) +// unishared::throw_error("number of fields is not 13, but %u\n\t[%s]", parts.size(), line.str().c_str()); + +// const std::string &general_category = parts[0]; +// const std::string &bidi_mirrored = parts[7]; + const std::string general_category(cmatch[4].str()); + const std::string bidi_mirrored(cmatch[5].str()); + + prevucp = range.first; + + if (rangename.size()) + { + if (name_string.compare("<" + rangename + ", Last>") != 0) + unishared::throw_error("<%s, Last> does not follow its First line.\n\t%s follows insteadly.", rangename.c_str(), name_string.c_str()); + + if (property != rangefirstproperty) + { + unishared::throw_error("\"%s\": properties of First and Last are different.\n\tFirst: %s\n\tLast: %s", rangename.c_str(), rangefirstproperty.c_str(), property.c_str()); + } + + range.first = rangefirstcp; + rangename.clear(); + } + else if (srell::regex_match(name.first, name.second, cmatch, re_rangefirst)) + { + rangename = cmatch[1]; + rangefirstproperty = property; + rangefirstcp = range.first; + continue; + } + + // Registers "general_category" value. + gc[general_category].join(range); + + // Registers "bidi_mirrored" value. + if (bidi_mirrored == stringY) + { + bidi_mirrored_ranges.join(range); + } + else if (bidi_mirrored != stringN) + unishared::throw_error("Unknown Bidi_Mirrored value [%s] in %s.", bidi_mirrored.c_str(), line.str().c_str()); + } + else if (line.first != line.second) + unishared::throw_error("Unknown format [%s]", line.str().c_str()); + } + bp["Bidi_Mirrored"] = bidi_mirrored_ranges; + } + + void read_scriptnames(canonicalname_mapper &sn_maps, strings_type &sn_aliases, std::string &licensetext, const char *const scfilename, const char *const pvafilename, const up_options &opts) + { + const srell::regex re_scline("^[0-9A-Fa-f.]+\\s*;\\s*(\\S+)"); + const srell::regex re_pvaline("scx?\\s*;\\s*(\\S.*)\\r?\\n?"); + const srell::regex re_split("[ ;]+"); + ui_l32 count = 0; + std::string data; + matchranges_type lines; + srell::cmatch cmatch; + namenumber_mapper seennames; + + unishared::read_file(data, scfilename, opts.indir); + + lines.clear(); + split2(lines, data, '\n'); + + for (matchranges_type::size_type i = 0; i < lines.size(); ++i) + { + const srell::csub_match &line = lines[i]; + + if (srell::regex_search(line.first, line.second, cmatch, re_scline, srell::regex_constants::match_continuous)) + { + const std::string scname(cmatch.str(1)); + + if (!seennames.count(scname)) + { + seennames[scname] = count++; + } + } + } + + if (opts.version >= 300) + { + seennames["Unknown"] = count++; + sn_aliases.resize(count); + } + + typedef std::vector<srell::csub_match> scnames_type; + canonicalname_mapper aliases_tmp; + scnames_type scnames; + + data.clear(); + unishared::read_file(data, pvafilename, opts.indir); + + lines.clear(); + split2(lines, data, '\n'); + + matchranges_type::size_type i = read_license(licensetext, lines, 0); + + for (; i < lines.size(); ++i) + { + const srell::csub_match &line = lines[i]; + + if (srell::regex_match(line.first, line.second, cmatch, re_pvaline, srell::regex_constants::match_continuous)) + { + scnames.clear(); + re_split.split(scnames, cmatch[1].first, cmatch[1].second); + + if (scnames.size() >= 2) + { + const std::string canonicalname(scnames[1]); + + if (seennames.count(canonicalname)) + { + std::string aliases(canonicalname); + + for (scnames_type::size_type i = 0; i < scnames.size(); ++i) + { + const std::string scname(scnames[i].str()); + + sn_maps[scname] = canonicalname; + if ((opts.version < 300 && i != 1) + || (opts.version >= 300 && scname != canonicalname)) + { + aliases += ':'; + aliases += scname; + } + } + if (opts.version >= 300) + sn_aliases[seennames[canonicalname]] = aliases; + else + aliases_tmp[canonicalname] = aliases; + } + } + } + } + + if (opts.version < 300) + { + for (canonicalname_mapper::const_iterator it = aliases_tmp.begin(); it != aliases_tmp.end(); ++it) + sn_aliases.push_back(it->second); + } + } + + matchranges_type::size_type read_license(std::string &licensetext, const matchranges_type &lines, matchranges_type::size_type pos) + { + static const srell::regex re_license("^#[ \\t]*(\\S.*)?$"); + srell::cmatch cm; + + for (; pos < lines.size(); ++pos) + { + const srell::csub_match &line = lines[pos]; + + if (srell::regex_search(line.first, line.second, cm, re_license, srell::regex_constants::match_continuous)) + { + const std::string comment(cm[1].str()); + + if (comment.size()) + licensetext += "// " + comment + '\n'; + else + { + licensetext += "//\n"; + break; + } + } + } + return pos; + } + + // binary properties created from UnicodeData.txt. + void set_additionalbinprops(rangeholder &bp, rangeholder &gc) + { + ucprange_array assigned_ranges; + + for (rangeholder::iterator it = gc.begin(); it != gc.end(); ++it) + assigned_ranges.merge(it->second); + + bp["Any"].join(ucprange_helper(0x0000, 0x10ffff)); + bp["ASCII"].join(ucprange_helper(0x0000, 0x007f)); + bp["Assigned"]; // Only creates. No data. + +// bp["Assigned"] = assigned_ranges; + assigned_ranges.negation(); + gc["Cn"] = assigned_ranges; + } + + void create_compositecategory(rangeholder &gc, const char *const newname, const char *const *categories) + { + ucprange_array array; + ui_l32 total = 0; + + array.append_newpair(ucprange_helper(compositeclass, 0)); + + for (; **categories; ++categories) + { + const char *const c = *categories; + const ui_l32 count = static_cast<ui_l32>(gc[*categories].size()); + + array.append_newpair(ucprange_helper(c[0], c[1])); + array.append_newpair(ucprange_helper(count, 0)); + total += count; + } + array[0].second = total; + gc[newname] = array; + } + + void create_compositecategories(rangeholder &gc) + { + const char *const categoryLC[] = { "Ll", "Lt", "Lu", "" }; + const char *const categoryL[] = { "Ll", "Lt", "Lu", "Lm", "Lo", "" }; + const char *const categoryM[] = { "Mc", "Me", "Mn", "" }; + const char *const categoryN[] = { "Nd", "Nl", "No", "" }; + const char *const categoryC[] = { "Cc", "Cf", "Cn", "Co", "Cs", "" }; + const char *const categoryP[] = { "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps", "" }; + const char *const categoryZ[] = { "Zl", "Zp", "Zs", "" }; + const char *const categoryS[] = { "Sc", "Sk", "Sm", "So", "" }; + + create_compositecategory(gc, "LC", categoryLC); + create_compositecategory(gc, "L", categoryL); + create_compositecategory(gc, "M", categoryM); + create_compositecategory(gc, "N", categoryN); + create_compositecategory(gc, "C", categoryC); + create_compositecategory(gc, "P", categoryP); + create_compositecategory(gc, "Z", categoryZ); + create_compositecategory(gc, "S", categoryS); + } + + void read_binprops(rangeholder &bp, std::string &licensetext, const char *const *propdatafiles, const char *const indir) + { + static const srell::regex re_propfmt("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,}))?\\s*;\\s*([^\\s;#]+)\\s*"); // (#.*)?$"); + ucprange range; + std::string data; + matchranges_type lines; + srell::cmatch cmatch; + + for (; **propdatafiles; ++propdatafiles) + { + data.clear(); + unishared::read_file(data, *propdatafiles, indir); + + lines.clear(); + split2(lines, data, '\n'); + + matchranges_type::size_type i = read_license(licensetext, lines, 0); + + for (; i < lines.size(); ++i) + { + const srell::csub_match &line = lines[i]; + + if (srell::regex_search(line.first, line.second, cmatch, re_propfmt, srell::regex_constants::match_continuous)) + { + const srell::cmatch::value_type &begin = cmatch[1]; + const srell::cmatch::value_type &end = cmatch[2]; + const srell::cmatch::value_type &property = cmatch[3]; +// const srell::cmatch::value_type &comment = cmatch[4]; + + range.first = static_cast<ui_l32>(std::strtol(begin.first, NULL, 16)); + range.second = end.matched ? static_cast<ui_l32>(std::strtol(end.first, NULL, 16)) : range.first; + + bp[property.str()].join(range); + } + } + } + } + + void read_emoseq(seqholder &emsq, std::string &licensetext, const char *const *emodatafiles, const char *const indir) + { + const srell::regex re_emsqfmt("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,})|((?:\\s+[0-9A-Fa-f]{4,})+))?\\s*;\\s*([^\\s;#]+)\\s*"); // (?:\\s*;[^#]*)(#.*)?$"); + const srell::regex re_emsq2fmt("\\s*([0-9A-Fa-f]{4,})"); + std::string data; + matchranges_type lines; + srell::cmatch cmatch; + + for (; **emodatafiles; ++emodatafiles) + { + data.clear(); + unishared::read_file(data, *emodatafiles, indir); + + lines.clear(); + split2(lines, data, '\n'); + + matchranges_type::size_type i = read_license(licensetext, lines, 0); + + for (; i < lines.size(); ++i) + { + const srell::csub_match &line = lines[i]; + + if (srell::regex_search(line.first, line.second, cmatch, re_emsqfmt, srell::regex_constants::match_continuous)) + { + const srell::cmatch::value_type &begin = cmatch[1]; + const srell::cmatch::value_type &end = cmatch[2]; + const srell::cmatch::value_type &seqs = cmatch[3]; + const std::string seqname = cmatch[4].str(); +// const srell::cmatch::value_type &comment = cmatch[5]; + const ui_l32 first = static_cast<ui_l32>(std::strtol(begin.first, NULL, 16)); + + if (seqs.matched) + { + const u32array::size_type orgsize = emsq[seqname].size(); + srell::cregex_iterator2 it(seqs.first, seqs.second, re_emsq2fmt, srell::regex_constants::match_continuous); + ui_l32 count = 2; + + emsq[seqname].push_backncr(0); // Number of code points. + emsq[seqname].push_back(first); + + for (; !it.done(); ++it, ++count) + { + const srell::cmatch::value_type &ucp = (*it)[1]; + + emsq[seqname].push_back(static_cast<ui_l32>(std::strtol(ucp.first, NULL, 16))); + } + emsq[seqname][orgsize] = count; + } + else + { + if (end.matched) + { + emsq[seqname].push_backncr(1); // Range. + emsq[seqname].push_back(first); + emsq[seqname].push_back(static_cast<ui_l32>(std::strtol(end.first, NULL, 16))); + } + else + { + emsq[seqname].push_backncr(2); // Single code point. + emsq[seqname].push_back(first); + } + } + } + } + } + + for (seqholder::iterator it = emsq.begin(); it != emsq.end(); ++it) + { + if (it->second.size() & 1) + { + std::printf("[Info] Padding added to \"%s\" (%u).\n", it->first.c_str(), static_cast<unsigned int>(it->second.size())); + it->second.push_backncr(0); + } + } + + emsq["RGI_Emoji"].push_backncr(compositeclass); // Dummy data. + } + + void read_scripts(rangeholder &sc, std::string &licensetext, const char *const filename, const char *const indir) + { + const srell::regex re_scriptdata("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,}))?\\s*;\\s*([^\\s;#]+)\\s*"); // (#.*)?$"); + ucprange range; + std::string data; + matchranges_type lines; + srell::cmatch cmatch; + ucprange_array assigned_ranges; + + data.clear(); + unishared::read_file(data, filename, indir); + + lines.clear(); + split2(lines, data, '\n'); + + matchranges_type::size_type i = read_license(licensetext, lines, 0); + + for (; i < lines.size(); ++i) + { + const srell::csub_match &line = lines[i]; + + if (srell::regex_search(line.first, line.second, cmatch, re_scriptdata, srell::regex_constants::match_continuous)) + { + const srell::cmatch::value_type &begin = cmatch[1]; + const srell::cmatch::value_type &end = cmatch[2]; + const srell::cmatch::value_type &scriptname = cmatch[3]; +// const srell::cmatch::value_type &comment = cmatch[4]; + + range.first = static_cast<ui_l32>(std::strtol(begin.first, NULL, 16)); + range.second = end.matched ? static_cast<ui_l32>(std::strtol(end.first, NULL, 16)) : range.first; + + sc[scriptname].join(range); + assigned_ranges.join(range); + } + } + assigned_ranges.negation(); + sc["Unknown"] = assigned_ranges; + } + + canonicalname_mapper load_canonicalnames(const char *const *names) + { + canonicalname_mapper canonicalnames; + matchranges_type parts; + + for (; **names; ++names) + { + parts.clear(); + split2(parts, *names, ':'); + const std::string canonicalname(parts[0].str()); + for (matchranges_type::size_type i = 0; i < parts.size(); ++i) + { + canonicalnames[parts[i].str()] = canonicalname; + } + } + return canonicalnames; + } + + void modify_for_scx(rangeholder &scx, const canonicalname_mapper &canonicalnames, std::string &licensetext, const char *const filename, const char *const indir) + { + const srell::regex re_scxdata("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,}))?\\s*;\\s*([^\\s;#][^;#]*[^\\s;#])\\s*", srell::regex::multiline); // (#.*)?$"); + const srell::regex re_space(" "); + const std::string name_common("Common"); + const std::string name_inherited("Inherited"); + ucprange_array common = scx[name_common]; + ucprange_array inherited = scx[name_inherited]; + ucprange range; + std::map<std::string, bool> warning_out; + std::string data; + matchranges_type lines; + srell::cmatch cmatch; + + unishared::read_file(data, filename, indir); + + lines.clear(); + split2(lines, data, '\n'); + + matchranges_type::size_type i = read_license(licensetext, lines, 0); + + for (; i < lines.size(); ++i) + { + const srell::csub_match &line = lines[i]; + + if (srell::regex_search(line.first, line.second, cmatch, re_scxdata, srell::regex_constants::match_continuous)) + { + const srell::cmatch::value_type &begin = cmatch[1]; + const srell::cmatch::value_type &end = cmatch[2]; + const srell::cmatch::value_type &scxnames = cmatch[3]; +// const srell::cmatch::value_type &comment = cmatch[4]; + + range.first = static_cast<ui_l32>(std::strtol(begin.str().c_str(), NULL, 16)); + range.second = end.matched ? static_cast<ui_l32>(std::strtol(end.str().c_str(), NULL, 16)) : range.first; + + common.remove_range(range); + inherited.remove_range(range); + + srell::cregex_iterator2 rei2s(scxnames.first, scxnames.second, re_space); + + for (rei2s.split_begin();; rei2s.split_next()) + { + const std::string scriptname(!rei2s.done() ? rei2s.split_range() : rei2s.split_remainder()); + + if (scriptname.size()) + { + const canonicalname_mapper::const_iterator it = canonicalnames.find(scriptname); + + if (it != canonicalnames.end()) + scx[it->second].join(range); + else + { +// unishared::throw_error("Canonical name for \"%s\" is not found.", scriptname.c_str()); + if (!warning_out.count(scriptname)) + { + std::printf("[Info] Canonical name for \"%s\" is not found. New script?\n", scriptname.c_str()); + warning_out[scriptname] = true; + } + } + } + if (rei2s.done()) + break; + } + } + } + scx[name_common] = common; + scx[name_inherited] = inherited; + } + + void combine_properties(sortedrangeholder &base, const rangeholder &addition, const char *const ptype, const char *const *aliasnames) + { + strings_type aliases; + + for (; **aliasnames; ++aliasnames) + aliases.push_back(std::string(*aliasnames)); + + return combine_properties(base, addition, ptype, aliases); + } + + void combine_properties(sortedrangeholder &base, const rangeholder &addition, const char *const ptype, const strings_type &aliasnames) + { + sorted_name_and_ranges elem; + matchranges_type names; + + for (strings_type::size_type i = 0; i < aliasnames.size(); ++i) + { + const std::string &aliases = aliasnames[i]; + bool pdata_found = false; + + names.clear(); + split2(names, aliases, ':'); + + const std::string canonicalname(names[0].str()); + + for (matchranges_type::size_type j = 0; j < names.size(); ++j) + { + const rangeholder::const_iterator it = addition.find(names[j].str()); + + if (it != addition.end()) + { + elem.ucpranges = it->second; + pdata_found = true; + break; + } + } + + if (!pdata_found) + unishared::throw_error("No property value for \"%s\" found.", aliases.c_str()); + + elem.ptype = ptype; + elem.canonicalname = canonicalname; + elem.namealiases = aliases; + base.push_back(elem); + } + } + +#if !defined(SRELL_NO_VMODE) + + void combine_pos(sortedseqholder &base, const seqholder &addition, const char *const ptype, const char *const *aliasnames) + { + ui_l32 total = 0; + sorted_name_and_seqs elem; + matchranges_type names; + u32array compclass; + + // Composite class. + compclass.push_backncr(compositeclass); + compclass.push_backncr(0); + + elem.ptype = ptype; + for (; **aliasnames; ++aliasnames) + { + const std::string aliases(*aliasnames); + bool pdata_found = false; + + names.clear(); + split2(names, aliases, ':'); + + const std::string canonicalname(names[0].str()); + + for (strings_type::size_type i = 0; i < names.size(); ++i) + { + const seqholder::const_iterator it = addition.find(names[i].str()); + + if (it != addition.end()) + { + elem.ucpseqs = it->second; + pdata_found = true; + if (elem.ucpseqs.size() != 1 || elem.ucpseqs[0] != compositeclass) + { + compclass.push_back(elem.ucpseqs.size()); + total += static_cast<ui_l32>(elem.ucpseqs.size()); + } + break; + } + } + + if (!pdata_found) + unishared::throw_error("No property value for \"%s\" found.", aliases.c_str()); + + elem.canonicalname = canonicalname; + elem.namealiases = aliases; + base.push_back(elem); + } + + // Composite class. + compclass[1] = total; + base[0].ucpseqs = compclass; // [0] = RGI_Emoji. + } + +#endif // !defined(SRELL_NO_VMODE) + + name_mapper create_ptype_mappings() + { + name_mapper categories; + + categories["gc"] = "general_category"; + categories["bp"] = "binary"; + categories["sc"] = "script"; + categories["scx"] = "script_extensions"; + return categories; + } + + std::string create_ptypes(const name_mapper &ptypes, const int version) + { + std::string ptypedef(version >= 300 ? "" : (version >= 201 ? "\tuptype_unknown = 0,\n" : "\tstruct ptype\n\t{\n\t\tstatic const T2 unknown = 0;\n")); + const char *names[] = { "bp", "gc", "sc", "scx", "" }; + const std::string t2head = version >= 201 ? "\t" : "\t\tstatic const T2 "; + const std::string t2tail = version >= 201 ? "," : ";"; + const std::string t2finaltail = version >= 201 ? "" : ";"; + const std::string t2prefix = version >= 201 ? "uptype_" : ""; + + for (unsigned int i = 0; *names[i];) + { + const char *const name = names[i]; + const name_mapper::const_iterator it = ptypes.find(name); + + if (it == ptypes.end()) + unishared::throw_error("Name for ptype \"%s\" is not found.", name); + + ptypedef += t2head + t2prefix + (version >= 300 ? name : it->second) + " = " + unishared::to_string(++i) + t2tail + "\n"; + } + + if (version >= 300) + { + } + else if (version >= 201) + { + drop_finalcomma(ptypedef); + } + else + ptypedef += "\t};\n"; + + return ptypedef; + } + + std::string ranges_to_string(const ucprange_array &array, const std::string &indent, const bool compositeclass) + { + std::string rangestring(indent); + + if (compositeclass) + { + rangestring += "// "; + + for (ucprange_array::size_type i = 1; i < array.size(); ++i) + { + const ucprange &range = array[i]; + + if (i > 1) + rangestring += " + "; + rangestring += static_cast<char>(range.first); + rangestring += static_cast<char>(range.second); + rangestring += ':' + unishared::to_string(array[++i].first); + } + } + else + { + unsigned count = 0; + + for (ucprange_array::size_type i = 0; i < array.size(); ++i) + { + const ucprange &range = array[i]; + if (count == 4) + { + count = 0; + rangestring += '\n' + indent; + } + else if (count) + { + rangestring += ' '; + } + rangestring += "0x" + unishared::to_string(range.first, 16, 4) + ", 0x" + unishared::to_string(range.second, 16, 4) + ','; + ++count; + } + } + return rangestring; + } + +#if !defined(SRELL_NO_VMODE) + std::string seqs_to_string(const u32array &array, const std::string &indent) + { + std::string seqstring; + + if (array.size() == 1 && array[0] == compositeclass) + { + } + else + { + for (u32array::size_type i = 0; i < array.size();) + { + const ui_l32 num = array[i]; + + if (num == compositeclass) + { + break; + } + + if (num == 0) // Padding. + { + seqstring += indent + "0,\t// Padding.\n"; + break; + } + + if (++i == array.size()) + unishared::throw_error("[InternalError] No data follows %u.", num); + + seqstring += indent + unishared::to_string(num); + seqstring += ", 0x" + unishared::to_string(array[i++], 16, 4); + + if (num == 1) // Range. + { + if (i == array.size()) + unishared::throw_error("[InternalError] No pair for %.4lX.", array[i - 1]); + + seqstring += ", 0x" + unishared::to_string(array[i++], 16, 4); + } + else + { + for (ui_l32 j = 2; j < num; ++j) + { + if (i == array.size()) + unishared::throw_error("[InternalError] Broken after %.4lX.", array[i - 1]); + + seqstring += ", 0x" + unishared::to_string(array[i++], 16, 4); + } + } + seqstring += ",\n"; + } + + if (seqstring.size()) + seqstring.resize(seqstring.size() - 1); + } + return seqstring; + } +#endif // !defined(SRELL_NO_VMODE) + + void drop_finalcomma(std::string &data) + { + std::string::size_type commapos = data.rfind(','); + if (commapos != std::string::npos) + data.erase(commapos, 1); + } + + std::string create_pnametable(ui_l32 &count, const int version, const std::string &indent) + { + const char *const *pnames = updata::property_names; + std::string out; + + if (version >= 300) + { + namenumber_mapper categories; + + count = 0u; + for (unsigned int i = 2; **pnames; ++pnames, ++i) + { + const std::string names(*pnames); + srell::sregex_iterator2 rei2(names, re_colon_); + + for (rei2.split_begin();; rei2.split_next()) + { + const std::string name(!rei2.done() ? rei2.split_range() : rei2.split_remainder()); + categories[name] = i; + ++count; + + if (rei2.done()) + break; + } + } + + out.assign(indent + "{ \"\", " + unishared::to_string(count) + " },\n"); + + for (namenumber_mapper::const_iterator it = categories.begin(); it != categories.end(); ++it) + { + out.append(indent); + out.append("{ \""); +#if !defined(NO_LITERAL_ESCAPING) + out.append(escape_string(it->first)); +#else + out.append(it->first); +#endif + out.append("\", " + unishared::to_string(it->second) + " },\n"); + } + } + else + { + out.append(indent + "\"*\",\t// #0:unknown\n"); + out.append(indent + "\"*\",\t// #1:binary\n"); + + for (unsigned int i = 2; **pnames; ++pnames, ++i) + { + out.append(indent); + out.append(1, '"'); + out.append(*pnames); + out.append("\",\t// #" + unishared::to_string(i) + '\n'); + } + out.append(indent + "\"\"\n"); + } + return out; + } + + std::string join_dropcomma_append(const strings_type &s, const std::string &return_table) + { + std::string tmp(join('\n', s, true)); + + drop_finalcomma(tmp); + tmp.append(return_table); + return tmp; + } + + void do_formatting(std::string &out, const sortedrangeholder &alldata, const sortedseqholder &emsq, const int version) + { + const std::size_t numofproperties = sizeof (updata::property_names) / sizeof (updata::property_names[0]) + 1; + const std::string template1(version >= 300 ? "template <typename T3, typename T4, typename T5>\n" : (version >= 201 ? "template <typename T3, typename T4, typename T5, typename T6>\n" : "template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>\n")); + const std::string template2(version >= 300 ? "unicode_property_data<T3, T4, T5>::" : (version >= 201 ? "unicode_property_data<T3, T4, T5, T6>::" : "unicode_property_data<T1, T2, T3, T4, T5, T6>::")); + const std::string return_table(version == 100 ? "\t\t};\n\t\treturn table;\n\t}\n" : "};\n"); + const std::string indent(version == 100 ? "\t\t\t" : "\t"); + name_mapper ptype_mappings(create_ptype_mappings()); + const std::string ptypes(create_ptypes(ptype_mappings, version)); // T2, property types. + const std::string t1head = version >= 201 ? "\t" : "\tstatic const T1 "; + const std::string t1tail = version >= 201 ? "," : ";"; + const std::string t1finaltail = version >= 201 ? "" : ";"; + const std::string t1prefix = version >= 201 ? "upid_" : ""; + const std::string t2scope = version >= 201 ? "{ uptype_" : "{ ptype::"; + const std::string maxorlast = version >= 200 ? "max" : "last"; + + const ui_l32 pno_base = version >= 300 ? numofproperties : 1u; + ui_l32 offset = 0u; + ui_l32 property_number = pno_base; + ui_l32 property_id_number = pno_base; + + std::string pnumbers(t1head + t1prefix + "unknown = 0" + t1tail + "\n"); // T1, property numbers. + strings_type rangetable; + strings_type lookup_ranges; + std::string lookup_numbers; + namenumber_mapper rangeno_map; + + if (version >= 300) + { + pnumbers += t1head + t1prefix + "invalid = 0" + t1tail + "\n"; + pnumbers += t1head + t1prefix + "error = 0" + t1tail + "\n"; + pnumbers += ptypes; + } + + do_formatting2(rangeno_map, lookup_numbers, lookup_ranges, rangetable, pnumbers, property_id_number, property_number, offset, pno_base, maxorlast, t2scope, t1prefix, t1finaltail, t1tail, t1head, ptype_mappings, indent, alldata, emsq, version); + + ui_l32 basepos = 0u; + std::string pnames(create_pnametable(basepos, version, indent)); + + if (version >= 300) + { + u32pair posinfo[numofproperties]; + + sort_rangeno_table(posinfo, basepos, lookup_numbers, rangeno_map, indent); + + lookup_numbers.append(return_table); + + merge_posinfo(lookup_ranges, posinfo, numofproperties, indent); + } + else if (version >= 200) + { + lookup_numbers.append(indent + t2scope + "unknown, 0, \"\" }\n"); + lookup_numbers.append(return_table); + lookup_numbers.insert(0, template1 + "const T5 " + template2 + "rangenumbertable[] =\n{\n\t" + t2scope + "unknown, 0, \"*\" },\t// #0\n"); + } + else + { + lookup_numbers.append(indent + t2scope + "unknown, \"\", 0 }\n"); + lookup_numbers.append(return_table); + lookup_numbers.insert(0, version == 100 ? "\tstatic const T5 *rangenumber_table()\n\t{\n\t\tstatic const T5 table[] =\n\t\t{\n\t\t\t" + t2scope + "unknown, \"*\", 0 },\t// #0\n" : template1 + "const T5 " + template2 + "rangenumbertable[] =\n{\n\t" + t2scope + "unknown, \"*\", 0 },\t// #0\n"); + } + + pnames.insert(0, version == 100 ? "\tstatic const T3 *propertyname_table()\n\t{\n\t\tstatic const T3 table[] =\n\t\t{\n" : template1 + "const T3 " + template2 + (version >= 300 ? "propertynumbertable" : "propertynametable") + "[] =\n{\n"); + if (version < 300) + pnames.append(return_table); + + if (version >= 201) + { + out.append("enum upid_type\n{\n"); + out.append(pnumbers); // T1 + out.append("};\n\n"); + if (version < 300) + { + out.append("enum up_type\n{\n"); + out.append(ptypes); + out.append("};\n\n"); + } + out.append(template1 + "struct unicode_property_data\n{\n"); + } + else + { + out.append(template1 + "struct unicode_property_data\n{\n"); + out.append(pnumbers); + out.append(ptypes); + } + if (version == 100) + { + out.append(pnames); + out.append(std::string("\tstatic const T4 *ranges()\n\t{\n\t\tstatic const T4 table[] =\n\t\t{\n")); + out.append(join_dropcomma_append(rangetable, return_table)); + out.append(lookup_numbers); + out.append(std::string("\tstatic const T6 *position_table()\n\t{\n\t\tstatic const T6 table[] =\n\t\t{\n\t\t\t{ 0, 0 },\t// #0 unknown\n")); + out.append(join_dropcomma_append(lookup_ranges, return_table)); + out.append("};\n"); + } + else + { + if (version >= 300) + { + out.append("\tstatic const T3 propertynumbertable[];\n"); + out.append("\tstatic const T4 positiontable[];\n"); + out.append("\tstatic const T5 rangetable[];\n"); + } + else + { + out.append("\tstatic const T3 propertynametable[];\n"); + out.append("\tstatic const T4 rangetable[];\n"); + out.append("\tstatic const T5 rangenumbertable[];\n"); + out.append("\tstatic const T6 positiontable[];\n"); + } + + if (version <= 200) + { + out.append("\n\tstatic const T3 *propertyname_table()\n\t{\n\t\treturn propertynametable;\n\t}\n"); + out.append("\tstatic const T4 *ranges()\n\t{\n\t\treturn rangetable;\n\t}\n"); + out.append("\tstatic const T5 *rangenumber_table()\n\t{\n\t\treturn rangenumbertable;\n\t}\n"); + out.append("\tstatic const T6 *position_table()\n\t{\n\t\treturn positiontable;\n\t}\n"); + } + out.append("};\n\n"); + out.append(pnames); // T3 + + if (version < 300) + { + out.append("\n"); + out.append(template1 + "const T4 " + template2 + "rangetable[] =\n{\n"); + out.append(join_dropcomma_append(rangetable, return_table)); // T4 + out.append("\n"); + } + + out.append(lookup_numbers); // T5 + out.append("\n"); + + out.append(template1 + (version >= 300 ? "const T4 " : "const T6 ") + template2 + "positiontable[] =\n{\n\t{ 0, 0 },\t// #0 unknown\n"); + out.append(join_dropcomma_append(lookup_ranges, return_table)); // T6 + if (version >= 300) + { + out.append("\n"); + + out.append(template1 + "const T5 " + template2 + "rangetable[] =\n{\n"); + out.append(join_dropcomma_append(rangetable, return_table)); // T4 + } + } + if (version > 100) + out.append("#define SRELL_UPDATA_VERSION " + unishared::to_string(static_cast<unsigned int>(version)) + "\n"); + } + + void do_formatting2( + namenumber_mapper &rangeno_map, std::string &lookup_numbers, strings_type &lookup_ranges, strings_type &rangetable, std::string &pnumbers, + ui_l32 &property_id_number, ui_l32 &property_number, ui_l32 &offset, const ui_l32 pno_base, + const std::string &maxorlast, const std::string &t2scope, const std::string &t1prefix, const std::string &t1finaltail, const std::string &t1tail, const std::string &t1head, name_mapper &ptype_mappings, const std::string &indent, const sortedrangeholder &alldata, const sortedseqholder &emsq, const int version) + { + namenumber_mapper registered; + srell::re_detail::simple_array<ucprange> rangepos; + srell::sregex_iterator2 rei2; + + for (sortedrangeholder::size_type i = 0; i < alldata.size(); ++i) + { + const sorted_name_and_ranges &elem = alldata[i]; + const std::string ptype = elem.ptype; + const std::string name = elem.canonicalname; + const std::string aliases = elem.namealiases; + const ucprange_array &array = elem.ucpranges; + const std::string pnumber_keyname(ptype + '_' + name); + const std::string position_comment(' ' + ptype + '=' + aliases); + const bool compositeclass_found = array.size() && array[0].first == compositeclass; + std::string rangestring(ranges_to_string(array, indent, compositeclass_found)); + ui_l32 numofranges = static_cast<ui_l32 >(array.size()); + ui_l32 pno = property_number; + const namenumber_mapper::const_iterator rit = registered.find(rangestring); + + if (rit != registered.end()) + { + pno = rit->second; + + lookup_ranges[pno - pno_base] += position_comment; + rangetable[(pno - pno_base) * 2] += position_comment; + + if (version >= 300) + { + rei2.assign(aliases, re_colon_); + + for (rei2.split_begin();; rei2.split_next()) + { + const std::string alias(!rei2.done() ? rei2.split_range() : rei2.split_remainder()); + rangeno_map[ptype + ':' + alias] = pno; + if (rei2.done()) + break; + } + } + else if (version >= 200) + { + lookup_numbers.append(indent + t2scope + ptype_mappings[ptype] + ", " + unishared::to_string(pno) + ", \"" + aliases + "\" },\t// #" + unishared::to_string(property_id_number) + "\n"); + } + else + lookup_numbers.append(indent + t2scope + ptype_mappings[ptype] + ", \"" + aliases + "\", " + unishared::to_string(pno) + " },\t// #" + unishared::to_string(property_id_number) + "\n"); + } + else + { + // ucpranges of "Assigned" is empty. + if (compositeclass_found) + { + std::printf("[Info] Composite property \"%s\" found.\n", aliases.c_str()); + numofranges = array[0].second; + } + else + { + registered[rangestring] = property_number; + } + + if (version >= 300) + { + rei2.assign(aliases, re_colon_); + + for (rei2.split_begin();; rei2.split_next()) + { + const std::string alias(!rei2.done() ? rei2.split_range() : rei2.split_remainder()); + rangeno_map[ptype + ':' + alias] = property_number; + if (rei2.done()) + break; + } + } + else if (version >= 200) + { + lookup_numbers.append(indent + t2scope + ptype_mappings[ptype] + ", " + unishared::to_string(property_number) + ", \"" + aliases + "\" },\t// #" + unishared::to_string(property_id_number) + "\n"); + } + else + lookup_numbers.append(indent + t2scope + ptype_mappings[ptype] + ", \"" + aliases + "\", " + unishared::to_string(property_number) + " },\t// #" + unishared::to_string(property_id_number) + "\n"); + + lookup_ranges.push_back(indent + "{ " + unishared::to_string(offset) + ", " + unishared::to_string(numofranges) + " },\t// #" + unishared::to_string(pno) + position_comment); + + rangetable.push_back(indent + "// #" + unishared::to_string(pno) + " (" + unishared::to_string(offset) + '+' + unishared::to_string(numofranges) + "):" + position_comment); + rangetable.push_back(rangestring); + + rangepos.push_back(ucprange_helper(offset, numofranges)); + + if (!compositeclass_found) + offset += numofranges; + + ++property_number; + } + + if (version >= 300) + pnumbers.append(t1head + pnumber_keyname + " = " + unishared::to_string(pno) + t1tail + (pno != property_id_number ? ("\t// #" + unishared::to_string(property_id_number)) : "") + '\n'); + else + pnumbers.append(t1head + pnumber_keyname + " = " + unishared::to_string(property_id_number) + t1tail + "\t// #" + unishared::to_string(pno) + '\n'); + ++property_id_number; + } + + pnumbers.append(t1head + t1prefix + maxorlast + "_property_number = " + unishared::to_string(property_number - 1) + t1tail + "\n"); + +#if !defined(SRELL_NO_VMODE) + if (rangetable.size()) + drop_finalcomma(rangetable[rangetable.size() - 1]); + rangetable.push_back("#if !defined(SRELL_NO_UNICODE_POS)\n" + indent + ","); + + if (version < 300) + lookup_numbers.append("#if !defined(SRELL_NO_UNICODE_POS)\n"); + + for (sortedseqholder::size_type i = 0; i < emsq.size(); ++i) + { + const sorted_name_and_seqs &elem = emsq[i]; + const std::string ptype = elem.ptype; + const std::string name = elem.canonicalname; + const std::string aliases = elem.namealiases; + const u32array &array = elem.ucpseqs; + const bool compositeclass_found = array.size() && array[0] == compositeclass; + const std::string pnumber_keyname(ptype + '_' + name); + const std::string position_comment(' ' + ptype + '=' + aliases); + ui_l32 numofseqs = static_cast<ui_l32>(array.size()); + std::string seqstring; + + if (compositeclass_found) + { + std::printf("[Info] Composite property \"%s\" found.\n", aliases.c_str()); + numofseqs = array[1]; + seqstring = indent + "// "; + + for (u32array::size_type i = 2; i < array.size(); ++i) + { + if (i > 2) + seqstring += " + "; + seqstring += unishared::to_string(array[i]) + "/2"; + } + } + else + { + seqstring = seqs_to_string(array, indent); + } + + const ui_l32 numofranges = numofseqs / 2; + + if (version >= 300) + pnumbers.append(t1head + pnumber_keyname + " = " + unishared::to_string(property_number) + t1tail + "\t// #" + unishared::to_string(property_id_number) + '\n'); + else + pnumbers.append(t1head + pnumber_keyname + " = " + unishared::to_string(property_id_number) + t1tail + "\t// #" + unishared::to_string(property_number) + '\n'); + + if (version >= 300) + { + rei2.assign(aliases, re_colon_); + + for (rei2.split_begin();; rei2.split_next()) + { + const std::string alias(!rei2.done() ? rei2.split_range() : rei2.split_remainder()); + rangeno_map[ptype + ':' + aliases] = property_number; + if (rei2.done()) + break; + } + } + else if (version >= 200) + { + lookup_numbers.append(indent + t2scope + ptype_mappings[ptype] + ", " + unishared::to_string(property_number) + ", \"" + aliases + "\" },\t// #" + unishared::to_string(property_id_number) + "\n"); + } + else + lookup_numbers.append(indent + t2scope + ptype_mappings[ptype] + ", \"" + aliases + "\", " + unishared::to_string(property_number) + " },\t// #" + unishared::to_string(property_id_number) + "\n"); + lookup_ranges.push_back(indent + "{ " + unishared::to_string(offset) + ", " + unishared::to_string(numofranges) + " },\t// #" + unishared::to_string(property_number) + position_comment); + rangetable.push_back(indent + "// #" + unishared::to_string(property_number) + " (" + unishared::to_string(offset) + '+' + unishared::to_string(numofseqs) + "/2):" + position_comment); + rangetable.push_back(seqstring); + + ++property_number; + ++property_id_number; + if (!compositeclass_found) + offset += numofranges; + } + + pnumbers.append(t1head + t1prefix + maxorlast + "_pos_number = " + unishared::to_string(--property_number) + t1finaltail + "\n"); + rangetable.push_back("#endif\t// !defined(SRELL_NO_UNICODE_POS)"); + if (version < 300) + lookup_numbers.append("#endif\t// !defined(SRELL_NO_UNICODE_POS)\n"); + +#endif // !defined(SRELL_NO_VMODE) + } + + void sort_rangeno_table(u32pair *const posinfo, ui_l32 offset, std::string &lookup_numbers, const namenumber_mapper &rangeno_map, const std::string &indent) + { + typedef std::vector<srell::ssub_match> names_type; + names_type names; + name_mapper pvalues; + namenumber_mapper pcounts; + + for (namenumber_mapper::const_iterator it = rangeno_map.begin(); it != rangeno_map.end(); ++it) + { + names.clear(); + re_colon_.split(names, it->first, 2); + + if (names.size() == 2) + { + const std::string pname(names[0].str()); + const std::string pvalue(names[1].str()); +#if !defined(NO_LITERAL_ESCAPING) + pvalues[pname] += indent + "{ \"" + escape_string(pvalue) + "\", " + unishared::to_string(it->second) + " },\n"; +#else + pvalues[pname] += indent + "{ \"" + pvalue + "\", " + unishared::to_string(it->second) + " },\n"; +#endif + ++pcounts[pname]; + } + } + + offset += set_pvalue_and_count(lookup_numbers, posinfo[2], "gc", offset, pcounts, pvalues, indent); + offset += set_pvalue_and_count(lookup_numbers, posinfo[1], "bp", offset, pcounts, pvalues, indent); + offset += set_pvalue_and_count(lookup_numbers, posinfo[3], "sc", offset, pcounts, pvalues, indent); + offset += set_pvalue_and_count(lookup_numbers, posinfo[4], "scx", offset, pcounts, pvalues, indent); + drop_finalcomma(lookup_numbers); + } + + ui_l32 set_pvalue_and_count(std::string &lookup_numbers, u32pair &posinfo, const std::string category, const ui_l32 offset, namenumber_mapper &pcounts, name_mapper &pvalues, const std::string &indent) + { + lookup_numbers.append(indent + "// " + category + ": " + unishared::to_string(pcounts[category]) + "\n" + pvalues[category]); + posinfo.set(offset, pcounts[category]); + return posinfo.second; + } + + void merge_posinfo(strings_type &lookup_ranges, const u32pair *const posinfo, const std::size_t numofproperties, const std::string &indent) + { + for (std::size_t i = 1; i < numofproperties; ++i) + { + const u32pair &pair = posinfo[i]; + const std::string line(indent + "{ " + unishared::to_string(pair.first) + ", " + unishared::to_string(pair.second) + " },\t// #" + unishared::to_string(i) + ' ' + (i == 1 ? "binary" : updata::property_names[i - 2])); + + lookup_ranges.insert(lookup_ranges.begin() + i - 1, line); + } + } + + std::string escape_string(const std::string &s) + { + static const char hex[] = "0123456789ABCDEF"; + std::string out; + + for (std::string::size_type i = 0; i < s.size(); ++i) + { + out.append("\\x"); + out.append(1, hex[(s[i] >> 4) & 15]); + out.append(1, hex[s[i] & 15]); + } + return out; + } + + srell::regex re_colon_; +}; +// class unicode_property + +int main(const int argc, const char *const *const argv) +{ + up_options upopts(argc, argv); + std::string outdata; + unicode_property up; + int errorno = up.create_updata(outdata, upopts); + + if (errorno == 0) + { + if (!unishared::write_file(upopts.outfilename, outdata)) + errorno = 2; + } + return errorno; +}