Mercurial > minori
comparison dep/toml11/toml/lexer.hpp @ 318:3b355fa948c7
config: use TOML instead of INI
unfortunately, INI is not enough, and causes some paths including
semicolons to break with our current storage of the library folders.
so, I decided to switch to TOML which does support real arrays...
| author | Paper <paper@paper.us.eu.org> |
|---|---|
| date | Wed, 12 Jun 2024 05:25:41 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 317:b1f4d1867ab1 | 318:3b355fa948c7 |
|---|---|
| 1 // Copyright Toru Niina 2017. | |
| 2 // Distributed under the MIT License. | |
| 3 #ifndef TOML11_LEXER_HPP | |
| 4 #define TOML11_LEXER_HPP | |
| 5 #include <istream> | |
| 6 #include <sstream> | |
| 7 #include <stdexcept> | |
| 8 | |
| 9 #include "combinator.hpp" | |
| 10 | |
| 11 namespace toml | |
| 12 { | |
| 13 namespace detail | |
| 14 { | |
| 15 | |
| 16 // each of these scans the contents of a container of char from the current | |
| 17 // location and extracts the region that matches its own pattern. | |
| 18 // to see the implementation of each component, see combinator.hpp. | |
| 19 | |
| 20 using lex_wschar = either<character<' '>, character<'\t'>>; | |
| 21 using lex_ws = repeat<lex_wschar, at_least<1>>; | |
| 22 using lex_newline = either<character<'\n'>, | |
| 23 sequence<character<'\r'>, character<'\n'>>>; | |
| 24 using lex_lower = in_range<'a', 'z'>; | |
| 25 using lex_upper = in_range<'A', 'Z'>; | |
| 26 using lex_alpha = either<lex_lower, lex_upper>; | |
| 27 using lex_digit = in_range<'0', '9'>; | |
| 28 using lex_nonzero = in_range<'1', '9'>; | |
| 29 using lex_oct_dig = in_range<'0', '7'>; | |
| 30 using lex_bin_dig = in_range<'0', '1'>; | |
| 31 using lex_hex_dig = either<lex_digit, in_range<'A', 'F'>, in_range<'a', 'f'>>; | |
| 32 | |
| 33 using lex_hex_prefix = sequence<character<'0'>, character<'x'>>; | |
| 34 using lex_oct_prefix = sequence<character<'0'>, character<'o'>>; | |
| 35 using lex_bin_prefix = sequence<character<'0'>, character<'b'>>; | |
| 36 using lex_underscore = character<'_'>; | |
| 37 using lex_plus = character<'+'>; | |
| 38 using lex_minus = character<'-'>; | |
| 39 using lex_sign = either<lex_plus, lex_minus>; | |
| 40 | |
| 41 // digit | nonzero 1*(digit | _ digit) | |
| 42 using lex_unsigned_dec_int = either<sequence<lex_nonzero, repeat< | |
| 43 either<lex_digit, sequence<lex_underscore, lex_digit>>, at_least<1>>>, | |
| 44 lex_digit>; | |
| 45 // (+|-)? unsigned_dec_int | |
| 46 using lex_dec_int = sequence<maybe<lex_sign>, lex_unsigned_dec_int>; | |
| 47 | |
| 48 // hex_prefix hex_dig *(hex_dig | _ hex_dig) | |
| 49 using lex_hex_int = sequence<lex_hex_prefix, sequence<lex_hex_dig, repeat< | |
| 50 either<lex_hex_dig, sequence<lex_underscore, lex_hex_dig>>, unlimited>>>; | |
| 51 // oct_prefix oct_dig *(oct_dig | _ oct_dig) | |
| 52 using lex_oct_int = sequence<lex_oct_prefix, sequence<lex_oct_dig, repeat< | |
| 53 either<lex_oct_dig, sequence<lex_underscore, lex_oct_dig>>, unlimited>>>; | |
| 54 // bin_prefix bin_dig *(bin_dig | _ bin_dig) | |
| 55 using lex_bin_int = sequence<lex_bin_prefix, sequence<lex_bin_dig, repeat< | |
| 56 either<lex_bin_dig, sequence<lex_underscore, lex_bin_dig>>, unlimited>>>; | |
| 57 | |
| 58 // (dec_int | hex_int | oct_int | bin_int) | |
| 59 using lex_integer = either<lex_bin_int, lex_oct_int, lex_hex_int, lex_dec_int>; | |
| 60 | |
| 61 // =========================================================================== | |
| 62 | |
| 63 using lex_inf = sequence<character<'i'>, character<'n'>, character<'f'>>; | |
| 64 using lex_nan = sequence<character<'n'>, character<'a'>, character<'n'>>; | |
| 65 using lex_special_float = sequence<maybe<lex_sign>, either<lex_inf, lex_nan>>; | |
| 66 | |
| 67 using lex_zero_prefixable_int = sequence<lex_digit, repeat<either<lex_digit, | |
| 68 sequence<lex_underscore, lex_digit>>, unlimited>>; | |
| 69 | |
| 70 using lex_fractional_part = sequence<character<'.'>, lex_zero_prefixable_int>; | |
| 71 | |
| 72 using lex_exponent_part = sequence<either<character<'e'>, character<'E'>>, | |
| 73 maybe<lex_sign>, lex_zero_prefixable_int>; | |
| 74 | |
| 75 using lex_float = either<lex_special_float, | |
| 76 sequence<lex_dec_int, either<lex_exponent_part, | |
| 77 sequence<lex_fractional_part, maybe<lex_exponent_part>>>>>; | |
| 78 | |
| 79 // =========================================================================== | |
| 80 | |
| 81 using lex_true = sequence<character<'t'>, character<'r'>, | |
| 82 character<'u'>, character<'e'>>; | |
| 83 using lex_false = sequence<character<'f'>, character<'a'>, character<'l'>, | |
| 84 character<'s'>, character<'e'>>; | |
| 85 using lex_boolean = either<lex_true, lex_false>; | |
| 86 | |
| 87 // =========================================================================== | |
| 88 | |
| 89 using lex_date_fullyear = repeat<lex_digit, exactly<4>>; | |
| 90 using lex_date_month = repeat<lex_digit, exactly<2>>; | |
| 91 using lex_date_mday = repeat<lex_digit, exactly<2>>; | |
| 92 using lex_time_delim = either<character<'T'>, character<'t'>, character<' '>>; | |
| 93 using lex_time_hour = repeat<lex_digit, exactly<2>>; | |
| 94 using lex_time_minute = repeat<lex_digit, exactly<2>>; | |
| 95 using lex_time_second = repeat<lex_digit, exactly<2>>; | |
| 96 using lex_time_secfrac = sequence<character<'.'>, | |
| 97 repeat<lex_digit, at_least<1>>>; | |
| 98 | |
| 99 using lex_time_numoffset = sequence<either<character<'+'>, character<'-'>>, | |
| 100 sequence<lex_time_hour, character<':'>, | |
| 101 lex_time_minute>>; | |
| 102 using lex_time_offset = either<character<'Z'>, character<'z'>, | |
| 103 lex_time_numoffset>; | |
| 104 | |
| 105 using lex_partial_time = sequence<lex_time_hour, character<':'>, | |
| 106 lex_time_minute, character<':'>, | |
| 107 lex_time_second, maybe<lex_time_secfrac>>; | |
| 108 using lex_full_date = sequence<lex_date_fullyear, character<'-'>, | |
| 109 lex_date_month, character<'-'>, | |
| 110 lex_date_mday>; | |
| 111 using lex_full_time = sequence<lex_partial_time, lex_time_offset>; | |
| 112 | |
| 113 using lex_offset_date_time = sequence<lex_full_date, lex_time_delim, lex_full_time>; | |
| 114 using lex_local_date_time = sequence<lex_full_date, lex_time_delim, lex_partial_time>; | |
| 115 using lex_local_date = lex_full_date; | |
| 116 using lex_local_time = lex_partial_time; | |
| 117 | |
| 118 // =========================================================================== | |
| 119 | |
| 120 using lex_quotation_mark = character<'"'>; | |
| 121 using lex_basic_unescaped = exclude<either<in_range<0x00, 0x08>, // 0x09 (tab) is allowed | |
| 122 in_range<0x0A, 0x1F>, | |
| 123 character<0x22>, character<0x5C>, | |
| 124 character<0x7F>>>; | |
| 125 | |
| 126 using lex_escape = character<'\\'>; | |
| 127 using lex_escape_unicode_short = sequence<character<'u'>, | |
| 128 repeat<lex_hex_dig, exactly<4>>>; | |
| 129 using lex_escape_unicode_long = sequence<character<'U'>, | |
| 130 repeat<lex_hex_dig, exactly<8>>>; | |
| 131 using lex_escape_seq_char = either<character<'"'>, character<'\\'>, | |
| 132 character<'b'>, character<'f'>, | |
| 133 character<'n'>, character<'r'>, | |
| 134 character<'t'>, | |
| 135 #ifdef TOML11_USE_UNRELEASED_TOML_FEATURES | |
| 136 character<'e'>, // ESC (0x1B) | |
| 137 #endif | |
| 138 lex_escape_unicode_short, | |
| 139 lex_escape_unicode_long | |
| 140 >; | |
| 141 using lex_escaped = sequence<lex_escape, lex_escape_seq_char>; | |
| 142 using lex_basic_char = either<lex_basic_unescaped, lex_escaped>; | |
| 143 using lex_basic_string = sequence<lex_quotation_mark, | |
| 144 repeat<lex_basic_char, unlimited>, | |
| 145 lex_quotation_mark>; | |
| 146 | |
| 147 // Since TOML post-v0.5.0, it has been explicitly clarified how quotes in | |
| 148 // ml-strings are allowed to be used. | |
| 149 // Since then, the following usages are *explicitly* allowed. | |
| 150 // - One or two `"`s in a multi-line basic string are allowed anywhere. | |
| 151 // - Three consecutive `"`s in a multi-line basic string are considered a delimiter. | |
| 152 // - One or two `"`s can appear just before or after the delimiter. | |
| 153 // ```toml | |
| 154 // str4 = """Here are two quotation marks: "". Simple enough.""" | |
| 155 // str5 = """Here are three quotation marks: ""\".""" | |
| 156 // str6 = """Here are fifteen quotation marks: ""\"""\"""\"""\"""\".""" | |
| 157 // str7 = """"This," she said, "is just a pointless statement."""" | |
| 158 // ``` | |
| 159 // In the current implementation (v3.3.0), it is difficult to parse `str7` in | |
| 160 // the above example. It is difficult to recognize the `"` at the end of the | |
| 161 // string body correctly. It will be misunderstood as a `"""` delimiter and an additional, | |
| 162 // invalid `"`. Like this: | |
| 163 // ```console | |
| 164 // what(): [error] toml::parse_table: invalid line format | |
| 165 // --> hoge.toml | |
| 166 // | | |
| 167 // 13 | str7 = """"This," she said, "is just a pointless statement."""" | |
| 168 // | ^- expected newline, but got '"'. | |
| 169 // ``` | |
| 170 // As a quick workaround for this problem, `lex_ml_basic_string_delim` was | |
| 171 // split into two, `lex_ml_basic_string_open` and `lex_ml_basic_string_close`. | |
| 172 // `lex_ml_basic_string_open` allows only `"""`. `_close` allows 3-5 `"`s. | |
| 173 // In parse_ml_basic_string() function, the trailing `"`s will be attached to | |
| 174 // the string body. | |
| 175 // | |
| 176 using lex_ml_basic_string_delim = repeat<lex_quotation_mark, exactly<3>>; | |
| 177 using lex_ml_basic_string_open = lex_ml_basic_string_delim; | |
| 178 using lex_ml_basic_string_close = sequence< | |
| 179 repeat<lex_quotation_mark, exactly<3>>, | |
| 180 maybe<lex_quotation_mark>, maybe<lex_quotation_mark> | |
| 181 >; | |
| 182 | |
| 183 using lex_ml_basic_unescaped = exclude<either<in_range<0x00, 0x08>, // 0x09 is tab | |
| 184 in_range<0x0A, 0x1F>, | |
| 185 character<0x5C>, // backslash | |
| 186 character<0x7F>, // DEL | |
| 187 lex_ml_basic_string_delim>>; | |
| 188 | |
| 189 using lex_ml_basic_escaped_newline = sequence< | |
| 190 lex_escape, maybe<lex_ws>, lex_newline, | |
| 191 repeat<either<lex_ws, lex_newline>, unlimited>>; | |
| 192 | |
| 193 using lex_ml_basic_char = either<lex_ml_basic_unescaped, lex_escaped>; | |
| 194 using lex_ml_basic_body = repeat<either<lex_ml_basic_char, lex_newline, | |
| 195 lex_ml_basic_escaped_newline>, | |
| 196 unlimited>; | |
| 197 using lex_ml_basic_string = sequence<lex_ml_basic_string_open, | |
| 198 lex_ml_basic_body, | |
| 199 lex_ml_basic_string_close>; | |
| 200 | |
| 201 using lex_literal_char = exclude<either<in_range<0x00, 0x08>, in_range<0x0A, 0x1F>, | |
| 202 character<0x7F>, character<0x27>>>; | |
| 203 using lex_apostrophe = character<'\''>; | |
| 204 using lex_literal_string = sequence<lex_apostrophe, | |
| 205 repeat<lex_literal_char, unlimited>, | |
| 206 lex_apostrophe>; | |
| 207 | |
| 208 // split into open/close for the same reason as lex_ml_basic_string_delim above. | |
| 209 using lex_ml_literal_string_delim = repeat<lex_apostrophe, exactly<3>>; | |
| 210 using lex_ml_literal_string_open = lex_ml_literal_string_delim; | |
| 211 using lex_ml_literal_string_close = sequence< | |
| 212 repeat<lex_apostrophe, exactly<3>>, | |
| 213 maybe<lex_apostrophe>, maybe<lex_apostrophe> | |
| 214 >; | |
| 215 | |
| 216 using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>, | |
| 217 in_range<0x0A, 0x1F>, | |
| 218 character<0x7F>, | |
| 219 lex_ml_literal_string_delim>>; | |
| 220 using lex_ml_literal_body = repeat<either<lex_ml_literal_char, lex_newline>, | |
| 221 unlimited>; | |
| 222 using lex_ml_literal_string = sequence<lex_ml_literal_string_open, | |
| 223 lex_ml_literal_body, | |
| 224 lex_ml_literal_string_close>; | |
| 225 | |
| 226 using lex_string = either<lex_ml_basic_string, lex_basic_string, | |
| 227 lex_ml_literal_string, lex_literal_string>; | |
| 228 | |
| 229 // =========================================================================== | |
| 230 using lex_dot_sep = sequence<maybe<lex_ws>, character<'.'>, maybe<lex_ws>>; | |
| 231 | |
| 232 using lex_unquoted_key = repeat<either<lex_alpha, lex_digit, | |
| 233 character<'-'>, character<'_'>>, | |
| 234 at_least<1>>; | |
| 235 using lex_quoted_key = either<lex_basic_string, lex_literal_string>; | |
| 236 using lex_simple_key = either<lex_unquoted_key, lex_quoted_key>; | |
| 237 using lex_dotted_key = sequence<lex_simple_key, | |
| 238 repeat<sequence<lex_dot_sep, lex_simple_key>, | |
| 239 at_least<1> | |
| 240 > | |
| 241 >; | |
| 242 using lex_key = either<lex_dotted_key, lex_simple_key>; | |
| 243 | |
| 244 using lex_keyval_sep = sequence<maybe<lex_ws>, | |
| 245 character<'='>, | |
| 246 maybe<lex_ws>>; | |
| 247 | |
| 248 using lex_std_table_open = character<'['>; | |
| 249 using lex_std_table_close = character<']'>; | |
| 250 using lex_std_table = sequence<lex_std_table_open, | |
| 251 maybe<lex_ws>, | |
| 252 lex_key, | |
| 253 maybe<lex_ws>, | |
| 254 lex_std_table_close>; | |
| 255 | |
| 256 using lex_array_table_open = sequence<lex_std_table_open, lex_std_table_open>; | |
| 257 using lex_array_table_close = sequence<lex_std_table_close, lex_std_table_close>; | |
| 258 using lex_array_table = sequence<lex_array_table_open, | |
| 259 maybe<lex_ws>, | |
| 260 lex_key, | |
| 261 maybe<lex_ws>, | |
| 262 lex_array_table_close>; | |
| 263 | |
| 264 using lex_utf8_1byte = in_range<0x00, 0x7F>; | |
| 265 using lex_utf8_2byte = sequence< | |
| 266 in_range<'\xC2', '\xDF'>, | |
| 267 in_range<'\x80', '\xBF'> | |
| 268 >; | |
| 269 using lex_utf8_3byte = sequence<either< | |
| 270 sequence<character<'\xE0'>, in_range<'\xA0', '\xBF'>>, | |
| 271 sequence<in_range<'\xE1', '\xEC'>, in_range<'\x80', '\xBF'>>, | |
| 272 sequence<character<'\xED'>, in_range<'\x80', '\x9F'>>, | |
| 273 sequence<in_range<'\xEE', '\xEF'>, in_range<'\x80', '\xBF'>> | |
| 274 >, in_range<'\x80', '\xBF'>>; | |
| 275 using lex_utf8_4byte = sequence<either< | |
| 276 sequence<character<'\xF0'>, in_range<'\x90', '\xBF'>>, | |
| 277 sequence<in_range<'\xF1', '\xF3'>, in_range<'\x80', '\xBF'>>, | |
| 278 sequence<character<'\xF4'>, in_range<'\x80', '\x8F'>> | |
| 279 >, in_range<'\x80', '\xBF'>, in_range<'\x80', '\xBF'>>; | |
| 280 using lex_utf8_code = either< | |
| 281 lex_utf8_1byte, | |
| 282 lex_utf8_2byte, | |
| 283 lex_utf8_3byte, | |
| 284 lex_utf8_4byte | |
| 285 >; | |
| 286 | |
| 287 using lex_comment_start_symbol = character<'#'>; | |
| 288 using lex_non_eol_ascii = either<character<0x09>, in_range<0x20, 0x7E>>; | |
| 289 using lex_comment = sequence<lex_comment_start_symbol, repeat<either< | |
| 290 lex_non_eol_ascii, lex_utf8_2byte, lex_utf8_3byte, lex_utf8_4byte>, unlimited>>; | |
| 291 | |
| 292 } // detail | |
| 293 } // toml | |
| 294 #endif // TOML11_LEXER_HPP |
