318
+ − 1 // Copyright Toru Niina 2017.
+ − 2 // Distributed under the MIT License.
+ − 3 #ifndef TOML11_LEXER_HPP
+ − 4 #define TOML11_LEXER_HPP
+ − 5 #include <istream>
+ − 6 #include <sstream>
+ − 7 #include <stdexcept>
+ − 8
+ − 9 #include "combinator.hpp"
+ − 10
+ − 11 namespace toml
+ − 12 {
+ − 13 namespace detail
+ − 14 {
+ − 15
// These patterns scan the contents of a container of char, starting from the
// current location, and extract the region that matches their own pattern.
// For the implementation of each component, see combinator.hpp.
+ − 19
+ − 20 using lex_wschar = either<character<' '>, character<'\t'>>;
+ − 21 using lex_ws = repeat<lex_wschar, at_least<1>>;
+ − 22 using lex_newline = either<character<'\n'>,
+ − 23 sequence<character<'\r'>, character<'\n'>>>;
+ − 24 using lex_lower = in_range<'a', 'z'>;
+ − 25 using lex_upper = in_range<'A', 'Z'>;
+ − 26 using lex_alpha = either<lex_lower, lex_upper>;
+ − 27 using lex_digit = in_range<'0', '9'>;
+ − 28 using lex_nonzero = in_range<'1', '9'>;
+ − 29 using lex_oct_dig = in_range<'0', '7'>;
+ − 30 using lex_bin_dig = in_range<'0', '1'>;
+ − 31 using lex_hex_dig = either<lex_digit, in_range<'A', 'F'>, in_range<'a', 'f'>>;
+ − 32
+ − 33 using lex_hex_prefix = sequence<character<'0'>, character<'x'>>;
+ − 34 using lex_oct_prefix = sequence<character<'0'>, character<'o'>>;
+ − 35 using lex_bin_prefix = sequence<character<'0'>, character<'b'>>;
+ − 36 using lex_underscore = character<'_'>;
+ − 37 using lex_plus = character<'+'>;
+ − 38 using lex_minus = character<'-'>;
+ − 39 using lex_sign = either<lex_plus, lex_minus>;
+ − 40
+ − 41 // digit | nonzero 1*(digit | _ digit)
+ − 42 using lex_unsigned_dec_int = either<sequence<lex_nonzero, repeat<
+ − 43 either<lex_digit, sequence<lex_underscore, lex_digit>>, at_least<1>>>,
+ − 44 lex_digit>;
+ − 45 // (+|-)? unsigned_dec_int
+ − 46 using lex_dec_int = sequence<maybe<lex_sign>, lex_unsigned_dec_int>;
+ − 47
+ − 48 // hex_prefix hex_dig *(hex_dig | _ hex_dig)
+ − 49 using lex_hex_int = sequence<lex_hex_prefix, sequence<lex_hex_dig, repeat<
+ − 50 either<lex_hex_dig, sequence<lex_underscore, lex_hex_dig>>, unlimited>>>;
+ − 51 // oct_prefix oct_dig *(oct_dig | _ oct_dig)
+ − 52 using lex_oct_int = sequence<lex_oct_prefix, sequence<lex_oct_dig, repeat<
+ − 53 either<lex_oct_dig, sequence<lex_underscore, lex_oct_dig>>, unlimited>>>;
+ − 54 // bin_prefix bin_dig *(bin_dig | _ bin_dig)
+ − 55 using lex_bin_int = sequence<lex_bin_prefix, sequence<lex_bin_dig, repeat<
+ − 56 either<lex_bin_dig, sequence<lex_underscore, lex_bin_dig>>, unlimited>>>;
+ − 57
+ − 58 // (dec_int | hex_int | oct_int | bin_int)
+ − 59 using lex_integer = either<lex_bin_int, lex_oct_int, lex_hex_int, lex_dec_int>;
+ − 60
+ − 61 // ===========================================================================
+ − 62
+ − 63 using lex_inf = sequence<character<'i'>, character<'n'>, character<'f'>>;
+ − 64 using lex_nan = sequence<character<'n'>, character<'a'>, character<'n'>>;
+ − 65 using lex_special_float = sequence<maybe<lex_sign>, either<lex_inf, lex_nan>>;
+ − 66
+ − 67 using lex_zero_prefixable_int = sequence<lex_digit, repeat<either<lex_digit,
+ − 68 sequence<lex_underscore, lex_digit>>, unlimited>>;
+ − 69
+ − 70 using lex_fractional_part = sequence<character<'.'>, lex_zero_prefixable_int>;
+ − 71
+ − 72 using lex_exponent_part = sequence<either<character<'e'>, character<'E'>>,
+ − 73 maybe<lex_sign>, lex_zero_prefixable_int>;
+ − 74
+ − 75 using lex_float = either<lex_special_float,
+ − 76 sequence<lex_dec_int, either<lex_exponent_part,
+ − 77 sequence<lex_fractional_part, maybe<lex_exponent_part>>>>>;
+ − 78
+ − 79 // ===========================================================================
+ − 80
+ − 81 using lex_true = sequence<character<'t'>, character<'r'>,
+ − 82 character<'u'>, character<'e'>>;
+ − 83 using lex_false = sequence<character<'f'>, character<'a'>, character<'l'>,
+ − 84 character<'s'>, character<'e'>>;
+ − 85 using lex_boolean = either<lex_true, lex_false>;
+ − 86
+ − 87 // ===========================================================================
+ − 88
+ − 89 using lex_date_fullyear = repeat<lex_digit, exactly<4>>;
+ − 90 using lex_date_month = repeat<lex_digit, exactly<2>>;
+ − 91 using lex_date_mday = repeat<lex_digit, exactly<2>>;
+ − 92 using lex_time_delim = either<character<'T'>, character<'t'>, character<' '>>;
+ − 93 using lex_time_hour = repeat<lex_digit, exactly<2>>;
+ − 94 using lex_time_minute = repeat<lex_digit, exactly<2>>;
+ − 95 using lex_time_second = repeat<lex_digit, exactly<2>>;
+ − 96 using lex_time_secfrac = sequence<character<'.'>,
+ − 97 repeat<lex_digit, at_least<1>>>;
+ − 98
+ − 99 using lex_time_numoffset = sequence<either<character<'+'>, character<'-'>>,
+ − 100 sequence<lex_time_hour, character<':'>,
+ − 101 lex_time_minute>>;
+ − 102 using lex_time_offset = either<character<'Z'>, character<'z'>,
+ − 103 lex_time_numoffset>;
+ − 104
+ − 105 using lex_partial_time = sequence<lex_time_hour, character<':'>,
+ − 106 lex_time_minute, character<':'>,
+ − 107 lex_time_second, maybe<lex_time_secfrac>>;
+ − 108 using lex_full_date = sequence<lex_date_fullyear, character<'-'>,
+ − 109 lex_date_month, character<'-'>,
+ − 110 lex_date_mday>;
+ − 111 using lex_full_time = sequence<lex_partial_time, lex_time_offset>;
+ − 112
+ − 113 using lex_offset_date_time = sequence<lex_full_date, lex_time_delim, lex_full_time>;
+ − 114 using lex_local_date_time = sequence<lex_full_date, lex_time_delim, lex_partial_time>;
+ − 115 using lex_local_date = lex_full_date;
+ − 116 using lex_local_time = lex_partial_time;
+ − 117
+ − 118 // ===========================================================================
+ − 119
+ − 120 using lex_quotation_mark = character<'"'>;
+ − 121 using lex_basic_unescaped = exclude<either<in_range<0x00, 0x08>, // 0x09 (tab) is allowed
+ − 122 in_range<0x0A, 0x1F>,
+ − 123 character<0x22>, character<0x5C>,
+ − 124 character<0x7F>>>;
+ − 125
+ − 126 using lex_escape = character<'\\'>;
+ − 127 using lex_escape_unicode_short = sequence<character<'u'>,
+ − 128 repeat<lex_hex_dig, exactly<4>>>;
+ − 129 using lex_escape_unicode_long = sequence<character<'U'>,
+ − 130 repeat<lex_hex_dig, exactly<8>>>;
+ − 131 using lex_escape_seq_char = either<character<'"'>, character<'\\'>,
+ − 132 character<'b'>, character<'f'>,
+ − 133 character<'n'>, character<'r'>,
+ − 134 character<'t'>,
+ − 135 #ifdef TOML11_USE_UNRELEASED_TOML_FEATURES
+ − 136 character<'e'>, // ESC (0x1B)
+ − 137 #endif
+ − 138 lex_escape_unicode_short,
+ − 139 lex_escape_unicode_long
+ − 140 >;
+ − 141 using lex_escaped = sequence<lex_escape, lex_escape_seq_char>;
+ − 142 using lex_basic_char = either<lex_basic_unescaped, lex_escaped>;
+ − 143 using lex_basic_string = sequence<lex_quotation_mark,
+ − 144 repeat<lex_basic_char, unlimited>,
+ − 145 lex_quotation_mark>;
+ − 146
+ − 147 // After toml post-v0.5.0, it is explicitly clarified how quotes in ml-strings
+ − 148 // are allowed to be used.
+ − 149 // After this, the following strings are *explicitly* allowed.
+ − 150 // - One or two `"`s in a multi-line basic string is allowed wherever it is.
+ − 151 // - Three consecutive `"`s in a multi-line basic string is considered as a delimiter.
+ − 152 // - One or two `"`s can appear just before or after the delimiter.
+ − 153 // ```toml
+ − 154 // str4 = """Here are two quotation marks: "". Simple enough."""
+ − 155 // str5 = """Here are three quotation marks: ""\"."""
+ − 156 // str6 = """Here are fifteen quotation marks: ""\"""\"""\"""\"""\"."""
+ − 157 // str7 = """"This," she said, "is just a pointless statement.""""
+ − 158 // ```
+ − 159 // In the current implementation (v3.3.0), it is difficult to parse `str7` in
+ − 160 // the above example. It is difficult to recognize `"` at the end of string body
+ − 161 // collectly. It will be misunderstood as a `"""` delimiter and an additional,
+ − 162 // invalid `"`. Like this:
+ − 163 // ```console
+ − 164 // what(): [error] toml::parse_table: invalid line format
+ − 165 // --> hoge.toml
+ − 166 // |
+ − 167 // 13 | str7 = """"This," she said, "is just a pointless statement.""""
+ − 168 // | ^- expected newline, but got '"'.
+ − 169 // ```
+ − 170 // As a quick workaround for this problem, `lex_ml_basic_string_delim` was
+ − 171 // split into two, `lex_ml_basic_string_open` and `lex_ml_basic_string_close`.
+ − 172 // `lex_ml_basic_string_open` allows only `"""`. `_close` allows 3-5 `"`s.
+ − 173 // In parse_ml_basic_string() function, the trailing `"`s will be attached to
+ − 174 // the string body.
+ − 175 //
+ − 176 using lex_ml_basic_string_delim = repeat<lex_quotation_mark, exactly<3>>;
+ − 177 using lex_ml_basic_string_open = lex_ml_basic_string_delim;
+ − 178 using lex_ml_basic_string_close = sequence<
+ − 179 repeat<lex_quotation_mark, exactly<3>>,
+ − 180 maybe<lex_quotation_mark>, maybe<lex_quotation_mark>
+ − 181 >;
+ − 182
+ − 183 using lex_ml_basic_unescaped = exclude<either<in_range<0x00, 0x08>, // 0x09 is tab
+ − 184 in_range<0x0A, 0x1F>,
+ − 185 character<0x5C>, // backslash
+ − 186 character<0x7F>, // DEL
+ − 187 lex_ml_basic_string_delim>>;
+ − 188
+ − 189 using lex_ml_basic_escaped_newline = sequence<
+ − 190 lex_escape, maybe<lex_ws>, lex_newline,
+ − 191 repeat<either<lex_ws, lex_newline>, unlimited>>;
+ − 192
+ − 193 using lex_ml_basic_char = either<lex_ml_basic_unescaped, lex_escaped>;
+ − 194 using lex_ml_basic_body = repeat<either<lex_ml_basic_char, lex_newline,
+ − 195 lex_ml_basic_escaped_newline>,
+ − 196 unlimited>;
+ − 197 using lex_ml_basic_string = sequence<lex_ml_basic_string_open,
+ − 198 lex_ml_basic_body,
+ − 199 lex_ml_basic_string_close>;
+ − 200
+ − 201 using lex_literal_char = exclude<either<in_range<0x00, 0x08>, in_range<0x0A, 0x1F>,
+ − 202 character<0x7F>, character<0x27>>>;
+ − 203 using lex_apostrophe = character<'\''>;
+ − 204 using lex_literal_string = sequence<lex_apostrophe,
+ − 205 repeat<lex_literal_char, unlimited>,
+ − 206 lex_apostrophe>;
+ − 207
+ − 208 // the same reason as above.
+ − 209 using lex_ml_literal_string_delim = repeat<lex_apostrophe, exactly<3>>;
+ − 210 using lex_ml_literal_string_open = lex_ml_literal_string_delim;
+ − 211 using lex_ml_literal_string_close = sequence<
+ − 212 repeat<lex_apostrophe, exactly<3>>,
+ − 213 maybe<lex_apostrophe>, maybe<lex_apostrophe>
+ − 214 >;
+ − 215
+ − 216 using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>,
+ − 217 in_range<0x0A, 0x1F>,
+ − 218 character<0x7F>,
+ − 219 lex_ml_literal_string_delim>>;
+ − 220 using lex_ml_literal_body = repeat<either<lex_ml_literal_char, lex_newline>,
+ − 221 unlimited>;
+ − 222 using lex_ml_literal_string = sequence<lex_ml_literal_string_open,
+ − 223 lex_ml_literal_body,
+ − 224 lex_ml_literal_string_close>;
+ − 225
+ − 226 using lex_string = either<lex_ml_basic_string, lex_basic_string,
+ − 227 lex_ml_literal_string, lex_literal_string>;
+ − 228
+ − 229 // ===========================================================================
+ − 230 using lex_dot_sep = sequence<maybe<lex_ws>, character<'.'>, maybe<lex_ws>>;
+ − 231
+ − 232 using lex_unquoted_key = repeat<either<lex_alpha, lex_digit,
+ − 233 character<'-'>, character<'_'>>,
+ − 234 at_least<1>>;
+ − 235 using lex_quoted_key = either<lex_basic_string, lex_literal_string>;
+ − 236 using lex_simple_key = either<lex_unquoted_key, lex_quoted_key>;
+ − 237 using lex_dotted_key = sequence<lex_simple_key,
+ − 238 repeat<sequence<lex_dot_sep, lex_simple_key>,
+ − 239 at_least<1>
+ − 240 >
+ − 241 >;
+ − 242 using lex_key = either<lex_dotted_key, lex_simple_key>;
+ − 243
+ − 244 using lex_keyval_sep = sequence<maybe<lex_ws>,
+ − 245 character<'='>,
+ − 246 maybe<lex_ws>>;
+ − 247
+ − 248 using lex_std_table_open = character<'['>;
+ − 249 using lex_std_table_close = character<']'>;
+ − 250 using lex_std_table = sequence<lex_std_table_open,
+ − 251 maybe<lex_ws>,
+ − 252 lex_key,
+ − 253 maybe<lex_ws>,
+ − 254 lex_std_table_close>;
+ − 255
+ − 256 using lex_array_table_open = sequence<lex_std_table_open, lex_std_table_open>;
+ − 257 using lex_array_table_close = sequence<lex_std_table_close, lex_std_table_close>;
+ − 258 using lex_array_table = sequence<lex_array_table_open,
+ − 259 maybe<lex_ws>,
+ − 260 lex_key,
+ − 261 maybe<lex_ws>,
+ − 262 lex_array_table_close>;
+ − 263
+ − 264 using lex_utf8_1byte = in_range<0x00, 0x7F>;
+ − 265 using lex_utf8_2byte = sequence<
+ − 266 in_range<'\xC2', '\xDF'>,
+ − 267 in_range<'\x80', '\xBF'>
+ − 268 >;
+ − 269 using lex_utf8_3byte = sequence<either<
+ − 270 sequence<character<'\xE0'>, in_range<'\xA0', '\xBF'>>,
+ − 271 sequence<in_range<'\xE1', '\xEC'>, in_range<'\x80', '\xBF'>>,
+ − 272 sequence<character<'\xED'>, in_range<'\x80', '\x9F'>>,
+ − 273 sequence<in_range<'\xEE', '\xEF'>, in_range<'\x80', '\xBF'>>
+ − 274 >, in_range<'\x80', '\xBF'>>;
+ − 275 using lex_utf8_4byte = sequence<either<
+ − 276 sequence<character<'\xF0'>, in_range<'\x90', '\xBF'>>,
+ − 277 sequence<in_range<'\xF1', '\xF3'>, in_range<'\x80', '\xBF'>>,
+ − 278 sequence<character<'\xF4'>, in_range<'\x80', '\x8F'>>
+ − 279 >, in_range<'\x80', '\xBF'>, in_range<'\x80', '\xBF'>>;
+ − 280 using lex_utf8_code = either<
+ − 281 lex_utf8_1byte,
+ − 282 lex_utf8_2byte,
+ − 283 lex_utf8_3byte,
+ − 284 lex_utf8_4byte
+ − 285 >;
+ − 286
+ − 287 using lex_comment_start_symbol = character<'#'>;
+ − 288 using lex_non_eol_ascii = either<character<0x09>, in_range<0x20, 0x7E>>;
+ − 289 using lex_comment = sequence<lex_comment_start_symbol, repeat<either<
+ − 290 lex_non_eol_ascii, lex_utf8_2byte, lex_utf8_3byte, lex_utf8_4byte>, unlimited>>;
+ − 291
+ − 292 } // detail
+ − 293 } // toml
#endif // TOML11_LEXER_HPP