318
|
1 // Copyright Toru Niina 2017.
|
|
2 // Distributed under the MIT License.
|
|
3 #ifndef TOML11_LEXER_HPP
|
|
4 #define TOML11_LEXER_HPP
|
|
5 #include <istream>
|
|
6 #include <sstream>
|
|
7 #include <stdexcept>
|
|
8
|
|
9 #include "combinator.hpp"
|
|
10
|
|
11 namespace toml
|
|
12 {
|
|
13 namespace detail
|
|
14 {
|
|
15
|
|
// Each of these aliases scans the contents of a container of char, starting
// from the current location, and extracts the region that matches its pattern.
// For the implementation of each component, see combinator.hpp.
|
|
19
|
|
20 using lex_wschar = either<character<' '>, character<'\t'>>;
|
|
21 using lex_ws = repeat<lex_wschar, at_least<1>>;
|
|
22 using lex_newline = either<character<'\n'>,
|
|
23 sequence<character<'\r'>, character<'\n'>>>;
|
|
24 using lex_lower = in_range<'a', 'z'>;
|
|
25 using lex_upper = in_range<'A', 'Z'>;
|
|
26 using lex_alpha = either<lex_lower, lex_upper>;
|
|
27 using lex_digit = in_range<'0', '9'>;
|
|
28 using lex_nonzero = in_range<'1', '9'>;
|
|
29 using lex_oct_dig = in_range<'0', '7'>;
|
|
30 using lex_bin_dig = in_range<'0', '1'>;
|
|
31 using lex_hex_dig = either<lex_digit, in_range<'A', 'F'>, in_range<'a', 'f'>>;
|
|
32
|
|
33 using lex_hex_prefix = sequence<character<'0'>, character<'x'>>;
|
|
34 using lex_oct_prefix = sequence<character<'0'>, character<'o'>>;
|
|
35 using lex_bin_prefix = sequence<character<'0'>, character<'b'>>;
|
|
36 using lex_underscore = character<'_'>;
|
|
37 using lex_plus = character<'+'>;
|
|
38 using lex_minus = character<'-'>;
|
|
39 using lex_sign = either<lex_plus, lex_minus>;
|
|
40
|
|
41 // digit | nonzero 1*(digit | _ digit)
|
|
42 using lex_unsigned_dec_int = either<sequence<lex_nonzero, repeat<
|
|
43 either<lex_digit, sequence<lex_underscore, lex_digit>>, at_least<1>>>,
|
|
44 lex_digit>;
|
|
45 // (+|-)? unsigned_dec_int
|
|
46 using lex_dec_int = sequence<maybe<lex_sign>, lex_unsigned_dec_int>;
|
|
47
|
|
48 // hex_prefix hex_dig *(hex_dig | _ hex_dig)
|
|
49 using lex_hex_int = sequence<lex_hex_prefix, sequence<lex_hex_dig, repeat<
|
|
50 either<lex_hex_dig, sequence<lex_underscore, lex_hex_dig>>, unlimited>>>;
|
|
51 // oct_prefix oct_dig *(oct_dig | _ oct_dig)
|
|
52 using lex_oct_int = sequence<lex_oct_prefix, sequence<lex_oct_dig, repeat<
|
|
53 either<lex_oct_dig, sequence<lex_underscore, lex_oct_dig>>, unlimited>>>;
|
|
54 // bin_prefix bin_dig *(bin_dig | _ bin_dig)
|
|
55 using lex_bin_int = sequence<lex_bin_prefix, sequence<lex_bin_dig, repeat<
|
|
56 either<lex_bin_dig, sequence<lex_underscore, lex_bin_dig>>, unlimited>>>;
|
|
57
|
|
58 // (dec_int | hex_int | oct_int | bin_int)
|
|
59 using lex_integer = either<lex_bin_int, lex_oct_int, lex_hex_int, lex_dec_int>;
|
|
60
|
|
61 // ===========================================================================
|
|
62
|
|
63 using lex_inf = sequence<character<'i'>, character<'n'>, character<'f'>>;
|
|
64 using lex_nan = sequence<character<'n'>, character<'a'>, character<'n'>>;
|
|
65 using lex_special_float = sequence<maybe<lex_sign>, either<lex_inf, lex_nan>>;
|
|
66
|
|
67 using lex_zero_prefixable_int = sequence<lex_digit, repeat<either<lex_digit,
|
|
68 sequence<lex_underscore, lex_digit>>, unlimited>>;
|
|
69
|
|
70 using lex_fractional_part = sequence<character<'.'>, lex_zero_prefixable_int>;
|
|
71
|
|
72 using lex_exponent_part = sequence<either<character<'e'>, character<'E'>>,
|
|
73 maybe<lex_sign>, lex_zero_prefixable_int>;
|
|
74
|
|
75 using lex_float = either<lex_special_float,
|
|
76 sequence<lex_dec_int, either<lex_exponent_part,
|
|
77 sequence<lex_fractional_part, maybe<lex_exponent_part>>>>>;
|
|
78
|
|
79 // ===========================================================================
|
|
80
|
|
81 using lex_true = sequence<character<'t'>, character<'r'>,
|
|
82 character<'u'>, character<'e'>>;
|
|
83 using lex_false = sequence<character<'f'>, character<'a'>, character<'l'>,
|
|
84 character<'s'>, character<'e'>>;
|
|
85 using lex_boolean = either<lex_true, lex_false>;
|
|
86
|
|
87 // ===========================================================================
|
|
88
|
|
89 using lex_date_fullyear = repeat<lex_digit, exactly<4>>;
|
|
90 using lex_date_month = repeat<lex_digit, exactly<2>>;
|
|
91 using lex_date_mday = repeat<lex_digit, exactly<2>>;
|
|
92 using lex_time_delim = either<character<'T'>, character<'t'>, character<' '>>;
|
|
93 using lex_time_hour = repeat<lex_digit, exactly<2>>;
|
|
94 using lex_time_minute = repeat<lex_digit, exactly<2>>;
|
|
95 using lex_time_second = repeat<lex_digit, exactly<2>>;
|
|
96 using lex_time_secfrac = sequence<character<'.'>,
|
|
97 repeat<lex_digit, at_least<1>>>;
|
|
98
|
|
99 using lex_time_numoffset = sequence<either<character<'+'>, character<'-'>>,
|
|
100 sequence<lex_time_hour, character<':'>,
|
|
101 lex_time_minute>>;
|
|
102 using lex_time_offset = either<character<'Z'>, character<'z'>,
|
|
103 lex_time_numoffset>;
|
|
104
|
|
105 using lex_partial_time = sequence<lex_time_hour, character<':'>,
|
|
106 lex_time_minute, character<':'>,
|
|
107 lex_time_second, maybe<lex_time_secfrac>>;
|
|
108 using lex_full_date = sequence<lex_date_fullyear, character<'-'>,
|
|
109 lex_date_month, character<'-'>,
|
|
110 lex_date_mday>;
|
|
111 using lex_full_time = sequence<lex_partial_time, lex_time_offset>;
|
|
112
|
|
113 using lex_offset_date_time = sequence<lex_full_date, lex_time_delim, lex_full_time>;
|
|
114 using lex_local_date_time = sequence<lex_full_date, lex_time_delim, lex_partial_time>;
|
|
115 using lex_local_date = lex_full_date;
|
|
116 using lex_local_time = lex_partial_time;
|
|
117
|
|
118 // ===========================================================================
|
|
119
|
|
120 using lex_quotation_mark = character<'"'>;
|
|
121 using lex_basic_unescaped = exclude<either<in_range<0x00, 0x08>, // 0x09 (tab) is allowed
|
|
122 in_range<0x0A, 0x1F>,
|
|
123 character<0x22>, character<0x5C>,
|
|
124 character<0x7F>>>;
|
|
125
|
|
126 using lex_escape = character<'\\'>;
|
|
127 using lex_escape_unicode_short = sequence<character<'u'>,
|
|
128 repeat<lex_hex_dig, exactly<4>>>;
|
|
129 using lex_escape_unicode_long = sequence<character<'U'>,
|
|
130 repeat<lex_hex_dig, exactly<8>>>;
|
|
131 using lex_escape_seq_char = either<character<'"'>, character<'\\'>,
|
|
132 character<'b'>, character<'f'>,
|
|
133 character<'n'>, character<'r'>,
|
|
134 character<'t'>,
|
|
135 #ifdef TOML11_USE_UNRELEASED_TOML_FEATURES
|
|
136 character<'e'>, // ESC (0x1B)
|
|
137 #endif
|
|
138 lex_escape_unicode_short,
|
|
139 lex_escape_unicode_long
|
|
140 >;
|
|
141 using lex_escaped = sequence<lex_escape, lex_escape_seq_char>;
|
|
142 using lex_basic_char = either<lex_basic_unescaped, lex_escaped>;
|
|
143 using lex_basic_string = sequence<lex_quotation_mark,
|
|
144 repeat<lex_basic_char, unlimited>,
|
|
145 lex_quotation_mark>;
|
|
146
|
|
// After toml post-v0.5.0, it is explicitly clarified how quotes in ml-strings
// are allowed to be used.
// After this, the following strings are *explicitly* allowed.
// - One or two `"`s in a multi-line basic string are allowed wherever they appear.
// - Three consecutive `"`s in a multi-line basic string are considered a delimiter.
// - One or two `"`s can appear just before or after the delimiter.
// ```toml
// str4 = """Here are two quotation marks: "". Simple enough."""
// str5 = """Here are three quotation marks: ""\"."""
// str6 = """Here are fifteen quotation marks: ""\"""\"""\"""\"""\"."""
// str7 = """"This," she said, "is just a pointless statement.""""
// ```
// In the current implementation (v3.3.0), it is difficult to parse `str7` in
// the above example. It is difficult to recognize the `"` at the end of the
// string body correctly. It will be misunderstood as a `"""` delimiter followed
// by an additional, invalid `"`. Like this:
// ```console
// what(): [error] toml::parse_table: invalid line format
//  --> hoge.toml
//     |
//  13 | str7 = """"This," she said, "is just a pointless statement.""""
//     |                                                               ^- expected newline, but got '"'.
// ```
// As a quick workaround for this problem, `lex_ml_basic_string_delim` was
// split into two, `lex_ml_basic_string_open` and `lex_ml_basic_string_close`.
// `lex_ml_basic_string_open` allows only `"""`. `_close` allows 3-5 `"`s.
// In the parse_ml_basic_string() function, the trailing `"`s will be attached
// to the string body.
//
|
|
176 using lex_ml_basic_string_delim = repeat<lex_quotation_mark, exactly<3>>;
|
|
177 using lex_ml_basic_string_open = lex_ml_basic_string_delim;
|
|
178 using lex_ml_basic_string_close = sequence<
|
|
179 repeat<lex_quotation_mark, exactly<3>>,
|
|
180 maybe<lex_quotation_mark>, maybe<lex_quotation_mark>
|
|
181 >;
|
|
182
|
|
183 using lex_ml_basic_unescaped = exclude<either<in_range<0x00, 0x08>, // 0x09 is tab
|
|
184 in_range<0x0A, 0x1F>,
|
|
185 character<0x5C>, // backslash
|
|
186 character<0x7F>, // DEL
|
|
187 lex_ml_basic_string_delim>>;
|
|
188
|
|
189 using lex_ml_basic_escaped_newline = sequence<
|
|
190 lex_escape, maybe<lex_ws>, lex_newline,
|
|
191 repeat<either<lex_ws, lex_newline>, unlimited>>;
|
|
192
|
|
193 using lex_ml_basic_char = either<lex_ml_basic_unescaped, lex_escaped>;
|
|
194 using lex_ml_basic_body = repeat<either<lex_ml_basic_char, lex_newline,
|
|
195 lex_ml_basic_escaped_newline>,
|
|
196 unlimited>;
|
|
197 using lex_ml_basic_string = sequence<lex_ml_basic_string_open,
|
|
198 lex_ml_basic_body,
|
|
199 lex_ml_basic_string_close>;
|
|
200
|
|
201 using lex_literal_char = exclude<either<in_range<0x00, 0x08>, in_range<0x0A, 0x1F>,
|
|
202 character<0x7F>, character<0x27>>>;
|
|
203 using lex_apostrophe = character<'\''>;
|
|
204 using lex_literal_string = sequence<lex_apostrophe,
|
|
205 repeat<lex_literal_char, unlimited>,
|
|
206 lex_apostrophe>;
|
|
207
|
|
208 // the same reason as above.
|
|
209 using lex_ml_literal_string_delim = repeat<lex_apostrophe, exactly<3>>;
|
|
210 using lex_ml_literal_string_open = lex_ml_literal_string_delim;
|
|
211 using lex_ml_literal_string_close = sequence<
|
|
212 repeat<lex_apostrophe, exactly<3>>,
|
|
213 maybe<lex_apostrophe>, maybe<lex_apostrophe>
|
|
214 >;
|
|
215
|
|
216 using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>,
|
|
217 in_range<0x0A, 0x1F>,
|
|
218 character<0x7F>,
|
|
219 lex_ml_literal_string_delim>>;
|
|
220 using lex_ml_literal_body = repeat<either<lex_ml_literal_char, lex_newline>,
|
|
221 unlimited>;
|
|
222 using lex_ml_literal_string = sequence<lex_ml_literal_string_open,
|
|
223 lex_ml_literal_body,
|
|
224 lex_ml_literal_string_close>;
|
|
225
|
|
226 using lex_string = either<lex_ml_basic_string, lex_basic_string,
|
|
227 lex_ml_literal_string, lex_literal_string>;
|
|
228
|
|
229 // ===========================================================================
|
|
230 using lex_dot_sep = sequence<maybe<lex_ws>, character<'.'>, maybe<lex_ws>>;
|
|
231
|
|
232 using lex_unquoted_key = repeat<either<lex_alpha, lex_digit,
|
|
233 character<'-'>, character<'_'>>,
|
|
234 at_least<1>>;
|
|
235 using lex_quoted_key = either<lex_basic_string, lex_literal_string>;
|
|
236 using lex_simple_key = either<lex_unquoted_key, lex_quoted_key>;
|
|
237 using lex_dotted_key = sequence<lex_simple_key,
|
|
238 repeat<sequence<lex_dot_sep, lex_simple_key>,
|
|
239 at_least<1>
|
|
240 >
|
|
241 >;
|
|
242 using lex_key = either<lex_dotted_key, lex_simple_key>;
|
|
243
|
|
244 using lex_keyval_sep = sequence<maybe<lex_ws>,
|
|
245 character<'='>,
|
|
246 maybe<lex_ws>>;
|
|
247
|
|
248 using lex_std_table_open = character<'['>;
|
|
249 using lex_std_table_close = character<']'>;
|
|
250 using lex_std_table = sequence<lex_std_table_open,
|
|
251 maybe<lex_ws>,
|
|
252 lex_key,
|
|
253 maybe<lex_ws>,
|
|
254 lex_std_table_close>;
|
|
255
|
|
256 using lex_array_table_open = sequence<lex_std_table_open, lex_std_table_open>;
|
|
257 using lex_array_table_close = sequence<lex_std_table_close, lex_std_table_close>;
|
|
258 using lex_array_table = sequence<lex_array_table_open,
|
|
259 maybe<lex_ws>,
|
|
260 lex_key,
|
|
261 maybe<lex_ws>,
|
|
262 lex_array_table_close>;
|
|
263
|
|
264 using lex_utf8_1byte = in_range<0x00, 0x7F>;
|
|
265 using lex_utf8_2byte = sequence<
|
|
266 in_range<'\xC2', '\xDF'>,
|
|
267 in_range<'\x80', '\xBF'>
|
|
268 >;
|
|
269 using lex_utf8_3byte = sequence<either<
|
|
270 sequence<character<'\xE0'>, in_range<'\xA0', '\xBF'>>,
|
|
271 sequence<in_range<'\xE1', '\xEC'>, in_range<'\x80', '\xBF'>>,
|
|
272 sequence<character<'\xED'>, in_range<'\x80', '\x9F'>>,
|
|
273 sequence<in_range<'\xEE', '\xEF'>, in_range<'\x80', '\xBF'>>
|
|
274 >, in_range<'\x80', '\xBF'>>;
|
|
275 using lex_utf8_4byte = sequence<either<
|
|
276 sequence<character<'\xF0'>, in_range<'\x90', '\xBF'>>,
|
|
277 sequence<in_range<'\xF1', '\xF3'>, in_range<'\x80', '\xBF'>>,
|
|
278 sequence<character<'\xF4'>, in_range<'\x80', '\x8F'>>
|
|
279 >, in_range<'\x80', '\xBF'>, in_range<'\x80', '\xBF'>>;
|
|
280 using lex_utf8_code = either<
|
|
281 lex_utf8_1byte,
|
|
282 lex_utf8_2byte,
|
|
283 lex_utf8_3byte,
|
|
284 lex_utf8_4byte
|
|
285 >;
|
|
286
|
|
287 using lex_comment_start_symbol = character<'#'>;
|
|
288 using lex_non_eol_ascii = either<character<0x09>, in_range<0x20, 0x7E>>;
|
|
289 using lex_comment = sequence<lex_comment_start_symbol, repeat<either<
|
|
290 lex_non_eol_ascii, lex_utf8_2byte, lex_utf8_3byte, lex_utf8_4byte>, unlimited>>;
|
|
291
|
|
292 } // detail
|
|
293 } // toml
|
|
#endif // TOML11_LEXER_HPP
|