comparison dep/toml11/toml/parser.hpp @ 318:3b355fa948c7

config: use TOML instead of INI. Unfortunately, INI is not enough, and causes some paths including semicolons to break with our current storage of the library folders. So, I decided to switch to TOML, which does support real arrays...
author Paper <paper@paper.us.eu.org>
date Wed, 12 Jun 2024 05:25:41 -0400
parents
children
comparison
equal deleted inserted replaced
317:b1f4d1867ab1 318:3b355fa948c7
1 // Copyright Toru Niina 2017.
2 // Distributed under the MIT License.
3 #ifndef TOML11_PARSER_HPP
4 #define TOML11_PARSER_HPP
5 #include <cstring>
6 #include <fstream>
7 #include <sstream>
8
9 #include "combinator.hpp"
10 #include "lexer.hpp"
11 #include "macros.hpp"
12 #include "region.hpp"
13 #include "result.hpp"
14 #include "types.hpp"
15 #include "value.hpp"
16
17 #ifndef TOML11_DISABLE_STD_FILESYSTEM
18 #ifdef __cpp_lib_filesystem
19 #if __has_include(<filesystem>)
20 #define TOML11_HAS_STD_FILESYSTEM
21 #include <filesystem>
22 #endif // has_include(<filesystem>)
23 #endif // __cpp_lib_filesystem
24 #endif // TOML11_DISABLE_STD_FILESYSTEM
25
26 // the previous commit works with 500+ recursions. so it may be too small.
27 // but in most cases, i think we don't need such a deep recursion of
28 // arrays or inline-tables.
29 #define TOML11_VALUE_RECURSION_LIMIT 64
30
31 namespace toml
32 {
33 namespace detail
34 {
35
36 inline result<std::pair<boolean, region>, std::string>
37 parse_boolean(location& loc)
38 {
39 const auto first = loc.iter();
40 if(const auto token = lex_boolean::invoke(loc))
41 {
42 const auto reg = token.unwrap();
43 if (reg.str() == "true") {return ok(std::make_pair(true, reg));}
44 else if(reg.str() == "false") {return ok(std::make_pair(false, reg));}
45 else // internal error.
46 {
47 throw internal_error(format_underline(
48 "toml::parse_boolean: internal error",
49 {{source_location(reg), "invalid token"}}),
50 source_location(reg));
51 }
52 }
53 loc.reset(first); //rollback
54 return err(format_underline("toml::parse_boolean: ",
55 {{source_location(loc), "the next token is not a boolean"}}));
56 }
57
58 inline result<std::pair<integer, region>, std::string>
59 parse_binary_integer(location& loc)
60 {
61 const auto first = loc.iter();
62 if(const auto token = lex_bin_int::invoke(loc))
63 {
64 auto str = token.unwrap().str();
65 assert(str.size() > 2); // minimum -> 0b1
66 assert(str.at(0) == '0' && str.at(1) == 'b');
67
68 // skip all the zeros and `_` locating at the MSB
69 str.erase(str.begin(), std::find_if(
70 str.begin() + 2, // to skip prefix `0b`
71 str.end(),
72 [](const char c) { return c == '1'; })
73 );
74 assert(str.empty() || str.front() == '1');
75
76 // since toml11 uses int64_t, 64bit (unsigned) input cannot be read.
77 const auto max_length = 63 + std::count(str.begin(), str.end(), '_');
78 if(static_cast<std::string::size_type>(max_length) < str.size())
79 {
80 loc.reset(first);
81 return err(format_underline("toml::parse_binary_integer: "
82 "only signed 64bit integer is available",
83 {{source_location(loc), "too large input (> int64_t)"}}));
84 }
85
86 integer retval(0), base(1);
87 for(auto i(str.rbegin()), e(str.rend()); i!=e; ++i)
88 {
89 assert(base != 0); // means overflow, checked in the above code
90 if(*i == '1')
91 {
92 retval += base;
93 if( (std::numeric_limits<integer>::max)() / 2 < base )
94 {
95 base = 0;
96 }
97 base *= 2;
98 }
99 else if(*i == '0')
100 {
101 if( (std::numeric_limits<integer>::max)() / 2 < base )
102 {
103 base = 0;
104 }
105 base *= 2;
106 }
107 else if(*i == '_')
108 {
109 // do nothing.
110 }
111 else // should be detected by lex_bin_int. [[unlikely]]
112 {
113 throw internal_error(format_underline(
114 "toml::parse_binary_integer: internal error",
115 {{source_location(token.unwrap()), "invalid token"}}),
116 source_location(loc));
117 }
118 }
119 return ok(std::make_pair(retval, token.unwrap()));
120 }
121 loc.reset(first);
122 return err(format_underline("toml::parse_binary_integer:",
123 {{source_location(loc), "the next token is not an integer"}}));
124 }
125
126 inline result<std::pair<integer, region>, std::string>
127 parse_octal_integer(location& loc)
128 {
129 const auto first = loc.iter();
130 if(const auto token = lex_oct_int::invoke(loc))
131 {
132 auto str = token.unwrap().str();
133 str.erase(std::remove(str.begin(), str.end(), '_'), str.end());
134 str.erase(str.begin()); str.erase(str.begin()); // remove `0o` prefix
135
136 std::istringstream iss(str);
137 integer retval(0);
138 iss >> std::oct >> retval;
139 if(iss.fail())
140 {
141 // `istream` sets `failbit` if internally-called `std::num_get::get`
142 // fails.
143 // `std::num_get::get` calls `std::strtoll` if the argument type is
144 // signed.
145 // `std::strtoll` fails if
146 // - the value is out_of_range or
147 // - no conversion is possible.
148 // since we already checked that the string is valid octal integer,
149 // so the error reason is out_of_range.
150 loc.reset(first);
151 return err(format_underline("toml::parse_octal_integer:",
152 {{source_location(loc), "out of range"}}));
153 }
154 return ok(std::make_pair(retval, token.unwrap()));
155 }
156 loc.reset(first);
157 return err(format_underline("toml::parse_octal_integer:",
158 {{source_location(loc), "the next token is not an integer"}}));
159 }
160
161 inline result<std::pair<integer, region>, std::string>
162 parse_hexadecimal_integer(location& loc)
163 {
164 const auto first = loc.iter();
165 if(const auto token = lex_hex_int::invoke(loc))
166 {
167 auto str = token.unwrap().str();
168 str.erase(std::remove(str.begin(), str.end(), '_'), str.end());
169 str.erase(str.begin()); str.erase(str.begin()); // remove `0x` prefix
170
171 std::istringstream iss(str);
172 integer retval(0);
173 iss >> std::hex >> retval;
174 if(iss.fail())
175 {
176 // see parse_octal_integer for detail of this error message.
177 loc.reset(first);
178 return err(format_underline("toml::parse_hexadecimal_integer:",
179 {{source_location(loc), "out of range"}}));
180 }
181 return ok(std::make_pair(retval, token.unwrap()));
182 }
183 loc.reset(first);
184 return err(format_underline("toml::parse_hexadecimal_integer",
185 {{source_location(loc), "the next token is not an integer"}}));
186 }
187
188 inline result<std::pair<integer, region>, std::string>
189 parse_integer(location& loc)
190 {
191 const auto first = loc.iter();
192 if(first != loc.end() && *first == '0')
193 {
194 const auto second = std::next(first);
195 if(second == loc.end()) // the token is just zero.
196 {
197 loc.advance();
198 return ok(std::make_pair(0, region(loc, first, second)));
199 }
200
201 if(*second == 'b') {return parse_binary_integer (loc);} // 0b1100
202 if(*second == 'o') {return parse_octal_integer (loc);} // 0o775
203 if(*second == 'x') {return parse_hexadecimal_integer(loc);} // 0xC0FFEE
204
205 if(std::isdigit(*second))
206 {
207 return err(format_underline("toml::parse_integer: "
208 "leading zero in an Integer is not allowed.",
209 {{source_location(loc), "leading zero"}}));
210 }
211 else if(std::isalpha(*second))
212 {
213 return err(format_underline("toml::parse_integer: "
214 "unknown integer prefix appeared.",
215 {{source_location(loc), "none of 0x, 0o, 0b"}}));
216 }
217 }
218
219 if(const auto token = lex_dec_int::invoke(loc))
220 {
221 auto str = token.unwrap().str();
222 str.erase(std::remove(str.begin(), str.end(), '_'), str.end());
223
224 std::istringstream iss(str);
225 integer retval(0);
226 iss >> retval;
227 if(iss.fail())
228 {
229 // see parse_octal_integer for detail of this error message.
230 loc.reset(first);
231 return err(format_underline("toml::parse_integer:",
232 {{source_location(loc), "out of range"}}));
233 }
234 return ok(std::make_pair(retval, token.unwrap()));
235 }
236 loc.reset(first);
237 return err(format_underline("toml::parse_integer: ",
238 {{source_location(loc), "the next token is not an integer"}}));
239 }
240
241 inline result<std::pair<floating, region>, std::string>
242 parse_floating(location& loc)
243 {
244 const auto first = loc.iter();
245 if(const auto token = lex_float::invoke(loc))
246 {
247 auto str = token.unwrap().str();
248 if(str == "inf" || str == "+inf")
249 {
250 if(std::numeric_limits<floating>::has_infinity)
251 {
252 return ok(std::make_pair(
253 std::numeric_limits<floating>::infinity(), token.unwrap()));
254 }
255 else
256 {
257 throw std::domain_error("toml::parse_floating: inf value found"
258 " but the current environment does not support inf. Please"
259 " make sure that the floating-point implementation conforms"
260 " IEEE 754/ISO 60559 international standard.");
261 }
262 }
263 else if(str == "-inf")
264 {
265 if(std::numeric_limits<floating>::has_infinity)
266 {
267 return ok(std::make_pair(
268 -std::numeric_limits<floating>::infinity(), token.unwrap()));
269 }
270 else
271 {
272 throw std::domain_error("toml::parse_floating: inf value found"
273 " but the current environment does not support inf. Please"
274 " make sure that the floating-point implementation conforms"
275 " IEEE 754/ISO 60559 international standard.");
276 }
277 }
278 else if(str == "nan" || str == "+nan")
279 {
280 if(std::numeric_limits<floating>::has_quiet_NaN)
281 {
282 return ok(std::make_pair(
283 std::numeric_limits<floating>::quiet_NaN(), token.unwrap()));
284 }
285 else if(std::numeric_limits<floating>::has_signaling_NaN)
286 {
287 return ok(std::make_pair(
288 std::numeric_limits<floating>::signaling_NaN(), token.unwrap()));
289 }
290 else
291 {
292 throw std::domain_error("toml::parse_floating: NaN value found"
293 " but the current environment does not support NaN. Please"
294 " make sure that the floating-point implementation conforms"
295 " IEEE 754/ISO 60559 international standard.");
296 }
297 }
298 else if(str == "-nan")
299 {
300 if(std::numeric_limits<floating>::has_quiet_NaN)
301 {
302 return ok(std::make_pair(
303 -std::numeric_limits<floating>::quiet_NaN(), token.unwrap()));
304 }
305 else if(std::numeric_limits<floating>::has_signaling_NaN)
306 {
307 return ok(std::make_pair(
308 -std::numeric_limits<floating>::signaling_NaN(), token.unwrap()));
309 }
310 else
311 {
312 throw std::domain_error("toml::parse_floating: NaN value found"
313 " but the current environment does not support NaN. Please"
314 " make sure that the floating-point implementation conforms"
315 " IEEE 754/ISO 60559 international standard.");
316 }
317 }
318 str.erase(std::remove(str.begin(), str.end(), '_'), str.end());
319 std::istringstream iss(str);
320 floating v(0.0);
321 iss >> v;
322 if(iss.fail())
323 {
324 // see parse_octal_integer for detail of this error message.
325 loc.reset(first);
326 return err(format_underline("toml::parse_floating:",
327 {{source_location(loc), "out of range"}}));
328 }
329 return ok(std::make_pair(v, token.unwrap()));
330 }
331 loc.reset(first);
332 return err(format_underline("toml::parse_floating: ",
333 {{source_location(loc), "the next token is not a float"}}));
334 }
335
336 inline std::string read_utf8_codepoint(const region& reg, const location& loc)
337 {
338 const auto str = reg.str().substr(1);
339 std::uint_least32_t codepoint;
340 std::istringstream iss(str);
341 iss >> std::hex >> codepoint;
342
343 const auto to_char = [](const std::uint_least32_t i) noexcept -> char {
344 const auto uc = static_cast<unsigned char>(i);
345 return *reinterpret_cast<const char*>(std::addressof(uc));
346 };
347
348 std::string character;
349 if(codepoint < 0x80) // U+0000 ... U+0079 ; just an ASCII.
350 {
351 character += static_cast<char>(codepoint);
352 }
353 else if(codepoint < 0x800) //U+0080 ... U+07FF
354 {
355 // 110yyyyx 10xxxxxx; 0x3f == 0b0011'1111
356 character += to_char(0xC0| codepoint >> 6);
357 character += to_char(0x80|(codepoint & 0x3F));
358 }
359 else if(codepoint < 0x10000) // U+0800...U+FFFF
360 {
361 if(0xD800 <= codepoint && codepoint <= 0xDFFF)
362 {
363 throw syntax_error(format_underline(
364 "toml::read_utf8_codepoint: codepoints in the range "
365 "[0xD800, 0xDFFF] are not valid UTF-8.", {{
366 source_location(loc), "not a valid UTF-8 codepoint"
367 }}), source_location(loc));
368 }
369 assert(codepoint < 0xD800 || 0xDFFF < codepoint);
370 // 1110yyyy 10yxxxxx 10xxxxxx
371 character += to_char(0xE0| codepoint >> 12);
372 character += to_char(0x80|(codepoint >> 6 & 0x3F));
373 character += to_char(0x80|(codepoint & 0x3F));
374 }
375 else if(codepoint < 0x110000) // U+010000 ... U+10FFFF
376 {
377 // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
378 character += to_char(0xF0| codepoint >> 18);
379 character += to_char(0x80|(codepoint >> 12 & 0x3F));
380 character += to_char(0x80|(codepoint >> 6 & 0x3F));
381 character += to_char(0x80|(codepoint & 0x3F));
382 }
383 else // out of UTF-8 region
384 {
385 throw syntax_error(format_underline("toml::read_utf8_codepoint:"
386 " input codepoint is too large.",
387 {{source_location(loc), "should be in [0x00..0x10FFFF]"}}),
388 source_location(loc));
389 }
390 return character;
391 }
392
393 inline result<std::string, std::string> parse_escape_sequence(location& loc)
394 {
395 const auto first = loc.iter();
396 if(first == loc.end() || *first != '\\')
397 {
398 return err(format_underline("toml::parse_escape_sequence: ", {{
399 source_location(loc), "the next token is not a backslash \"\\\""}}));
400 }
401 loc.advance();
402 switch(*loc.iter())
403 {
404 case '\\':{loc.advance(); return ok(std::string("\\"));}
405 case '"' :{loc.advance(); return ok(std::string("\""));}
406 case 'b' :{loc.advance(); return ok(std::string("\b"));}
407 case 't' :{loc.advance(); return ok(std::string("\t"));}
408 case 'n' :{loc.advance(); return ok(std::string("\n"));}
409 case 'f' :{loc.advance(); return ok(std::string("\f"));}
410 case 'r' :{loc.advance(); return ok(std::string("\r"));}
411 #ifdef TOML11_USE_UNRELEASED_TOML_FEATURES
412 case 'e' :{loc.advance(); return ok(std::string("\x1b"));} // ESC
413 #endif
414 case 'u' :
415 {
416 if(const auto token = lex_escape_unicode_short::invoke(loc))
417 {
418 return ok(read_utf8_codepoint(token.unwrap(), loc));
419 }
420 else
421 {
422 return err(format_underline("parse_escape_sequence: "
423 "invalid token found in UTF-8 codepoint uXXXX.",
424 {{source_location(loc), "here"}}));
425 }
426 }
427 case 'U':
428 {
429 if(const auto token = lex_escape_unicode_long::invoke(loc))
430 {
431 return ok(read_utf8_codepoint(token.unwrap(), loc));
432 }
433 else
434 {
435 return err(format_underline("parse_escape_sequence: "
436 "invalid token found in UTF-8 codepoint Uxxxxxxxx",
437 {{source_location(loc), "here"}}));
438 }
439 }
440 }
441
442 const auto msg = format_underline("parse_escape_sequence: "
443 "unknown escape sequence appeared.", {{source_location(loc),
444 "escape sequence is one of \\, \", b, t, n, f, r, uxxxx, Uxxxxxxxx"}},
445 /* Hints = */{"if you want to write backslash as just one backslash, "
446 "use literal string like: regex = '<\\i\\c*\\s*>'"});
447 loc.reset(first);
448 return err(msg);
449 }
450
451 inline std::ptrdiff_t check_utf8_validity(const std::string& reg)
452 {
453 location loc("tmp", reg);
454 const auto u8 = repeat<lex_utf8_code, unlimited>::invoke(loc);
455 if(!u8 || loc.iter() != loc.end())
456 {
457 const auto error_location = std::distance(loc.begin(), loc.iter());
458 assert(0 <= error_location);
459 return error_location;
460 }
461 return -1;
462 }
463
464 inline result<std::pair<toml::string, region>, std::string>
465 parse_ml_basic_string(location& loc)
466 {
467 const auto first = loc.iter();
468 if(const auto token = lex_ml_basic_string::invoke(loc))
469 {
470 auto inner_loc = loc;
471 inner_loc.reset(first);
472
473 std::string retval;
474 retval.reserve(token.unwrap().size());
475
476 auto delim = lex_ml_basic_string_open::invoke(inner_loc);
477 if(!delim)
478 {
479 throw internal_error(format_underline(
480 "parse_ml_basic_string: invalid token",
481 {{source_location(inner_loc), "should be \"\"\""}}),
482 source_location(inner_loc));
483 }
484 // immediate newline is ignored (if exists)
485 /* discard return value */ lex_newline::invoke(inner_loc);
486
487 delim = none();
488 while(!delim)
489 {
490 using lex_unescaped_seq = repeat<
491 either<lex_ml_basic_unescaped, lex_newline>, unlimited>;
492 if(auto unescaped = lex_unescaped_seq::invoke(inner_loc))
493 {
494 retval += unescaped.unwrap().str();
495 }
496 if(auto escaped = parse_escape_sequence(inner_loc))
497 {
498 retval += escaped.unwrap();
499 }
500 if(auto esc_nl = lex_ml_basic_escaped_newline::invoke(inner_loc))
501 {
502 // ignore newline after escape until next non-ws char
503 }
504 if(inner_loc.iter() == inner_loc.end())
505 {
506 throw internal_error(format_underline(
507 "parse_ml_basic_string: unexpected end of region",
508 {{source_location(inner_loc), "not sufficient token"}}),
509 source_location(inner_loc));
510 }
511 delim = lex_ml_basic_string_close::invoke(inner_loc);
512 }
513 // `lex_ml_basic_string_close` allows 3 to 5 `"`s to allow 1 or 2 `"`s
514 // at just before the delimiter. Here, we need to attach `"`s at the
515 // end of the string body, if it exists.
516 // For detail, see the definition of `lex_ml_basic_string_close`.
517 assert(std::all_of(delim.unwrap().first(), delim.unwrap().last(),
518 [](const char c) noexcept {return c == '\"';}));
519 switch(delim.unwrap().size())
520 {
521 case 3: {break;}
522 case 4: {retval += "\""; break;}
523 case 5: {retval += "\"\""; break;}
524 default:
525 {
526 throw internal_error(format_underline(
527 "parse_ml_basic_string: closing delimiter has invalid length",
528 {{source_location(inner_loc), "end of this"}}),
529 source_location(inner_loc));
530 }
531 }
532
533 const auto err_loc = check_utf8_validity(token.unwrap().str());
534 if(err_loc == -1)
535 {
536 return ok(std::make_pair(toml::string(retval), token.unwrap()));
537 }
538 else
539 {
540 inner_loc.reset(first);
541 inner_loc.advance(err_loc);
542 throw syntax_error(format_underline(
543 "parse_ml_basic_string: invalid utf8 sequence found",
544 {{source_location(inner_loc), "here"}}),
545 source_location(inner_loc));
546 }
547 }
548 else
549 {
550 loc.reset(first);
551 return err(format_underline("toml::parse_ml_basic_string: "
552 "the next token is not a valid multiline string",
553 {{source_location(loc), "here"}}));
554 }
555 }
556
557 inline result<std::pair<toml::string, region>, std::string>
558 parse_basic_string(location& loc)
559 {
560 const auto first = loc.iter();
561 if(const auto token = lex_basic_string::invoke(loc))
562 {
563 auto inner_loc = loc;
564 inner_loc.reset(first);
565
566 auto quot = lex_quotation_mark::invoke(inner_loc);
567 if(!quot)
568 {
569 throw internal_error(format_underline("parse_basic_string: "
570 "invalid token", {{source_location(inner_loc), "should be \""}}),
571 source_location(inner_loc));
572 }
573
574 std::string retval;
575 retval.reserve(token.unwrap().size());
576
577 quot = none();
578 while(!quot)
579 {
580 using lex_unescaped_seq = repeat<lex_basic_unescaped, unlimited>;
581 if(auto unescaped = lex_unescaped_seq::invoke(inner_loc))
582 {
583 retval += unescaped.unwrap().str();
584 }
585 if(auto escaped = parse_escape_sequence(inner_loc))
586 {
587 retval += escaped.unwrap();
588 }
589 if(inner_loc.iter() == inner_loc.end())
590 {
591 throw internal_error(format_underline(
592 "parse_basic_string: unexpected end of region",
593 {{source_location(inner_loc), "not sufficient token"}}),
594 source_location(inner_loc));
595 }
596 quot = lex_quotation_mark::invoke(inner_loc);
597 }
598
599 const auto err_loc = check_utf8_validity(token.unwrap().str());
600 if(err_loc == -1)
601 {
602 return ok(std::make_pair(toml::string(retval), token.unwrap()));
603 }
604 else
605 {
606 inner_loc.reset(first);
607 inner_loc.advance(err_loc);
608 throw syntax_error(format_underline(
609 "parse_basic_string: invalid utf8 sequence found",
610 {{source_location(inner_loc), "here"}}),
611 source_location(inner_loc));
612 }
613 }
614 else
615 {
616 loc.reset(first); // rollback
617 return err(format_underline("toml::parse_basic_string: "
618 "the next token is not a valid string",
619 {{source_location(loc), "here"}}));
620 }
621 }
622
623 inline result<std::pair<toml::string, region>, std::string>
624 parse_ml_literal_string(location& loc)
625 {
626 const auto first = loc.iter();
627 if(const auto token = lex_ml_literal_string::invoke(loc))
628 {
629 auto inner_loc = loc;
630 inner_loc.reset(first);
631
632 const auto open = lex_ml_literal_string_open::invoke(inner_loc);
633 if(!open)
634 {
635 throw internal_error(format_underline(
636 "parse_ml_literal_string: invalid token",
637 {{source_location(inner_loc), "should be '''"}}),
638 source_location(inner_loc));
639 }
640 // immediate newline is ignored (if exists)
641 /* discard return value */ lex_newline::invoke(inner_loc);
642
643 const auto body = lex_ml_literal_body::invoke(inner_loc);
644
645 const auto close = lex_ml_literal_string_close::invoke(inner_loc);
646 if(!close)
647 {
648 throw internal_error(format_underline(
649 "parse_ml_literal_string: invalid token",
650 {{source_location(inner_loc), "should be '''"}}),
651 source_location(inner_loc));
652 }
653 // `lex_ml_literal_string_close` allows 3 to 5 `'`s to allow 1 or 2 `'`s
654 // at just before the delimiter. Here, we need to attach `'`s at the
655 // end of the string body, if it exists.
656 // For detail, see the definition of `lex_ml_basic_string_close`.
657
658 std::string retval = body.unwrap().str();
659 assert(std::all_of(close.unwrap().first(), close.unwrap().last(),
660 [](const char c) noexcept {return c == '\'';}));
661 switch(close.unwrap().size())
662 {
663 case 3: {break;}
664 case 4: {retval += "'"; break;}
665 case 5: {retval += "''"; break;}
666 default:
667 {
668 throw internal_error(format_underline(
669 "parse_ml_literal_string: closing delimiter has invalid length",
670 {{source_location(inner_loc), "end of this"}}),
671 source_location(inner_loc));
672 }
673 }
674
675 const auto err_loc = check_utf8_validity(token.unwrap().str());
676 if(err_loc == -1)
677 {
678 return ok(std::make_pair(toml::string(retval, toml::string_t::literal),
679 token.unwrap()));
680 }
681 else
682 {
683 inner_loc.reset(first);
684 inner_loc.advance(err_loc);
685 throw syntax_error(format_underline(
686 "parse_ml_literal_string: invalid utf8 sequence found",
687 {{source_location(inner_loc), "here"}}),
688 source_location(inner_loc));
689 }
690 }
691 else
692 {
693 loc.reset(first); // rollback
694 return err(format_underline("toml::parse_ml_literal_string: "
695 "the next token is not a valid multiline literal string",
696 {{source_location(loc), "here"}}));
697 }
698 }
699
700 inline result<std::pair<toml::string, region>, std::string>
701 parse_literal_string(location& loc)
702 {
703 const auto first = loc.iter();
704 if(const auto token = lex_literal_string::invoke(loc))
705 {
706 auto inner_loc = loc;
707 inner_loc.reset(first);
708
709 const auto open = lex_apostrophe::invoke(inner_loc);
710 if(!open)
711 {
712 throw internal_error(format_underline(
713 "parse_literal_string: invalid token",
714 {{source_location(inner_loc), "should be '"}}),
715 source_location(inner_loc));
716 }
717
718 const auto body = repeat<lex_literal_char, unlimited>::invoke(inner_loc);
719
720 const auto close = lex_apostrophe::invoke(inner_loc);
721 if(!close)
722 {
723 throw internal_error(format_underline(
724 "parse_literal_string: invalid token",
725 {{source_location(inner_loc), "should be '"}}),
726 source_location(inner_loc));
727 }
728
729 const auto err_loc = check_utf8_validity(token.unwrap().str());
730 if(err_loc == -1)
731 {
732 return ok(std::make_pair(
733 toml::string(body.unwrap().str(), toml::string_t::literal),
734 token.unwrap()));
735 }
736 else
737 {
738 inner_loc.reset(first);
739 inner_loc.advance(err_loc);
740 throw syntax_error(format_underline(
741 "parse_literal_string: invalid utf8 sequence found",
742 {{source_location(inner_loc), "here"}}),
743 source_location(inner_loc));
744 }
745 }
746 else
747 {
748 loc.reset(first); // rollback
749 return err(format_underline("toml::parse_literal_string: "
750 "the next token is not a valid literal string",
751 {{source_location(loc), "here"}}));
752 }
753 }
754
755 inline result<std::pair<toml::string, region>, std::string>
756 parse_string(location& loc)
757 {
758 if(loc.iter() != loc.end() && *(loc.iter()) == '"')
759 {
760 if(loc.iter() + 1 != loc.end() && *(loc.iter() + 1) == '"' &&
761 loc.iter() + 2 != loc.end() && *(loc.iter() + 2) == '"')
762 {
763 return parse_ml_basic_string(loc);
764 }
765 else
766 {
767 return parse_basic_string(loc);
768 }
769 }
770 else if(loc.iter() != loc.end() && *(loc.iter()) == '\'')
771 {
772 if(loc.iter() + 1 != loc.end() && *(loc.iter() + 1) == '\'' &&
773 loc.iter() + 2 != loc.end() && *(loc.iter() + 2) == '\'')
774 {
775 return parse_ml_literal_string(loc);
776 }
777 else
778 {
779 return parse_literal_string(loc);
780 }
781 }
782 return err(format_underline("toml::parse_string: ",
783 {{source_location(loc), "the next token is not a string"}}));
784 }
785
786 inline result<std::pair<local_date, region>, std::string>
787 parse_local_date(location& loc)
788 {
789 const auto first = loc.iter();
790 if(const auto token = lex_local_date::invoke(loc))
791 {
792 location inner_loc(loc.name(), token.unwrap().str());
793
794 const auto y = lex_date_fullyear::invoke(inner_loc);
795 if(!y || inner_loc.iter() == inner_loc.end() || *inner_loc.iter() != '-')
796 {
797 throw internal_error(format_underline(
798 "toml::parse_local_date: invalid year format",
799 {{source_location(inner_loc), "should be `-`"}}),
800 source_location(inner_loc));
801 }
802 inner_loc.advance();
803 const auto m = lex_date_month::invoke(inner_loc);
804 if(!m || inner_loc.iter() == inner_loc.end() || *inner_loc.iter() != '-')
805 {
806 throw internal_error(format_underline(
807 "toml::parse_local_date: invalid month format",
808 {{source_location(inner_loc), "should be `-`"}}),
809 source_location(inner_loc));
810 }
811 inner_loc.advance();
812 const auto d = lex_date_mday::invoke(inner_loc);
813 if(!d)
814 {
815 throw internal_error(format_underline(
816 "toml::parse_local_date: invalid day format",
817 {{source_location(inner_loc), "here"}}),
818 source_location(inner_loc));
819 }
820
821 const auto year = static_cast<std::int16_t>(from_string<int>(y.unwrap().str(), 0));
822 const auto month = static_cast<std::int8_t >(from_string<int>(m.unwrap().str(), 0));
823 const auto day = static_cast<std::int8_t >(from_string<int>(d.unwrap().str(), 0));
824
825 // We briefly check whether the input date is valid or not. But here, we
826 // only check if the RFC3339 compliance.
827 // Actually there are several special date that does not exist,
828 // because of historical reasons, such as 1582/10/5-1582/10/14 (only in
829 // several countries). But here, we do not care about such a complicated
830 // rule. It makes the code complicated and there is only low probability
831 // that such a specific date is needed in practice. If someone need to
832 // validate date accurately, that means that the one need a specialized
833 // library for their purpose in a different layer.
834 {
835 const bool is_leap = (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0));
836 const auto max_day = (month == 2) ? (is_leap ? 29 : 28) :
837 ((month == 4 || month == 6 || month == 9 || month == 11) ? 30 : 31);
838
839 if((month < 1 || 12 < month) || (day < 1 || max_day < day))
840 {
841 throw syntax_error(format_underline("toml::parse_date: "
842 "invalid date: it does not conform RFC3339.", {{
843 source_location(loc), "month should be 01-12, day should be"
844 " 01-28,29,30,31, depending on month/year."
845 }}), source_location(inner_loc));
846 }
847 }
848 return ok(std::make_pair(local_date(year, static_cast<month_t>(month - 1), day),
849 token.unwrap()));
850 }
851 else
852 {
853 loc.reset(first);
854 return err(format_underline("toml::parse_local_date: ",
855 {{source_location(loc), "the next token is not a local_date"}}));
856 }
857 }
858
859 inline result<std::pair<local_time, region>, std::string>
860 parse_local_time(location& loc)
861 {
862 const auto first = loc.iter();
863 if(const auto token = lex_local_time::invoke(loc))
864 {
865 location inner_loc(loc.name(), token.unwrap().str());
866
867 const auto h = lex_time_hour::invoke(inner_loc);
868 if(!h || inner_loc.iter() == inner_loc.end() || *inner_loc.iter() != ':')
869 {
870 throw internal_error(format_underline(
871 "toml::parse_local_time: invalid year format",
872 {{source_location(inner_loc), "should be `:`"}}),
873 source_location(inner_loc));
874 }
875 inner_loc.advance();
876 const auto m = lex_time_minute::invoke(inner_loc);
877 if(!m || inner_loc.iter() == inner_loc.end() || *inner_loc.iter() != ':')
878 {
879 throw internal_error(format_underline(
880 "toml::parse_local_time: invalid month format",
881 {{source_location(inner_loc), "should be `:`"}}),
882 source_location(inner_loc));
883 }
884 inner_loc.advance();
885 const auto s = lex_time_second::invoke(inner_loc);
886 if(!s)
887 {
888 throw internal_error(format_underline(
889 "toml::parse_local_time: invalid second format",
890 {{source_location(inner_loc), "here"}}),
891 source_location(inner_loc));
892 }
893
894 const int hour = from_string<int>(h.unwrap().str(), 0);
895 const int minute = from_string<int>(m.unwrap().str(), 0);
896 const int second = from_string<int>(s.unwrap().str(), 0);
897
898 if((hour < 0 || 23 < hour) || (minute < 0 || 59 < minute) ||
899 (second < 0 || 60 < second)) // it may be leap second
900 {
901 throw syntax_error(format_underline("toml::parse_local_time: "
902 "invalid time: it does not conform RFC3339.", {{
903 source_location(loc), "hour should be 00-23, minute should be"
904 " 00-59, second should be 00-60 (depending on the leap"
905 " second rules.)"}}), source_location(inner_loc));
906 }
907
908 local_time time(hour, minute, second, 0, 0);
909
910 const auto before_secfrac = inner_loc.iter();
911 if(const auto secfrac = lex_time_secfrac::invoke(inner_loc))
912 {
913 auto sf = secfrac.unwrap().str();
914 sf.erase(sf.begin()); // sf.front() == '.'
915 switch(sf.size() % 3)
916 {
917 case 2: sf += '0'; break;
918 case 1: sf += "00"; break;
919 case 0: break;
920 default: break;
921 }
922 if(sf.size() >= 9)
923 {
924 time.millisecond = from_string<std::uint16_t>(sf.substr(0, 3), 0u);
925 time.microsecond = from_string<std::uint16_t>(sf.substr(3, 3), 0u);
926 time.nanosecond = from_string<std::uint16_t>(sf.substr(6, 3), 0u);
927 }
928 else if(sf.size() >= 6)
929 {
930 time.millisecond = from_string<std::uint16_t>(sf.substr(0, 3), 0u);
931 time.microsecond = from_string<std::uint16_t>(sf.substr(3, 3), 0u);
932 }
933 else if(sf.size() >= 3)
934 {
935 time.millisecond = from_string<std::uint16_t>(sf, 0u);
936 time.microsecond = 0u;
937 }
938 }
939 else
940 {
941 if(before_secfrac != inner_loc.iter())
942 {
943 throw internal_error(format_underline(
944 "toml::parse_local_time: invalid subsecond format",
945 {{source_location(inner_loc), "here"}}),
946 source_location(inner_loc));
947 }
948 }
949 return ok(std::make_pair(time, token.unwrap()));
950 }
951 else
952 {
953 loc.reset(first);
954 return err(format_underline("toml::parse_local_time: ",
955 {{source_location(loc), "the next token is not a local_time"}}));
956 }
957 }
958
959 inline result<std::pair<local_datetime, region>, std::string>
960 parse_local_datetime(location& loc)
961 {
962 const auto first = loc.iter();
963 if(const auto token = lex_local_date_time::invoke(loc))
964 {
965 location inner_loc(loc.name(), token.unwrap().str());
966 const auto date = parse_local_date(inner_loc);
967 if(!date || inner_loc.iter() == inner_loc.end())
968 {
969 throw internal_error(format_underline(
970 "toml::parse_local_datetime: invalid datetime format",
971 {{source_location(inner_loc), "date, not datetime"}}),
972 source_location(inner_loc));
973 }
974 const char delim = *(inner_loc.iter());
975 if(delim != 'T' && delim != 't' && delim != ' ')
976 {
977 throw internal_error(format_underline(
978 "toml::parse_local_datetime: invalid datetime format",
979 {{source_location(inner_loc), "should be `T` or ` ` (space)"}}),
980 source_location(inner_loc));
981 }
982 inner_loc.advance();
983 const auto time = parse_local_time(inner_loc);
984 if(!time)
985 {
986 throw internal_error(format_underline(
987 "toml::parse_local_datetime: invalid datetime format",
988 {{source_location(inner_loc), "invalid time format"}}),
989 source_location(inner_loc));
990 }
991 return ok(std::make_pair(
992 local_datetime(date.unwrap().first, time.unwrap().first),
993 token.unwrap()));
994 }
995 else
996 {
997 loc.reset(first);
998 return err(format_underline("toml::parse_local_datetime: ",
999 {{source_location(loc), "the next token is not a local_datetime"}}));
1000 }
1001 }
1002
1003 inline result<std::pair<offset_datetime, region>, std::string>
1004 parse_offset_datetime(location& loc)
1005 {
1006 const auto first = loc.iter();
1007 if(const auto token = lex_offset_date_time::invoke(loc))
1008 {
1009 location inner_loc(loc.name(), token.unwrap().str());
1010 const auto datetime = parse_local_datetime(inner_loc);
1011 if(!datetime || inner_loc.iter() == inner_loc.end())
1012 {
1013 throw internal_error(format_underline(
1014 "toml::parse_offset_datetime: invalid datetime format",
1015 {{source_location(inner_loc), "date, not datetime"}}),
1016 source_location(inner_loc));
1017 }
1018 time_offset offset(0, 0);
1019 if(const auto ofs = lex_time_numoffset::invoke(inner_loc))
1020 {
1021 const auto str = ofs.unwrap().str();
1022
1023 const auto hour = from_string<int>(str.substr(1,2), 0);
1024 const auto minute = from_string<int>(str.substr(4,2), 0);
1025
1026 if((hour < 0 || 23 < hour) || (minute < 0 || 59 < minute))
1027 {
1028 throw syntax_error(format_underline("toml::parse_offset_datetime: "
1029 "invalid offset: it does not conform RFC3339.", {{
1030 source_location(loc), "month should be 01-12, day should be"
1031 " 01-28,29,30,31, depending on month/year."
1032 }}), source_location(inner_loc));
1033 }
1034
1035 if(str.front() == '+')
1036 {
1037 offset = time_offset(hour, minute);
1038 }
1039 else
1040 {
1041 offset = time_offset(-hour, -minute);
1042 }
1043 }
1044 else if(*inner_loc.iter() != 'Z' && *inner_loc.iter() != 'z')
1045 {
1046 throw internal_error(format_underline(
1047 "toml::parse_offset_datetime: invalid datetime format",
1048 {{source_location(inner_loc), "should be `Z` or `+HH:MM`"}}),
1049 source_location(inner_loc));
1050 }
1051 return ok(std::make_pair(offset_datetime(datetime.unwrap().first, offset),
1052 token.unwrap()));
1053 }
1054 else
1055 {
1056 loc.reset(first);
1057 return err(format_underline("toml::parse_offset_datetime: ",
1058 {{source_location(loc), "the next token is not a offset_datetime"}}));
1059 }
1060 }
1061
1062 inline result<std::pair<key, region>, std::string>
1063 parse_simple_key(location& loc)
1064 {
1065 if(const auto bstr = parse_basic_string(loc))
1066 {
1067 return ok(std::make_pair(bstr.unwrap().first.str, bstr.unwrap().second));
1068 }
1069 if(const auto lstr = parse_literal_string(loc))
1070 {
1071 return ok(std::make_pair(lstr.unwrap().first.str, lstr.unwrap().second));
1072 }
1073 if(const auto bare = lex_unquoted_key::invoke(loc))
1074 {
1075 const auto reg = bare.unwrap();
1076 return ok(std::make_pair(reg.str(), reg));
1077 }
1078 return err(format_underline("toml::parse_simple_key: ",
1079 {{source_location(loc), "the next token is not a simple key"}}));
1080 }
1081
1082 // dotted key become vector of keys
1083 inline result<std::pair<std::vector<key>, region>, std::string>
1084 parse_key(location& loc)
1085 {
1086 const auto first = loc.iter();
1087 // dotted key -> `foo.bar.baz` where several single keys are chained by
1088 // dots. Whitespaces between keys and dots are allowed.
1089 if(const auto token = lex_dotted_key::invoke(loc))
1090 {
1091 const auto reg = token.unwrap();
1092 location inner_loc(loc.name(), reg.str());
1093 std::vector<key> keys;
1094
1095 while(inner_loc.iter() != inner_loc.end())
1096 {
1097 lex_ws::invoke(inner_loc);
1098 if(const auto k = parse_simple_key(inner_loc))
1099 {
1100 keys.push_back(k.unwrap().first);
1101 }
1102 else
1103 {
1104 throw internal_error(format_underline(
1105 "toml::parse_key: dotted key contains invalid key",
1106 {{source_location(inner_loc), k.unwrap_err()}}),
1107 source_location(inner_loc));
1108 }
1109
1110 lex_ws::invoke(inner_loc);
1111 if(inner_loc.iter() == inner_loc.end())
1112 {
1113 break;
1114 }
1115 else if(*inner_loc.iter() == '.')
1116 {
1117 inner_loc.advance(); // to skip `.`
1118 }
1119 else
1120 {
1121 throw internal_error(format_underline("toml::parse_key: "
1122 "dotted key contains invalid key ",
1123 {{source_location(inner_loc), "should be `.`"}}),
1124 source_location(inner_loc));
1125 }
1126 }
1127 return ok(std::make_pair(keys, reg));
1128 }
1129 loc.reset(first);
1130
1131 // simple_key: a single (basic_string|literal_string|bare key)
1132 if(const auto smpl = parse_simple_key(loc))
1133 {
1134 return ok(std::make_pair(std::vector<key>(1, smpl.unwrap().first),
1135 smpl.unwrap().second));
1136 }
1137 return err(format_underline("toml::parse_key: an invalid key appeared.",
1138 {{source_location(loc), "is not a valid key"}}, {
1139 "bare keys : non-empty strings composed only of [A-Za-z0-9_-].",
1140 "quoted keys: same as \"basic strings\" or 'literal strings'.",
1141 "dotted keys: sequence of bare or quoted keys joined with a dot."
1142 }));
1143 }
1144
// forward-decl to implement parse_array and parse_table
// `n_rec` is the current nesting depth. parse_array and parse_inline_table
// compare it against TOML11_VALUE_RECURSION_LIMIT and hand `n_rec + 1` down
// when they parse the values they contain.
template<typename Value>
result<Value, std::string> parse_value(location&, const std::size_t n_rec);
1148
1149 template<typename Value>
1150 result<std::pair<typename Value::array_type, region>, std::string>
1151 parse_array(location& loc, const std::size_t n_rec)
1152 {
1153 using value_type = Value;
1154 using array_type = typename value_type::array_type;
1155
1156 if(n_rec > TOML11_VALUE_RECURSION_LIMIT)
1157 {
1158 // parse_array does not have any way to handle recursive error currently...
1159 throw syntax_error(std::string("toml::parse_array: recursion limit ("
1160 TOML11_STRINGIZE(TOML11_VALUE_RECURSION_LIMIT) ") exceeded"),
1161 source_location(loc));
1162 }
1163
1164 const auto first = loc.iter();
1165 if(loc.iter() == loc.end())
1166 {
1167 return err("toml::parse_array: input is empty");
1168 }
1169 if(*loc.iter() != '[')
1170 {
1171 return err("toml::parse_array: token is not an array");
1172 }
1173 loc.advance();
1174
1175 using lex_ws_comment_newline = repeat<
1176 either<lex_wschar, lex_newline, lex_comment>, unlimited>;
1177
1178 array_type retval;
1179 while(loc.iter() != loc.end())
1180 {
1181 lex_ws_comment_newline::invoke(loc); // skip
1182
1183 if(loc.iter() != loc.end() && *loc.iter() == ']')
1184 {
1185 loc.advance(); // skip ']'
1186 return ok(std::make_pair(retval,
1187 region(loc, first, loc.iter())));
1188 }
1189
1190 if(auto val = parse_value<value_type>(loc, n_rec+1))
1191 {
1192 // After TOML v1.0.0-rc.1, array becomes to be able to have values
1193 // with different types. So here we will omit this by default.
1194 //
1195 // But some of the test-suite checks if the parser accepts a hetero-
1196 // geneous arrays, so we keep this for a while.
1197 #ifdef TOML11_DISALLOW_HETEROGENEOUS_ARRAYS
1198 if(!retval.empty() && retval.front().type() != val.as_ok().type())
1199 {
1200 auto array_start_loc = loc;
1201 array_start_loc.reset(first);
1202
1203 throw syntax_error(format_underline("toml::parse_array: "
1204 "type of elements should be the same each other.", {
1205 {source_location(array_start_loc), "array starts here"},
1206 {
1207 retval.front().location(),
1208 "value has type " + stringize(retval.front().type())
1209 },
1210 {
1211 val.unwrap().location(),
1212 "value has different type, " + stringize(val.unwrap().type())
1213 }
1214 }), source_location(loc));
1215 }
1216 #endif
1217 retval.push_back(std::move(val.unwrap()));
1218 }
1219 else
1220 {
1221 auto array_start_loc = loc;
1222 array_start_loc.reset(first);
1223
1224 throw syntax_error(format_underline("toml::parse_array: "
1225 "value having invalid format appeared in an array", {
1226 {source_location(array_start_loc), "array starts here"},
1227 {source_location(loc), "it is not a valid value."}
1228 }), source_location(loc));
1229 }
1230
1231 using lex_array_separator = sequence<maybe<lex_ws_comment_newline>, character<','>>;
1232 const auto sp = lex_array_separator::invoke(loc);
1233 if(!sp)
1234 {
1235 lex_ws_comment_newline::invoke(loc);
1236 if(loc.iter() != loc.end() && *loc.iter() == ']')
1237 {
1238 loc.advance(); // skip ']'
1239 return ok(std::make_pair(retval,
1240 region(loc, first, loc.iter())));
1241 }
1242 else
1243 {
1244 auto array_start_loc = loc;
1245 array_start_loc.reset(first);
1246
1247 throw syntax_error(format_underline("toml::parse_array:"
1248 " missing array separator `,` after a value", {
1249 {source_location(array_start_loc), "array starts here"},
1250 {source_location(loc), "should be `,`"}
1251 }), source_location(loc));
1252 }
1253 }
1254 }
1255 loc.reset(first);
1256 throw syntax_error(format_underline("toml::parse_array: "
1257 "array did not closed by `]`",
1258 {{source_location(loc), "should be closed"}}),
1259 source_location(loc));
1260 }
1261
1262 template<typename Value>
1263 result<std::pair<std::pair<std::vector<key>, region>, Value>, std::string>
1264 parse_key_value_pair(location& loc, const std::size_t n_rec)
1265 {
1266 using value_type = Value;
1267
1268 const auto first = loc.iter();
1269 auto key_reg = parse_key(loc);
1270 if(!key_reg)
1271 {
1272 std::string msg = std::move(key_reg.unwrap_err());
1273 // if the next token is keyvalue-separator, it means that there are no
1274 // key. then we need to show error as "empty key is not allowed".
1275 if(const auto keyval_sep = lex_keyval_sep::invoke(loc))
1276 {
1277 loc.reset(first);
1278 msg = format_underline("toml::parse_key_value_pair: "
1279 "empty key is not allowed.",
1280 {{source_location(loc), "key expected before '='"}});
1281 }
1282 return err(std::move(msg));
1283 }
1284
1285 const auto kvsp = lex_keyval_sep::invoke(loc);
1286 if(!kvsp)
1287 {
1288 std::string msg;
1289 // if the line contains '=' after the invalid sequence, possibly the
1290 // error is in the key (like, invalid character in bare key).
1291 const auto line_end = std::find(loc.iter(), loc.end(), '\n');
1292 if(std::find(loc.iter(), line_end, '=') != line_end)
1293 {
1294 msg = format_underline("toml::parse_key_value_pair: "
1295 "invalid format for key",
1296 {{source_location(loc), "invalid character in key"}},
1297 {"Did you forget '.' to separate dotted-key?",
1298 "Allowed characters for bare key are [0-9a-zA-Z_-]."});
1299 }
1300 else // if not, the error is lack of key-value separator.
1301 {
1302 msg = format_underline("toml::parse_key_value_pair: "
1303 "missing key-value separator `=`",
1304 {{source_location(loc), "should be `=`"}});
1305 }
1306 loc.reset(first);
1307 return err(std::move(msg));
1308 }
1309
1310 const auto after_kvsp = loc.iter(); // err msg
1311 auto val = parse_value<value_type>(loc, n_rec);
1312 if(!val)
1313 {
1314 std::string msg;
1315 loc.reset(after_kvsp);
1316 // check there is something not a comment/whitespace after `=`
1317 if(sequence<maybe<lex_ws>, maybe<lex_comment>, lex_newline>::invoke(loc))
1318 {
1319 loc.reset(after_kvsp);
1320 msg = format_underline("toml::parse_key_value_pair: "
1321 "missing value after key-value separator '='",
1322 {{source_location(loc), "expected value, but got nothing"}});
1323 }
1324 else // there is something not a comment/whitespace, so invalid format.
1325 {
1326 msg = std::move(val.unwrap_err());
1327 }
1328 loc.reset(first);
1329 return err(msg);
1330 }
1331 return ok(std::make_pair(std::move(key_reg.unwrap()),
1332 std::move(val.unwrap())));
1333 }
1334
1335 // for error messages.
1336 template<typename InputIterator>
1337 std::string format_dotted_keys(InputIterator first, const InputIterator last)
1338 {
1339 static_assert(std::is_same<key,
1340 typename std::iterator_traits<InputIterator>::value_type>::value,"");
1341
1342 std::string retval(*first++);
1343 for(; first != last; ++first)
1344 {
1345 retval += '.';
1346 retval += *first;
1347 }
1348 return retval;
1349 }
1350
// forward decl for is_valid_forward_table_definition
// It re-parses the region text stored in existing values with these parsers
// to find out how a table was originally written.
result<std::pair<std::vector<key>, region>, std::string>
parse_table_key(location& loc);
result<std::pair<std::vector<key>, region>, std::string>
parse_array_table_key(location& loc);
// `n_rec` is the nesting depth checked against TOML11_VALUE_RECURSION_LIMIT.
template<typename Value>
result<std::pair<typename Value::table_type, region>, std::string>
parse_inline_table(location& loc, const std::size_t n_rec);
1359
1360 // The following toml file is allowed.
1361 // ```toml
1362 // [a.b.c] # here, table `a` has element `b`.
1363 // foo = "bar"
1364 // [a] # merge a = {baz = "qux"} to a = {b = {...}}
1365 // baz = "qux"
1366 // ```
1367 // But the following is not allowed.
1368 // ```toml
1369 // [a]
1370 // b.c.foo = "bar"
1371 // [a] # error! the same table [a] defined!
1372 // baz = "qux"
1373 // ```
1374 // The following is neither allowed.
1375 // ```toml
1376 // a = { b.c.foo = "bar"}
1377 // [a] # error! the same table [a] defined!
1378 // baz = "qux"
1379 // ```
1380 // Here, it parses region of `tab->at(k)` as a table key and check the depth
1381 // of the key. If the key region points deeper node, it would be allowed.
1382 // Otherwise, the key points the same node. It would be rejected.
1383 template<typename Value, typename Iterator>
1384 bool is_valid_forward_table_definition(const Value& fwd, const Value& inserting,
1385 Iterator key_first, Iterator key_curr, Iterator key_last)
1386 {
1387 // ------------------------------------------------------------------------
1388 // check type of the value to be inserted/merged
1389
1390 std::string inserting_reg = "";
1391 if(const auto ptr = detail::get_region(inserting))
1392 {
1393 inserting_reg = ptr->str();
1394 }
1395 location inserting_def("internal", std::move(inserting_reg));
1396 if(const auto inlinetable = parse_inline_table<Value>(inserting_def, 0))
1397 {
1398 // check if we are overwriting existing table.
1399 // ```toml
1400 // # NG
1401 // a.b = 42
1402 // a = {d = 3.14}
1403 // ```
1404 // Inserting an inline table to a existing super-table is not allowed in
1405 // any case. If we found it, we can reject it without further checking.
1406 return false;
1407 }
1408
1409 // Valid and invalid cases when inserting to the [a.b] table:
1410 //
1411 // ## Invalid
1412 //
1413 // ```toml
1414 // # invalid
1415 // [a]
1416 // b.c.d = "foo"
1417 // [a.b] # a.b is already defined and closed
1418 // d = "bar"
1419 // ```
1420 // ```toml
1421 // # invalid
1422 // a = {b.c.d = "foo"}
1423 // [a.b] # a is already defined and inline table is closed
1424 // d = "bar"
1425 // ```
1426 // ```toml
1427 // # invalid
1428 // a.b.c.d = "foo"
1429 // [a.b] # a.b is already defined and dotted-key table is closed
1430 // d = "bar"
1431 // ```
1432 //
1433 // ## Valid
1434 //
1435 // ```toml
1436 // # OK. a.b is defined, but is *overwritable*
1437 // [a.b.c]
1438 // d = "foo"
1439 // [a.b]
1440 // d = "bar"
1441 // ```
1442 // ```toml
1443 // # OK. a.b is defined, but is *overwritable*
1444 // [a]
1445 // b.c.d = "foo"
1446 // b.e = "bar"
1447 // ```
1448
1449 // ------------------------------------------------------------------------
1450 // check table defined before
1451
1452 std::string internal = "";
1453 if(const auto ptr = detail::get_region(fwd))
1454 {
1455 internal = ptr->str();
1456 }
1457 location def("internal", std::move(internal));
1458 if(const auto tabkeys = parse_table_key(def)) // [table.key]
1459 {
1460 // table keys always contains all the nodes from the root.
1461 const auto& tks = tabkeys.unwrap().first;
1462 if(std::size_t(std::distance(key_first, key_last)) == tks.size() &&
1463 std::equal(tks.begin(), tks.end(), key_first))
1464 {
1465 // the keys are equivalent. it is not allowed.
1466 return false;
1467 }
1468 // the keys are not equivalent. it is allowed.
1469 return true;
1470 }
1471 // nested array-of-table definition implicitly defines tables.
1472 // those tables can be reopened.
1473 if(const auto atabkeys = parse_array_table_key(def))
1474 {
1475 // table keys always contains all the nodes from the root.
1476 const auto& tks = atabkeys.unwrap().first;
1477 if(std::size_t(std::distance(key_first, key_last)) == tks.size() &&
1478 std::equal(tks.begin(), tks.end(), key_first))
1479 {
1480 // the keys are equivalent. it is not allowed.
1481 return false;
1482 }
1483 // the keys are not equivalent. it is allowed.
1484 return true;
1485 }
1486 if(const auto dotkeys = parse_key(def)) // a.b.c = "foo"
1487 {
1488 // consider the following case.
1489 // [a]
1490 // b.c = {d = 42}
1491 // [a.b.c]
1492 // e = 2.71
1493 // this defines the table [a.b.c] twice. no?
1494 if(const auto reopening_dotkey_by_table = parse_table_key(inserting_def))
1495 {
1496 // re-opening a dotkey-defined table by a table is invalid.
1497 // only dotkey can append a key-val. Like:
1498 // ```toml
1499 // a.b.c = "foo"
1500 // a.b.d = "bar" # OK. reopen `a.b` by dotkey
1501 // [a.b]
1502 // e = "bar" # Invalid. re-opening `a.b` by [a.b] is not allowed.
1503 // ```
1504 return false;
1505 }
1506
1507 // a dotted key starts from the node representing a table in which the
1508 // dotted key belongs to.
1509 const auto& dks = dotkeys.unwrap().first;
1510 if(std::size_t(std::distance(key_curr, key_last)) == dks.size() &&
1511 std::equal(dks.begin(), dks.end(), key_curr))
1512 {
1513 // the keys are equivalent. it is not allowed.
1514 return false;
1515 }
1516 // the keys are not equivalent. it is allowed.
1517 return true;
1518 }
1519 return false;
1520 }
1521
1522 template<typename Value, typename InputIterator>
1523 result<bool, std::string>
1524 insert_nested_key(typename Value::table_type& root, const Value& v,
1525 InputIterator iter, const InputIterator last,
1526 region key_reg,
1527 const bool is_array_of_table = false)
1528 {
1529 static_assert(std::is_same<key,
1530 typename std::iterator_traits<InputIterator>::value_type>::value,"");
1531
1532 using value_type = Value;
1533 using table_type = typename value_type::table_type;
1534 using array_type = typename value_type::array_type;
1535
1536 const auto first = iter;
1537 assert(iter != last);
1538
1539 table_type* tab = std::addressof(root);
1540 for(; iter != last; ++iter) // search recursively
1541 {
1542 const key& k = *iter;
1543 if(std::next(iter) == last) // k is the last key
1544 {
1545 // XXX if the value is array-of-tables, there can be several
1546 // tables that are in the same array. in that case, we need to
1547 // find the last element and insert it to there.
1548 if(is_array_of_table)
1549 {
1550 if(tab->count(k) == 1) // there is already an array of table
1551 {
1552 if(tab->at(k).is_table())
1553 {
1554 // show special err msg for conflicting table
1555 throw syntax_error(format_underline(concat_to_string(
1556 "toml::insert_value: array of table (\"",
1557 format_dotted_keys(first, last),
1558 "\") cannot be defined"), {
1559 {tab->at(k).location(), "table already defined"},
1560 {v.location(), "this conflicts with the previous table"}
1561 }), v.location());
1562 }
1563 else if(!(tab->at(k).is_array()))
1564 {
1565 throw syntax_error(format_underline(concat_to_string(
1566 "toml::insert_value: array of table (\"",
1567 format_dotted_keys(first, last), "\") collides with"
1568 " existing value"), {
1569 {tab->at(k).location(),
1570 concat_to_string("this ", tab->at(k).type(),
1571 " value already exists")},
1572 {v.location(),
1573 "while inserting this array-of-tables"}
1574 }), v.location());
1575 }
1576 // the above if-else-if checks tab->at(k) is an array
1577 auto& a = tab->at(k).as_array();
1578 // If table element is defined as [[array_of_tables]], it
1579 // cannot be an empty array. If an array of tables is
1580 // defined as `aot = []`, it cannot be appended.
1581 if(a.empty() || !(a.front().is_table()))
1582 {
1583 throw syntax_error(format_underline(concat_to_string(
1584 "toml::insert_value: array of table (\"",
1585 format_dotted_keys(first, last), "\") collides with"
1586 " existing value"), {
1587 {tab->at(k).location(),
1588 concat_to_string("this ", tab->at(k).type(),
1589 " value already exists")},
1590 {v.location(),
1591 "while inserting this array-of-tables"}
1592 }), v.location());
1593 }
1594 // avoid conflicting array of table like the following.
1595 // ```toml
1596 // a = [{b = 42}] # define a as an array of *inline* tables
1597 // [[a]] # a is an array of *multi-line* tables
1598 // b = 54
1599 // ```
1600 // Here, from the type information, these cannot be detected
1601 // because inline table is also a table.
1602 // But toml v0.5.0 explicitly says it is invalid. The above
1603 // array-of-tables has a static size and appending to the
1604 // array is invalid.
1605 // In this library, multi-line table value has a region
1606 // that points to the key of the table (e.g. [[a]]). By
1607 // comparing the first two letters in key, we can detect
1608 // the array-of-table is inline or multiline.
1609 if(const auto ptr = detail::get_region(a.front()))
1610 {
1611 if(ptr->str().substr(0,2) != "[[")
1612 {
1613 throw syntax_error(format_underline(concat_to_string(
1614 "toml::insert_value: array of table (\"",
1615 format_dotted_keys(first, last), "\") collides "
1616 "with existing array-of-tables"), {
1617 {tab->at(k).location(),
1618 concat_to_string("this ", tab->at(k).type(),
1619 " value has static size")},
1620 {v.location(),
1621 "appending it to the statically sized array"}
1622 }), v.location());
1623 }
1624 }
1625 a.push_back(v);
1626 return ok(true);
1627 }
1628 else // if not, we need to create the array of table
1629 {
1630 // XXX: Consider the following array of tables.
1631 // ```toml
1632 // # This is a comment.
1633 // [[aot]]
1634 // foo = "bar"
1635 // ```
1636 // Here, the comment is for `aot`. But here, actually two
1637 // values are defined. An array that contains tables, named
1638 // `aot`, and the 0th element of the `aot`, `{foo = "bar"}`.
1639 // Those two are different from each other. But both of them
1640 // points to the same portion of the TOML file, `[[aot]]`,
1641 // so `key_reg.comments()` returns `# This is a comment`.
1642 // If it is assigned as a comment of `aot` defined here, the
1643 // comment will be duplicated. Both the `aot` itself and
1644 // the 0-th element will have the same comment. This causes
1645 // "duplication of the same comments" bug when the data is
1646 // serialized.
1647 // Next, consider the following.
1648 // ```toml
1649 // # comment 1
1650 // aot = [
1651 // # comment 2
1652 // {foo = "bar"},
1653 // ]
1654 // ```
1655 // In this case, we can distinguish those two comments. So
1656 // here we need to add "comment 1" to the `aot` and
1657 // "comment 2" to the 0th element of that.
1658 // To distinguish those two, we check the key region.
1659 std::vector<std::string> comments{/* empty by default */};
1660 if(key_reg.str().substr(0, 2) != "[[")
1661 {
1662 comments = key_reg.comments();
1663 }
1664 value_type aot(array_type(1, v), key_reg, std::move(comments));
1665 tab->insert(std::make_pair(k, aot));
1666 return ok(true);
1667 }
1668 } // end if(array of table)
1669
1670 if(tab->count(k) == 1)
1671 {
1672 if(tab->at(k).is_table() && v.is_table())
1673 {
1674 if(!is_valid_forward_table_definition(
1675 tab->at(k), v, first, iter, last))
1676 {
1677 throw syntax_error(format_underline(concat_to_string(
1678 "toml::insert_value: table (\"",
1679 format_dotted_keys(first, last),
1680 "\") already exists."), {
1681 {tab->at(k).location(), "table already exists here"},
1682 {v.location(), "table defined twice"}
1683 }), v.location());
1684 }
1685 // to allow the following toml file.
1686 // [a.b.c]
1687 // d = 42
1688 // [a]
1689 // e = 2.71
1690 auto& t = tab->at(k).as_table();
1691 for(const auto& kv : v.as_table())
1692 {
1693 if(tab->at(k).contains(kv.first))
1694 {
1695 throw syntax_error(format_underline(concat_to_string(
1696 "toml::insert_value: value (\"",
1697 format_dotted_keys(first, last),
1698 "\") already exists."), {
1699 {t.at(kv.first).location(), "already exists here"},
1700 {v.location(), "this defined twice"}
1701 }), v.location());
1702 }
1703 t[kv.first] = kv.second;
1704 }
1705 detail::change_region(tab->at(k), key_reg);
1706 return ok(true);
1707 }
1708 else if(v.is_table() &&
1709 tab->at(k).is_array() &&
1710 tab->at(k).as_array().size() > 0 &&
1711 tab->at(k).as_array().front().is_table())
1712 {
1713 throw syntax_error(format_underline(concat_to_string(
1714 "toml::insert_value: array of tables (\"",
1715 format_dotted_keys(first, last), "\") already exists."), {
1716 {tab->at(k).location(), "array of tables defined here"},
1717 {v.location(), "table conflicts with the previous array of table"}
1718 }), v.location());
1719 }
1720 else
1721 {
1722 throw syntax_error(format_underline(concat_to_string(
1723 "toml::insert_value: value (\"",
1724 format_dotted_keys(first, last), "\") already exists."), {
1725 {tab->at(k).location(), "value already exists here"},
1726 {v.location(), "value defined twice"}
1727 }), v.location());
1728 }
1729 }
1730 tab->insert(std::make_pair(k, v));
1731 return ok(true);
1732 }
1733 else // k is not the last one, we should insert recursively
1734 {
1735 // if there is no corresponding value, insert it first.
1736 // related: you don't need to write
1737 // # [x]
1738 // # [x.y]
1739 // to write
1740 // [x.y.z]
1741 if(tab->count(k) == 0)
1742 {
1743 // a table that is defined implicitly doesn't have any comments.
1744 (*tab)[k] = value_type(table_type{}, key_reg, {/*no comment*/});
1745 }
1746
1747 // type checking...
1748 if(tab->at(k).is_table())
1749 {
1750 // According to toml-lang/toml:36d3091b3 "Clarify that inline
1751 // tables are immutable", check if it adds key-value pair to an
1752 // inline table.
1753 if(const auto* ptr = get_region(tab->at(k)))
1754 {
1755 // here, if the value is a (multi-line) table, the region
1756 // should be something like `[table-name]`.
1757 if(ptr->front() == '{')
1758 {
1759 throw syntax_error(format_underline(concat_to_string(
1760 "toml::insert_value: inserting to an inline table (",
1761 format_dotted_keys(first, std::next(iter)),
1762 ") but inline tables are immutable"), {
1763 {tab->at(k).location(), "inline tables are immutable"},
1764 {v.location(), "inserting this"}
1765 }), v.location());
1766 }
1767 }
1768 tab = std::addressof((*tab)[k].as_table());
1769 }
1770 else if(tab->at(k).is_array()) // inserting to array-of-tables?
1771 {
1772 auto& a = (*tab)[k].as_array();
1773 if(!a.back().is_table())
1774 {
1775 throw syntax_error(format_underline(concat_to_string(
1776 "toml::insert_value: target (",
1777 format_dotted_keys(first, std::next(iter)),
1778 ") is neither table nor an array of tables"), {
1779 {a.back().location(), concat_to_string(
1780 "actual type is ", a.back().type())},
1781 {v.location(), "inserting this"}
1782 }), v.location());
1783 }
1784 if(a.empty())
1785 {
1786 throw syntax_error(format_underline(concat_to_string(
1787 "toml::insert_value: table (\"",
1788 format_dotted_keys(first, last), "\") conflicts with"
1789 " existing value"), {
1790 {tab->at(k).location(), std::string("this array is not insertable")},
1791 {v.location(), std::string("appending it to the statically sized array")}
1792 }), v.location());
1793 }
1794 if(const auto ptr = detail::get_region(a.at(0)))
1795 {
1796 if(ptr->str().substr(0,2) != "[[")
1797 {
1798 throw syntax_error(format_underline(concat_to_string(
1799 "toml::insert_value: a table (\"",
1800 format_dotted_keys(first, last), "\") cannot be "
1801 "inserted to an existing inline array-of-tables"), {
1802 {tab->at(k).location(), std::string("this array of table has a static size")},
1803 {v.location(), std::string("appending it to the statically sized array")}
1804 }), v.location());
1805 }
1806 }
1807 tab = std::addressof(a.back().as_table());
1808 }
1809 else
1810 {
1811 throw syntax_error(format_underline(concat_to_string(
1812 "toml::insert_value: target (",
1813 format_dotted_keys(first, std::next(iter)),
1814 ") is neither table nor an array of tables"), {
1815 {tab->at(k).location(), concat_to_string(
1816 "actual type is ", tab->at(k).type())},
1817 {v.location(), "inserting this"}
1818 }), v.location());
1819 }
1820 }
1821 }
1822 return err(std::string("toml::detail::insert_nested_key: never reach here"));
1823 }
1824
1825 template<typename Value>
1826 result<std::pair<typename Value::table_type, region>, std::string>
1827 parse_inline_table(location& loc, const std::size_t n_rec)
1828 {
1829 using value_type = Value;
1830 using table_type = typename value_type::table_type;
1831
1832 if(n_rec > TOML11_VALUE_RECURSION_LIMIT)
1833 {
1834 throw syntax_error(std::string("toml::parse_inline_table: recursion limit ("
1835 TOML11_STRINGIZE(TOML11_VALUE_RECURSION_LIMIT) ") exceeded"),
1836 source_location(loc));
1837 }
1838
1839 const auto first = loc.iter();
1840 table_type retval;
1841 if(!(loc.iter() != loc.end() && *loc.iter() == '{'))
1842 {
1843 return err(format_underline("toml::parse_inline_table: ",
1844 {{source_location(loc), "the next token is not an inline table"}}));
1845 }
1846 loc.advance();
1847
1848 // check if the inline table is an empty table = { }
1849 maybe<lex_ws>::invoke(loc);
1850 if(loc.iter() != loc.end() && *loc.iter() == '}')
1851 {
1852 loc.advance(); // skip `}`
1853 return ok(std::make_pair(retval, region(loc, first, loc.iter())));
1854 }
1855
1856 // it starts from "{". it should be formatted as inline-table
1857 while(loc.iter() != loc.end())
1858 {
1859 const auto kv_r = parse_key_value_pair<value_type>(loc, n_rec+1);
1860 if(!kv_r)
1861 {
1862 return err(kv_r.unwrap_err());
1863 }
1864
1865 const auto& kvpair = kv_r.unwrap();
1866 const std::vector<key>& keys = kvpair.first.first;
1867 const auto& key_reg = kvpair.first.second;
1868 const value_type& val = kvpair.second;
1869
1870 const auto inserted =
1871 insert_nested_key(retval, val, keys.begin(), keys.end(), key_reg);
1872 if(!inserted)
1873 {
1874 throw internal_error("toml::parse_inline_table: "
1875 "failed to insert value into table: " + inserted.unwrap_err(),
1876 source_location(loc));
1877 }
1878
1879 using lex_table_separator = sequence<maybe<lex_ws>, character<','>>;
1880 const auto sp = lex_table_separator::invoke(loc);
1881
1882 if(!sp)
1883 {
1884 maybe<lex_ws>::invoke(loc);
1885
1886 if(loc.iter() == loc.end())
1887 {
1888 throw syntax_error(format_underline(
1889 "toml::parse_inline_table: missing table separator `}` ",
1890 {{source_location(loc), "should be `}`"}}),
1891 source_location(loc));
1892 }
1893 else if(*loc.iter() == '}')
1894 {
1895 loc.advance(); // skip `}`
1896 return ok(std::make_pair(
1897 retval, region(loc, first, loc.iter())));
1898 }
1899 else if(*loc.iter() == '#' || *loc.iter() == '\r' || *loc.iter() == '\n')
1900 {
1901 throw syntax_error(format_underline(
1902 "toml::parse_inline_table: missing curly brace `}`",
1903 {{source_location(loc), "should be `}`"}}),
1904 source_location(loc));
1905 }
1906 else
1907 {
1908 throw syntax_error(format_underline(
1909 "toml::parse_inline_table: missing table separator `,` ",
1910 {{source_location(loc), "should be `,`"}}),
1911 source_location(loc));
1912 }
1913 }
1914 else // `,` is found
1915 {
1916 maybe<lex_ws>::invoke(loc);
1917 if(loc.iter() != loc.end() && *loc.iter() == '}')
1918 {
1919 throw syntax_error(format_underline(
1920 "toml::parse_inline_table: trailing comma is not allowed in"
1921 " an inline table",
1922 {{source_location(loc), "should be `}`"}}),
1923 source_location(loc));
1924 }
1925 }
1926 }
1927 loc.reset(first);
1928 throw syntax_error(format_underline("toml::parse_inline_table: "
1929 "inline table did not closed by `}`",
1930 {{source_location(loc), "should be closed"}}),
1931 source_location(loc));
1932 }
1933
1934 inline result<value_t, std::string> guess_number_type(const location& l)
1935 {
1936 // This function tries to find some (common) mistakes by checking characters
1937 // that follows the last character of a value. But it is often difficult
1938 // because some non-newline characters can appear after a value. E.g.
1939 // spaces, tabs, commas (in an array or inline table), closing brackets
1940 // (of an array or inline table), comment-sign (#). Since this function
1941 // does not parse further, those characters are always allowed to be there.
1942 location loc = l;
1943
1944 if(lex_offset_date_time::invoke(loc)) {return ok(value_t::offset_datetime);}
1945 loc.reset(l.iter());
1946
1947 if(lex_local_date_time::invoke(loc))
1948 {
1949 // bad offset may appear after this.
1950 if(loc.iter() != loc.end() && (*loc.iter() == '+' || *loc.iter() == '-'
1951 || *loc.iter() == 'Z' || *loc.iter() == 'z'))
1952 {
1953 return err(format_underline("bad offset: should be [+-]HH:MM or Z",
1954 {{source_location(loc), "[+-]HH:MM or Z"}},
1955 {"pass: +09:00, -05:30", "fail: +9:00, -5:30"}));
1956 }
1957 return ok(value_t::local_datetime);
1958 }
1959 loc.reset(l.iter());
1960
1961 if(lex_local_date::invoke(loc))
1962 {
1963 // bad time may appear after this.
1964 // A space is allowed as a delimiter between local time. But there are
1965 // both cases in which a space becomes valid or invalid.
1966 // - invalid: 2019-06-16 7:00:00
1967 // - valid : 2019-06-16 07:00:00
1968 if(loc.iter() != loc.end())
1969 {
1970 const auto c = *loc.iter();
1971 if(c == 'T' || c == 't')
1972 {
1973 return err(format_underline("bad time: should be HH:MM:SS.subsec",
1974 {{source_location(loc), "HH:MM:SS.subsec"}},
1975 {"pass: 1979-05-27T07:32:00, 1979-05-27 07:32:00.999999",
1976 "fail: 1979-05-27T7:32:00, 1979-05-27 17:32"}));
1977 }
1978 if('0' <= c && c <= '9')
1979 {
1980 return err(format_underline("bad time: missing T",
1981 {{source_location(loc), "T or space required here"}},
1982 {"pass: 1979-05-27T07:32:00, 1979-05-27 07:32:00.999999",
1983 "fail: 1979-05-27T7:32:00, 1979-05-27 7:32"}));
1984 }
1985 if(c == ' ' && std::next(loc.iter()) != loc.end() &&
1986 ('0' <= *std::next(loc.iter()) && *std::next(loc.iter())<= '9'))
1987 {
1988 loc.advance();
1989 return err(format_underline("bad time: should be HH:MM:SS.subsec",
1990 {{source_location(loc), "HH:MM:SS.subsec"}},
1991 {"pass: 1979-05-27T07:32:00, 1979-05-27 07:32:00.999999",
1992 "fail: 1979-05-27T7:32:00, 1979-05-27 7:32"}));
1993 }
1994 }
1995 return ok(value_t::local_date);
1996 }
1997 loc.reset(l.iter());
1998
1999 if(lex_local_time::invoke(loc)) {return ok(value_t::local_time);}
2000 loc.reset(l.iter());
2001
2002 if(lex_float::invoke(loc))
2003 {
2004 if(loc.iter() != loc.end() && *loc.iter() == '_')
2005 {
2006 return err(format_underline("bad float: `_` should be surrounded by digits",
2007 {{source_location(loc), "here"}},
2008 {"pass: +1.0, -2e-2, 3.141_592_653_589, inf, nan",
2009 "fail: .0, 1., _1.0, 1.0_, 1_.0, 1.0__0"}));
2010 }
2011 return ok(value_t::floating);
2012 }
2013 loc.reset(l.iter());
2014
2015 if(lex_integer::invoke(loc))
2016 {
2017 if(loc.iter() != loc.end())
2018 {
2019 const auto c = *loc.iter();
2020 if(c == '_')
2021 {
2022 return err(format_underline("bad integer: `_` should be surrounded by digits",
2023 {{source_location(loc), "here"}},
2024 {"pass: -42, 1_000, 1_2_3_4_5, 0xC0FFEE, 0b0010, 0o755",
2025 "fail: 1__000, 0123"}));
2026 }
2027 if('0' <= c && c <= '9')
2028 {
2029 // leading zero. point '0'
2030 loc.retrace();
2031 return err(format_underline("bad integer: leading zero",
2032 {{source_location(loc), "here"}},
2033 {"pass: -42, 1_000, 1_2_3_4_5, 0xC0FFEE, 0b0010, 0o755",
2034 "fail: 1__000, 0123"}));
2035 }
2036 if(c == ':' || c == '-')
2037 {
2038 return err(format_underline("bad datetime: invalid format",
2039 {{source_location(loc), "here"}},
2040 {"pass: 1979-05-27T07:32:00-07:00, 1979-05-27 07:32:00.999999Z",
2041 "fail: 1979-05-27T7:32:00-7:00, 1979-05-27 7:32-00:30"}));
2042 }
2043 if(c == '.' || c == 'e' || c == 'E')
2044 {
2045 return err(format_underline("bad float: invalid format",
2046 {{source_location(loc), "here"}},
2047 {"pass: +1.0, -2e-2, 3.141_592_653_589, inf, nan",
2048 "fail: .0, 1., _1.0, 1.0_, 1_.0, 1.0__0"}));
2049 }
2050 }
2051 return ok(value_t::integer);
2052 }
2053 if(loc.iter() != loc.end() && *loc.iter() == '.')
2054 {
2055 return err(format_underline("bad float: invalid format",
2056 {{source_location(loc), "integer part required before this"}},
2057 {"pass: +1.0, -2e-2, 3.141_592_653_589, inf, nan",
2058 "fail: .0, 1., _1.0, 1.0_, 1_.0, 1.0__0"}));
2059 }
2060 if(loc.iter() != loc.end() && *loc.iter() == '_')
2061 {
2062 return err(format_underline("bad number: `_` should be surrounded by digits",
2063 {{source_location(loc), "`_` is not surrounded by digits"}},
2064 {"pass: -42, 1_000, 1_2_3_4_5, 0xC0FFEE, 0b0010, 0o755",
2065 "fail: 1__000, 0123"}));
2066 }
2067 return err(format_underline("bad format: unknown value appeared",
2068 {{source_location(loc), "here"}}));
2069 }
2070
2071 inline result<value_t, std::string> guess_value_type(const location& loc)
2072 {
2073 switch(*loc.iter())
2074 {
2075 case '"' : {return ok(value_t::string); }
2076 case '\'': {return ok(value_t::string); }
2077 case 't' : {return ok(value_t::boolean); }
2078 case 'f' : {return ok(value_t::boolean); }
2079 case '[' : {return ok(value_t::array); }
2080 case '{' : {return ok(value_t::table); }
2081 case 'i' : {return ok(value_t::floating);} // inf.
2082 case 'n' : {return ok(value_t::floating);} // nan.
2083 default : {return guess_number_type(loc);}
2084 }
2085 }
2086
2087 template<typename Value, typename T>
2088 result<Value, std::string>
2089 parse_value_helper(result<std::pair<T, region>, std::string> rslt)
2090 {
2091 if(rslt.is_ok())
2092 {
2093 auto comments = rslt.as_ok().second.comments();
2094 return ok(Value(std::move(rslt.as_ok()), std::move(comments)));
2095 }
2096 else
2097 {
2098 return err(std::move(rslt.as_err()));
2099 }
2100 }
2101
2102 template<typename Value>
2103 result<Value, std::string> parse_value(location& loc, const std::size_t n_rec)
2104 {
2105 const auto first = loc.iter();
2106 if(first == loc.end())
2107 {
2108 return err(format_underline("toml::parse_value: input is empty",
2109 {{source_location(loc), ""}}));
2110 }
2111
2112 const auto type = guess_value_type(loc);
2113 if(!type)
2114 {
2115 return err(type.unwrap_err());
2116 }
2117
2118 switch(type.unwrap())
2119 {
2120 case value_t::boolean : {return parse_value_helper<Value>(parse_boolean(loc) );}
2121 case value_t::integer : {return parse_value_helper<Value>(parse_integer(loc) );}
2122 case value_t::floating : {return parse_value_helper<Value>(parse_floating(loc) );}
2123 case value_t::string : {return parse_value_helper<Value>(parse_string(loc) );}
2124 case value_t::offset_datetime: {return parse_value_helper<Value>(parse_offset_datetime(loc) );}
2125 case value_t::local_datetime : {return parse_value_helper<Value>(parse_local_datetime(loc) );}
2126 case value_t::local_date : {return parse_value_helper<Value>(parse_local_date(loc) );}
2127 case value_t::local_time : {return parse_value_helper<Value>(parse_local_time(loc) );}
2128 case value_t::array : {return parse_value_helper<Value>(parse_array<Value>(loc, n_rec));}
2129 case value_t::table : {return parse_value_helper<Value>(parse_inline_table<Value>(loc, n_rec));}
2130 default:
2131 {
2132 const auto msg = format_underline("toml::parse_value: "
2133 "unknown token appeared", {{source_location(loc), "unknown"}});
2134 loc.reset(first);
2135 return err(msg);
2136 }
2137 }
2138 }
2139
2140 inline result<std::pair<std::vector<key>, region>, std::string>
2141 parse_table_key(location& loc)
2142 {
2143 if(auto token = lex_std_table::invoke(loc))
2144 {
2145 location inner_loc(loc.name(), token.unwrap().str());
2146
2147 const auto open = lex_std_table_open::invoke(inner_loc);
2148 if(!open || inner_loc.iter() == inner_loc.end())
2149 {
2150 throw internal_error(format_underline(
2151 "toml::parse_table_key: no `[`",
2152 {{source_location(inner_loc), "should be `[`"}}),
2153 source_location(inner_loc));
2154 }
2155 // to skip [ a . b . c ]
2156 // ^----------- this whitespace
2157 lex_ws::invoke(inner_loc);
2158 const auto keys = parse_key(inner_loc);
2159 if(!keys)
2160 {
2161 throw internal_error(format_underline(
2162 "toml::parse_table_key: invalid key",
2163 {{source_location(inner_loc), "not key"}}),
2164 source_location(inner_loc));
2165 }
2166 // to skip [ a . b . c ]
2167 // ^-- this whitespace
2168 lex_ws::invoke(inner_loc);
2169 const auto close = lex_std_table_close::invoke(inner_loc);
2170 if(!close)
2171 {
2172 throw internal_error(format_underline(
2173 "toml::parse_table_key: no `]`",
2174 {{source_location(inner_loc), "should be `]`"}}),
2175 source_location(inner_loc));
2176 }
2177
2178 // after [table.key], newline or EOF(empty table) required.
2179 if(loc.iter() != loc.end())
2180 {
2181 using lex_newline_after_table_key =
2182 sequence<maybe<lex_ws>, maybe<lex_comment>, lex_newline>;
2183 const auto nl = lex_newline_after_table_key::invoke(loc);
2184 if(!nl)
2185 {
2186 throw syntax_error(format_underline(
2187 "toml::parse_table_key: newline required after [table.key]",
2188 {{source_location(loc), "expected newline"}}),
2189 source_location(loc));
2190 }
2191 }
2192 return ok(std::make_pair(keys.unwrap().first, token.unwrap()));
2193 }
2194 else
2195 {
2196 return err(format_underline("toml::parse_table_key: "
2197 "not a valid table key", {{source_location(loc), "here"}}));
2198 }
2199 }
2200
2201 inline result<std::pair<std::vector<key>, region>, std::string>
2202 parse_array_table_key(location& loc)
2203 {
2204 if(auto token = lex_array_table::invoke(loc))
2205 {
2206 location inner_loc(loc.name(), token.unwrap().str());
2207
2208 const auto open = lex_array_table_open::invoke(inner_loc);
2209 if(!open || inner_loc.iter() == inner_loc.end())
2210 {
2211 throw internal_error(format_underline(
2212 "toml::parse_array_table_key: no `[[`",
2213 {{source_location(inner_loc), "should be `[[`"}}),
2214 source_location(inner_loc));
2215 }
2216 lex_ws::invoke(inner_loc);
2217 const auto keys = parse_key(inner_loc);
2218 if(!keys)
2219 {
2220 throw internal_error(format_underline(
2221 "toml::parse_array_table_key: invalid key",
2222 {{source_location(inner_loc), "not a key"}}),
2223 source_location(inner_loc));
2224 }
2225 lex_ws::invoke(inner_loc);
2226 const auto close = lex_array_table_close::invoke(inner_loc);
2227 if(!close)
2228 {
2229 throw internal_error(format_underline(
2230 "toml::parse_array_table_key: no `]]`",
2231 {{source_location(inner_loc), "should be `]]`"}}),
2232 source_location(inner_loc));
2233 }
2234
2235 // after [[table.key]], newline or EOF(empty table) required.
2236 if(loc.iter() != loc.end())
2237 {
2238 using lex_newline_after_table_key =
2239 sequence<maybe<lex_ws>, maybe<lex_comment>, lex_newline>;
2240 const auto nl = lex_newline_after_table_key::invoke(loc);
2241 if(!nl)
2242 {
2243 throw syntax_error(format_underline("toml::"
2244 "parse_array_table_key: newline required after [[table.key]]",
2245 {{source_location(loc), "expected newline"}}),
2246 source_location(loc));
2247 }
2248 }
2249 return ok(std::make_pair(keys.unwrap().first, token.unwrap()));
2250 }
2251 else
2252 {
2253 return err(format_underline("toml::parse_array_table_key: "
2254 "not a valid table key", {{source_location(loc), "here"}}));
2255 }
2256 }
2257
2258 // parse table body (key-value pairs until the iter hits the next [tablekey])
2259 template<typename Value>
2260 result<typename Value::table_type, std::string>
2261 parse_ml_table(location& loc)
2262 {
2263 using value_type = Value;
2264 using table_type = typename value_type::table_type;
2265
2266 const auto first = loc.iter();
2267 if(first == loc.end())
2268 {
2269 return ok(table_type{});
2270 }
2271
2272 // XXX at lest one newline is needed.
2273 using skip_line = repeat<
2274 sequence<maybe<lex_ws>, maybe<lex_comment>, lex_newline>, at_least<1>>;
2275 skip_line::invoke(loc);
2276 lex_ws::invoke(loc);
2277
2278 table_type tab;
2279 while(loc.iter() != loc.end())
2280 {
2281 lex_ws::invoke(loc);
2282 const auto before = loc.iter();
2283 if(const auto tmp = parse_array_table_key(loc)) // next table found
2284 {
2285 loc.reset(before);
2286 return ok(tab);
2287 }
2288 if(const auto tmp = parse_table_key(loc)) // next table found
2289 {
2290 loc.reset(before);
2291 return ok(tab);
2292 }
2293
2294 if(const auto kv = parse_key_value_pair<value_type>(loc, 0))
2295 {
2296 const auto& kvpair = kv.unwrap();
2297 const std::vector<key>& keys = kvpair.first.first;
2298 const auto& key_reg = kvpair.first.second;
2299 const value_type& val = kvpair.second;
2300 const auto inserted =
2301 insert_nested_key(tab, val, keys.begin(), keys.end(), key_reg);
2302 if(!inserted)
2303 {
2304 return err(inserted.unwrap_err());
2305 }
2306 }
2307 else
2308 {
2309 return err(kv.unwrap_err());
2310 }
2311
2312 // comment lines are skipped by the above function call.
2313 // However, since the `skip_line` requires at least 1 newline, it fails
2314 // if the file ends with ws and/or comment without newline.
2315 // `skip_line` matches `ws? + comment? + newline`, not `ws` or `comment`
2316 // itself. To skip the last ws and/or comment, call lexers.
2317 // It does not matter if these fails, so the return value is discarded.
2318 lex_ws::invoke(loc);
2319 lex_comment::invoke(loc);
2320
2321 // skip_line is (whitespace? comment? newline)_{1,}. multiple empty lines
2322 // and comments after the last key-value pairs are allowed.
2323 const auto newline = skip_line::invoke(loc);
2324 if(!newline && loc.iter() != loc.end())
2325 {
2326 const auto before2 = loc.iter();
2327 lex_ws::invoke(loc); // skip whitespace
2328 const auto msg = format_underline("toml::parse_table: "
2329 "invalid line format", {{source_location(loc), concat_to_string(
2330 "expected newline, but got '", show_char(*loc.iter()), "'.")}});
2331 loc.reset(before2);
2332 return err(msg);
2333 }
2334
2335 // the skip_lines only matches with lines that includes newline.
2336 // to skip the last line that includes comment and/or whitespace
2337 // but no newline, call them one more time.
2338 lex_ws::invoke(loc);
2339 lex_comment::invoke(loc);
2340 }
2341 return ok(tab);
2342 }
2343
2344 template<typename Value>
2345 result<Value, std::string> parse_toml_file(location& loc)
2346 {
2347 using value_type = Value;
2348 using table_type = typename value_type::table_type;
2349
2350 const auto first = loc.iter();
2351 if(first == loc.end())
2352 {
2353 // For empty files, return an empty table with an empty region (zero-length).
2354 // Without the region, error messages would miss the filename.
2355 return ok(value_type(table_type{}, region(loc, first, first), {}));
2356 }
2357
2358 // put the first line as a region of a file
2359 // Here first != loc.end(), so taking std::next is okay
2360 const region file(loc, first, std::next(loc.iter()));
2361
2362 // The first successive comments that are separated from the first value
2363 // by an empty line are for a file itself.
2364 // ```toml
2365 // # this is a comment for a file.
2366 //
2367 // key = "the first value"
2368 // ```
2369 // ```toml
2370 // # this is a comment for "the first value".
2371 // key = "the first value"
2372 // ```
2373 std::vector<std::string> comments;
2374 using lex_first_comments = sequence<
2375 repeat<sequence<maybe<lex_ws>, lex_comment, lex_newline>, at_least<1>>,
2376 sequence<maybe<lex_ws>, lex_newline>
2377 >;
2378 if(const auto token = lex_first_comments::invoke(loc))
2379 {
2380 location inner_loc(loc.name(), token.unwrap().str());
2381 while(inner_loc.iter() != inner_loc.end())
2382 {
2383 maybe<lex_ws>::invoke(inner_loc); // remove ws if exists
2384 if(lex_newline::invoke(inner_loc))
2385 {
2386 assert(inner_loc.iter() == inner_loc.end());
2387 break; // empty line found.
2388 }
2389 auto com = lex_comment::invoke(inner_loc).unwrap().str();
2390 com.erase(com.begin()); // remove # sign
2391 comments.push_back(std::move(com));
2392 lex_newline::invoke(inner_loc);
2393 }
2394 }
2395
2396 table_type data;
2397 // root object is also a table, but without [tablename]
2398 if(const auto tab = parse_ml_table<value_type>(loc))
2399 {
2400 data = std::move(tab.unwrap());
2401 }
2402 else // failed (empty table is regarded as success in parse_ml_table)
2403 {
2404 return err(tab.unwrap_err());
2405 }
2406 while(loc.iter() != loc.end())
2407 {
2408 // here, the region of [table] is regarded as the table-key because
2409 // the table body is normally too big and it is not so informative
2410 // if the first key-value pair of the table is shown in the error
2411 // message.
2412 if(const auto tabkey = parse_array_table_key(loc))
2413 {
2414 const auto tab = parse_ml_table<value_type>(loc);
2415 if(!tab){return err(tab.unwrap_err());}
2416
2417 const auto& tk = tabkey.unwrap();
2418 const auto& keys = tk.first;
2419 const auto& reg = tk.second;
2420
2421 const auto inserted = insert_nested_key(data,
2422 value_type(tab.unwrap(), reg, reg.comments()),
2423 keys.begin(), keys.end(), reg,
2424 /*is_array_of_table=*/ true);
2425 if(!inserted) {return err(inserted.unwrap_err());}
2426
2427 continue;
2428 }
2429 if(const auto tabkey = parse_table_key(loc))
2430 {
2431 const auto tab = parse_ml_table<value_type>(loc);
2432 if(!tab){return err(tab.unwrap_err());}
2433
2434 const auto& tk = tabkey.unwrap();
2435 const auto& keys = tk.first;
2436 const auto& reg = tk.second;
2437
2438 const auto inserted = insert_nested_key(data,
2439 value_type(tab.unwrap(), reg, reg.comments()),
2440 keys.begin(), keys.end(), reg);
2441 if(!inserted) {return err(inserted.unwrap_err());}
2442
2443 continue;
2444 }
2445 return err(format_underline("toml::parse_toml_file: "
2446 "unknown line appeared", {{source_location(loc), "unknown format"}}));
2447 }
2448
2449 return ok(Value(std::move(data), file, comments));
2450 }
2451
2452 template<typename Comment = TOML11_DEFAULT_COMMENT_STRATEGY,
2453 template<typename ...> class Table = std::unordered_map,
2454 template<typename ...> class Array = std::vector>
2455 basic_value<Comment, Table, Array>
2456 parse(std::vector<char>& letters, const std::string& fname)
2457 {
2458 using value_type = basic_value<Comment, Table, Array>;
2459
2460 // append LF.
2461 // Although TOML does not require LF at the EOF, to make parsing logic
2462 // simpler, we "normalize" the content by adding LF if it does not exist.
2463 // It also checks if the last char is CR, to avoid changing the meaning.
2464 // This is not the *best* way to deal with the last character, but is a
2465 // simple and quick fix.
2466 if(!letters.empty() && letters.back() != '\n' && letters.back() != '\r')
2467 {
2468 letters.push_back('\n');
2469 }
2470
2471 detail::location loc(std::move(fname), std::move(letters));
2472
2473 // skip BOM if exists.
2474 // XXX component of BOM (like 0xEF) exceeds the representable range of
2475 // signed char, so on some (actually, most) of the environment, these cannot
2476 // be compared to char. However, since we are always out of luck, we need to
2477 // check our chars are equivalent to BOM. To do this, first we need to
2478 // convert char to unsigned char to guarantee the comparability.
2479 if(loc.source()->size() >= 3)
2480 {
2481 std::array<unsigned char, 3> BOM;
2482 std::memcpy(BOM.data(), loc.source()->data(), 3);
2483 if(BOM[0] == 0xEF && BOM[1] == 0xBB && BOM[2] == 0xBF)
2484 {
2485 loc.advance(3); // BOM found. skip.
2486 }
2487 }
2488
2489 if (auto data = detail::parse_toml_file<value_type>(loc))
2490 {
2491 return std::move(data).unwrap();
2492 }
2493 else
2494 {
2495 throw syntax_error(std::move(data).unwrap_err(), source_location(loc));
2496 }
2497 }
2498
2499 } // detail
2500
2501 template<typename Comment = TOML11_DEFAULT_COMMENT_STRATEGY,
2502 template<typename ...> class Table = std::unordered_map,
2503 template<typename ...> class Array = std::vector>
2504 basic_value<Comment, Table, Array>
2505 parse(FILE * file, const std::string& fname)
2506 {
2507 const long beg = std::ftell(file);
2508 if (beg == -1l)
2509 {
2510 throw file_io_error(errno, "Failed to access", fname);
2511 }
2512
2513 const int res_seekend = std::fseek(file, 0, SEEK_END);
2514 if (res_seekend != 0)
2515 {
2516 throw file_io_error(errno, "Failed to seek", fname);
2517 }
2518
2519 const long end = std::ftell(file);
2520 if (end == -1l)
2521 {
2522 throw file_io_error(errno, "Failed to access", fname);
2523 }
2524
2525 const auto fsize = end - beg;
2526
2527 const auto res_seekbeg = std::fseek(file, beg, SEEK_SET);
2528 if (res_seekbeg != 0)
2529 {
2530 throw file_io_error(errno, "Failed to seek", fname);
2531 }
2532
2533 // read whole file as a sequence of char
2534 assert(fsize >= 0);
2535 std::vector<char> letters(static_cast<std::size_t>(fsize));
2536 std::fread(letters.data(), sizeof(char), static_cast<std::size_t>(fsize), file);
2537
2538 return detail::parse<Comment, Table, Array>(letters, fname);
2539 }
2540
2541 template<typename Comment = TOML11_DEFAULT_COMMENT_STRATEGY,
2542 template<typename ...> class Table = std::unordered_map,
2543 template<typename ...> class Array = std::vector>
2544 basic_value<Comment, Table, Array>
2545 parse(std::istream& is, std::string fname = "unknown file")
2546 {
2547 const auto beg = is.tellg();
2548 is.seekg(0, std::ios::end);
2549 const auto end = is.tellg();
2550 const auto fsize = end - beg;
2551 is.seekg(beg);
2552
2553 // read whole file as a sequence of char
2554 assert(fsize >= 0);
2555 std::vector<char> letters(static_cast<std::size_t>(fsize));
2556 is.read(letters.data(), fsize);
2557
2558 return detail::parse<Comment, Table, Array>(letters, fname);
2559 }
2560
2561 template<typename Comment = TOML11_DEFAULT_COMMENT_STRATEGY,
2562 template<typename ...> class Table = std::unordered_map,
2563 template<typename ...> class Array = std::vector>
2564 basic_value<Comment, Table, Array> parse(std::string fname)
2565 {
2566 std::ifstream ifs(fname, std::ios_base::binary);
2567 if(!ifs.good())
2568 {
2569 throw std::ios_base::failure(
2570 "toml::parse: Error opening file \"" + fname + "\"");
2571 }
2572 ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
2573 return parse<Comment, Table, Array>(ifs, std::move(fname));
2574 }
2575
2576 #ifdef TOML11_HAS_STD_FILESYSTEM
2577 // This function just forwards `parse("filename.toml")` to std::string version
2578 // to avoid the ambiguity in overload resolution.
2579 //
2580 // Both std::string and std::filesystem::path are convertible from const char*.
2581 // Without this, both parse(std::string) and parse(std::filesystem::path)
2582 // matches to parse("filename.toml"). This breaks the existing code.
2583 //
2584 // This function exactly matches to the invocation with c-string.
2585 // So this function is preferred than others and the ambiguity disappears.
2586 template<typename Comment = TOML11_DEFAULT_COMMENT_STRATEGY,
2587 template<typename ...> class Table = std::unordered_map,
2588 template<typename ...> class Array = std::vector>
2589 basic_value<Comment, Table, Array> parse(const char* fname)
2590 {
2591 return parse<Comment, Table, Array>(std::string(fname));
2592 }
2593
2594 template<typename Comment = TOML11_DEFAULT_COMMENT_STRATEGY,
2595 template<typename ...> class Table = std::unordered_map,
2596 template<typename ...> class Array = std::vector>
2597 basic_value<Comment, Table, Array> parse(const std::filesystem::path& fpath)
2598 {
2599 std::ifstream ifs(fpath, std::ios_base::binary);
2600 if(!ifs.good())
2601 {
2602 throw std::ios_base::failure(
2603 "toml::parse: Error opening file \"" + fpath.string() + "\"");
2604 }
2605 ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
2606 return parse<Comment, Table, Array>(ifs, fpath.string());
2607 }
2608 #endif // TOML11_HAS_STD_FILESYSTEM
2609
2610 } // toml
2611 #endif// TOML11_PARSER_HPP