8 #include <initializer_list>
13 #include <nlohmann/detail/input/input_adapters.hpp>
14 #include <nlohmann/detail/input/position_t.hpp>
15 #include <nlohmann/detail/macro_scope.hpp>
25 template<
typename BasicJsonType>
52 JSON_HEDLEY_RETURNS_NON_NULL
59 return "<uninitialized>";
61 return "true literal";
63 return "false literal";
65 return "null literal";
67 return "string literal";
71 return "number literal";
85 return "<parse error>";
87 return "end of input";
89 return "'[', '{', or a literal";
92 return "unknown token";
102 template<
typename BasicJsonType,
typename InputAdapterType>
// Number and string types are taken from the BasicJsonType template parameter,
// so the lexer produces values in exactly the types the JSON class stores.
105 using number_integer_t =
typename BasicJsonType::number_integer_t;
106 using number_unsigned_t =
typename BasicJsonType::number_unsigned_t;
107 using number_float_t =
typename BasicJsonType::number_float_t;
108 using string_t =
typename BasicJsonType::string_t;
// Character type defined by the input adapter.
109 using char_type =
typename InputAdapterType::char_type;
// Matching char_traits int_type; the lexer keeps the current character in this
// type and compares it against std::char_traits<char_type>::eof().
110 using char_int_type =
typename std::char_traits<char_type>::int_type;
115 explicit lexer(InputAdapterType&& adapter,
bool ignore_comments_ =
false) noexcept
116 : ia(std::move(adapter))
117 , ignore_comments(ignore_comments_)
118 , decimal_point_char(
static_cast<char_int_type
>(get_decimal_point()))
// Returns the decimal-point character reported by localeconv() for the
// current C locale, or '.' when the locale provides no decimal_point string.
// The constructor uses the result to initialize decimal_point_char.
135 static char get_decimal_point() noexcept
137 const auto* loc = localeconv();
// the lconv pointer is dereferenced below, so it must not be null
138 JSON_ASSERT(loc !=
nullptr);
// only the first character of the decimal_point string is returned
139 return (loc->decimal_point ==
nullptr) ?
'.' : *(loc->decimal_point);
164 JSON_ASSERT(current ==
'u');
167 const auto factors = { 12u, 8u, 4u, 0u };
168 for (
const auto factor : factors)
172 if (current >=
'0' && current <=
'9')
174 codepoint +=
static_cast<int>((
static_cast<unsigned int>(current) - 0x30u) << factor);
176 else if (current >=
'A' && current <=
'F')
178 codepoint +=
static_cast<int>((
static_cast<unsigned int>(current) - 0x37u) << factor);
180 else if (current >=
'a' && current <=
'f')
182 codepoint +=
static_cast<int>((
static_cast<unsigned int>(current) - 0x57u) << factor);
190 JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF);
209 bool next_byte_in_range(std::initializer_list<char_int_type> ranges)
211 JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6);
214 for (
auto range = ranges.begin(); range != ranges.end(); ++range)
217 if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range)))
223 error_message =
"invalid string: ill-formed UTF-8 byte";
252 JSON_ASSERT(current ==
'\"');
260 case std::char_traits<char_type>::eof():
262 error_message =
"invalid string: missing closing quote";
263 return token_type::parse_error;
269 return token_type::value_string;
313 const int codepoint1 = get_codepoint();
314 int codepoint = codepoint1;
316 if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
318 error_message =
"invalid string: '\\u' must be followed by 4 hex digits";
319 return token_type::parse_error;
323 if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
326 if (JSON_HEDLEY_LIKELY(get() ==
'\\' && get() ==
'u'))
328 const int codepoint2 = get_codepoint();
330 if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
332 error_message =
"invalid string: '\\u' must be followed by 4 hex digits";
333 return token_type::parse_error;
337 if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
340 codepoint =
static_cast<int>(
342 (
static_cast<unsigned int>(codepoint1) << 10u)
344 +
static_cast<unsigned int>(codepoint2)
352 error_message =
"invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
353 return token_type::parse_error;
358 error_message =
"invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
359 return token_type::parse_error;
364 if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))
366 error_message =
"invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
367 return token_type::parse_error;
372 JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);
375 if (codepoint < 0x80)
378 add(
static_cast<char_int_type
>(codepoint));
380 else if (codepoint <= 0x7FF)
383 add(
static_cast<char_int_type
>(0xC0u | (
static_cast<unsigned int>(codepoint) >> 6u)));
384 add(
static_cast<char_int_type
>(0x80u | (
static_cast<unsigned int>(codepoint) & 0x3Fu)));
386 else if (codepoint <= 0xFFFF)
389 add(
static_cast<char_int_type
>(0xE0u | (
static_cast<unsigned int>(codepoint) >> 12u)));
390 add(
static_cast<char_int_type
>(0x80u | ((
static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
391 add(
static_cast<char_int_type
>(0x80u | (
static_cast<unsigned int>(codepoint) & 0x3Fu)));
396 add(
static_cast<char_int_type
>(0xF0u | (
static_cast<unsigned int>(codepoint) >> 18u)));
397 add(
static_cast<char_int_type
>(0x80u | ((
static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
398 add(
static_cast<char_int_type
>(0x80u | ((
static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
399 add(
static_cast<char_int_type
>(0x80u | (
static_cast<unsigned int>(codepoint) & 0x3Fu)));
407 error_message =
"invalid string: forbidden character after backslash";
408 return token_type::parse_error;
417 error_message =
"invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
418 return token_type::parse_error;
423 error_message =
"invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
424 return token_type::parse_error;
429 error_message =
"invalid string: control character U+0002 (STX) must be escaped to \\u0002";
430 return token_type::parse_error;
435 error_message =
"invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
436 return token_type::parse_error;
441 error_message =
"invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
442 return token_type::parse_error;
447 error_message =
"invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
448 return token_type::parse_error;
453 error_message =
"invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
454 return token_type::parse_error;
459 error_message =
"invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
460 return token_type::parse_error;
465 error_message =
"invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
466 return token_type::parse_error;
471 error_message =
"invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
472 return token_type::parse_error;
477 error_message =
"invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
478 return token_type::parse_error;
483 error_message =
"invalid string: control character U+000B (VT) must be escaped to \\u000B";
484 return token_type::parse_error;
489 error_message =
"invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
490 return token_type::parse_error;
495 error_message =
"invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
496 return token_type::parse_error;
501 error_message =
"invalid string: control character U+000E (SO) must be escaped to \\u000E";
502 return token_type::parse_error;
507 error_message =
"invalid string: control character U+000F (SI) must be escaped to \\u000F";
508 return token_type::parse_error;
513 error_message =
"invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
514 return token_type::parse_error;
519 error_message =
"invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
520 return token_type::parse_error;
525 error_message =
"invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
526 return token_type::parse_error;
531 error_message =
"invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
532 return token_type::parse_error;
537 error_message =
"invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
538 return token_type::parse_error;
543 error_message =
"invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
544 return token_type::parse_error;
549 error_message =
"invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
550 return token_type::parse_error;
555 error_message =
"invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
556 return token_type::parse_error;
561 error_message =
"invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
562 return token_type::parse_error;
567 error_message =
"invalid string: control character U+0019 (EM) must be escaped to \\u0019";
568 return token_type::parse_error;
573 error_message =
"invalid string: control character U+001A (SUB) must be escaped to \\u001A";
574 return token_type::parse_error;
579 error_message =
"invalid string: control character U+001B (ESC) must be escaped to \\u001B";
580 return token_type::parse_error;
585 error_message =
"invalid string: control character U+001C (FS) must be escaped to \\u001C";
586 return token_type::parse_error;
591 error_message =
"invalid string: control character U+001D (GS) must be escaped to \\u001D";
592 return token_type::parse_error;
597 error_message =
"invalid string: control character U+001E (RS) must be escaped to \\u001E";
598 return token_type::parse_error;
603 error_message =
"invalid string: control character U+001F (US) must be escaped to \\u001F";
604 return token_type::parse_error;
739 if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
741 return token_type::parse_error;
749 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
751 return token_type::parse_error;
773 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
775 return token_type::parse_error;
783 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
785 return token_type::parse_error;
793 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
795 return token_type::parse_error;
805 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
807 return token_type::parse_error;
815 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
817 return token_type::parse_error;
825 error_message =
"invalid string: ill-formed UTF-8 byte";
826 return token_type::parse_error;
849 case std::char_traits<char_type>::eof():
866 case std::char_traits<char_type>::eof():
869 error_message =
"invalid comment; missing closing '*/'";
897 error_message =
"invalid comment; expecting '/' or '*' after '/'";
// Overload for float targets: converts the text at `str` with std::strtof and
// stores the result in `f`. `endptr` is passed through to std::strtof unchanged.
// Sharing the name strtof across the float/double/long double overloads lets
// the call `strtof(value_float, ...)` pick the parser from the type of its
// first argument.
// NOTE(review): JSON_HEDLEY_NON_NULL(2) presumably marks parameter 2 (`str`)
// as non-null - confirm against the macro definition.
903 JSON_HEDLEY_NON_NULL(2)
904 static void strtof(
float& f,
const char* str,
char** endptr) noexcept
906 f = std::strtof(str, endptr);
// Overload for double targets: converts the text at `str` with std::strtod and
// stores the result in `f`. `endptr` is passed through to std::strtod unchanged.
// NOTE(review): JSON_HEDLEY_NON_NULL(2) presumably marks parameter 2 (`str`)
// as non-null - confirm against the macro definition.
909 JSON_HEDLEY_NON_NULL(2)
910 static void strtof(
double& f,
const char* str,
char** endptr) noexcept
912 f = std::strtod(str, endptr);
// Overload for long double targets: converts the text at `str` with
// std::strtold and stores the result in `f`. `endptr` is passed through to
// std::strtold unchanged.
// NOTE(review): JSON_HEDLEY_NON_NULL(2) presumably marks parameter 2 (`str`)
// as non-null - confirm against the macro definition.
915 JSON_HEDLEY_NON_NULL(2)
916 static void strtof(
long double& f,
const char* str,
char** endptr) noexcept
918 f = std::strtold(str, endptr);
968 token_type number_type = token_type::value_unsigned;
976 goto scan_number_minus;
982 goto scan_number_zero;
996 goto scan_number_any1;
1006 number_type = token_type::value_integer;
1012 goto scan_number_zero;
1026 goto scan_number_any1;
1031 error_message =
"invalid number; expected digit after '-'";
1032 return token_type::parse_error;
1042 add(decimal_point_char);
1043 goto scan_number_decimal1;
1050 goto scan_number_exponent;
1054 goto scan_number_done;
1073 goto scan_number_any1;
1078 add(decimal_point_char);
1079 goto scan_number_decimal1;
1086 goto scan_number_exponent;
1090 goto scan_number_done;
1093 scan_number_decimal1:
1095 number_type = token_type::value_float;
1110 goto scan_number_decimal2;
1115 error_message =
"invalid number; expected digit after '.'";
1116 return token_type::parse_error;
1120 scan_number_decimal2:
1136 goto scan_number_decimal2;
1143 goto scan_number_exponent;
1147 goto scan_number_done;
1150 scan_number_exponent:
1152 number_type = token_type::value_float;
1159 goto scan_number_sign;
1174 goto scan_number_any2;
1180 "invalid number; expected '+', '-', or digit after exponent";
1181 return token_type::parse_error;
1201 goto scan_number_any2;
1206 error_message =
"invalid number; expected digit after exponent sign";
1207 return token_type::parse_error;
1227 goto scan_number_any2;
1231 goto scan_number_done;
1239 char* endptr =
nullptr;
1243 if (number_type == token_type::value_unsigned)
1245 const auto x = std::strtoull(token_buffer.data(), &endptr, 10);
1248 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1252 value_unsigned =
static_cast<number_unsigned_t
>(x);
1253 if (value_unsigned == x)
1255 return token_type::value_unsigned;
1259 else if (number_type == token_type::value_integer)
1261 const auto x = std::strtoll(token_buffer.data(), &endptr, 10);
1264 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1268 value_integer =
static_cast<number_integer_t
>(x);
1269 if (value_integer == x)
1271 return token_type::value_integer;
1278 strtof(value_float, token_buffer.data(), &endptr);
1281 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1283 return token_type::value_float;
1291 JSON_HEDLEY_NON_NULL(2)
1292 token_type scan_literal(
const char_type* literal_text,
const std::size_t length,
1295 JSON_ASSERT(std::char_traits<char_type>::to_char_type(current) == literal_text[0]);
1296 for (std::size_t i = 1; i < length; ++i)
1298 if (JSON_HEDLEY_UNLIKELY(std::char_traits<char_type>::to_char_type(get()) != literal_text[i]))
1300 error_message =
"invalid literal";
1301 return token_type::parse_error;
// Clears both per-token buffers, then appends the character currently held in
// `current` to token_string.
1312 void reset() noexcept
1314 token_buffer.clear();
1315 token_string.clear();
// `current` is an int_type; to_char_type converts it back to char_type for storage
1316 token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1341 current = ia.get_character();
1344 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1346 token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1349 if (current ==
'\n')
1385 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1387 JSON_ASSERT(!token_string.empty());
1388 token_string.pop_back();
// Appends one character to token_buffer. `c` arrives as char_int_type and is
// cast to the buffer's element type (string_t::value_type) before being stored.
1393 void add(char_int_type c)
1395 token_buffer.push_back(
static_cast<typename string_t::value_type
>(c));
1406 return value_integer;
1412 return value_unsigned;
1424 return token_buffer;
1444 for (
const auto c : token_string)
1446 if (
static_cast<unsigned char>(c) <=
'\x1F')
1449 std::array<char, 9> cs{{}};
1450 (std::snprintf)(cs.data(), cs.size(),
"<U+%.4X>",
static_cast<unsigned char>(c));
1451 result += cs.data();
1456 result.push_back(
static_cast<std::string::value_type
>(c));
1464 JSON_HEDLEY_RETURNS_NON_NULL
1467 return error_message;
1483 return get() == 0xBB && get() == 0xBF;
1492 void skip_whitespace()
1498 while (current ==
' ' || current ==
'\t' || current ==
'\n' || current ==
'\r');
1506 error_message =
"invalid BOM; must be 0xEF 0xBB 0xBF if given";
1507 return token_type::parse_error;
1514 while (ignore_comments && current ==
'/')
1516 if (!scan_comment())
1518 return token_type::parse_error;
1529 return token_type::begin_array;
1531 return token_type::end_array;
1533 return token_type::begin_object;
1535 return token_type::end_object;
1537 return token_type::name_separator;
1539 return token_type::value_separator;
1544 std::array<char_type, 4> true_literal = {{char_type(
't'), char_type(
'r'), char_type(
'u'), char_type(
'e')}};
1545 return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true);
1549 std::array<char_type, 5> false_literal = {{char_type(
'f'), char_type(
'a'), char_type(
'l'), char_type(
's'), char_type(
'e')}};
1550 return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false);
1554 std::array<char_type, 4> null_literal = {{char_type(
'n'), char_type(
'u'), char_type(
'l'), char_type(
'l')}};
1555 return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null);
1560 return scan_string();
1574 return scan_number();
1579 case std::char_traits<char_type>::eof():
1580 return token_type::end_of_input;
1584 error_message =
"invalid literal";
1585 return token_type::parse_error;
// Input source; move-constructed from the adapter passed to the constructor and
// read one character at a time through ia.get_character().
1591 InputAdapterType ia;
// Set from the constructor argument. While true, a '/' in `current` is passed to
// scan_comment(); a failing scan_comment() yields token_type::parse_error.
1594 const bool ignore_comments =
false;
// Most recently read character, kept as an int_type and compared against
// std::char_traits<char_type>::eof(); starts out as eof().
1597 char_int_type current = std::char_traits<char_type>::eof();
// NOTE(review): not referenced in the visible fragments - presumably tells the
// next read to deliver `current` again; confirm against get()/unget().
1600 bool next_unget =
false;
// Read-position bookkeeping (position_t holds chars_read_total,
// chars_read_current_line and lines_read).
1603 position_t position {};
// Raw characters of the token being read: reseeded by reset(), extended with
// every non-eof character read, and iterated to build the token string
// (control characters are rendered as <U+XXXX>).
1606 std::vector<char_type> token_string {};
// Buffer filled through add() and cleared by reset(); the number-parsing calls
// (strtoull/strtoll/strtof) read its data().
1609 string_t token_buffer {};
// String literal describing the most recent lexing error; empty until an error
// is recorded.
1612 const char* error_message =
"";
// Results of number parsing, one slot per numeric token kind.
1615 number_integer_t value_integer = 0;
1616 number_unsigned_t value_unsigned = 0;
1617 number_float_t value_float = 0;
// Initialized in the constructor from get_decimal_point(), replacing the '.'
// default written here. Number scanning appends this character to token_buffer
// before the fractional digits - presumably so the locale-dependent strtof
// family accepts the buffer; confirm.
1620 const char_int_type decimal_point_char =
'.';
token_type
token types for the parser
Definition: lexer.hpp:31
@ value_float
a floating-point number – use get_number_float() for actual value
@ begin_array
the character for array begin [
@ value_string
a string – use get_string() for actual value
@ end_array
the character for array end ]
@ uninitialized
indicating the scanner is uninitialized
@ parse_error
indicating a parse error
@ value_integer
a signed integer – use get_number_integer() for actual value
@ value_separator
the value separator ,
@ end_object
the character for object end }
@ literal_true
the true literal
@ begin_object
the character for object begin {
@ value_unsigned
an unsigned integer – use get_number_unsigned() for actual value
@ literal_null
the null literal
@ end_of_input
indicating the end of the input buffer
@ name_separator
the name separator :
@ literal_or_value
a literal or the beginning of a value (only for diagnostics)
@ literal_false
the false literal
JSON_HEDLEY_RETURNS_NON_NULL static JSON_HEDLEY_CONST const char * token_type_name(const token_type t) noexcept
return the name of a value of type token_type (only used for errors)
Definition: lexer.hpp:54
lexical analysis
Definition: lexer.hpp:104
string_t & get_string()
return current string value (implicitly resets the token; useful only once)
Definition: lexer.hpp:1422
bool skip_bom()
skip the UTF-8 byte order mark
Definition: lexer.hpp:1478
constexpr position_t get_position() const noexcept
return position of last read token
Definition: lexer.hpp:1432
constexpr number_integer_t get_number_integer() const noexcept
return integer value
Definition: lexer.hpp:1404
constexpr JSON_HEDLEY_RETURNS_NON_NULL const char * get_error_message() const noexcept
return syntax error message
Definition: lexer.hpp:1465
constexpr number_unsigned_t get_number_unsigned() const noexcept
return unsigned integer value
Definition: lexer.hpp:1410
constexpr number_float_t get_number_float() const noexcept
return floating-point value
Definition: lexer.hpp:1416
std::string get_token_string() const
return the last read token (for errors only).
Definition: lexer.hpp:1440
namespace for Niels Lohmann
Definition: adl_serializer.hpp:12
struct to capture the start position of the current token
Definition: position_t.hpp:11
std::size_t lines_read
the number of lines read
Definition: position_t.hpp:17
std::size_t chars_read_current_line
the number of characters read in the current line
Definition: position_t.hpp:15
std::size_t chars_read_total
the total number of characters read
Definition: position_t.hpp:13