Horizon
lexer.hpp
1 #pragma once
2 
3 #include <array> // array
4 #include <clocale> // localeconv
5 #include <cstddef> // size_t
6 #include <cstdio> // snprintf
7 #include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
8 #include <initializer_list> // initializer_list
9 #include <string> // char_traits, string
10 #include <utility> // move
11 #include <vector> // vector
12 
13 #include <nlohmann/detail/input/input_adapters.hpp>
14 #include <nlohmann/detail/input/position_t.hpp>
15 #include <nlohmann/detail/macro_scope.hpp>
16 
17 namespace nlohmann
18 {
19 namespace detail
20 {
22 // lexer //
24 
25 template<typename BasicJsonType>
27 {
28  public:
30  enum class token_type
31  {
33  literal_true,
35  literal_null,
36  value_string,
39  value_float,
40  begin_array,
41  begin_object,
42  end_array,
43  end_object,
46  parse_error,
47  end_of_input,
49  };
50 
52  JSON_HEDLEY_RETURNS_NON_NULL
53  JSON_HEDLEY_CONST
54  static const char* token_type_name(const token_type t) noexcept
55  {
56  switch (t)
57  {
59  return "<uninitialized>";
61  return "true literal";
63  return "false literal";
65  return "null literal";
67  return "string literal";
71  return "number literal";
73  return "'['";
75  return "'{'";
77  return "']'";
79  return "'}'";
81  return "':'";
83  return "','";
85  return "<parse error>";
87  return "end of input";
89  return "'[', '{', or a literal";
90  // LCOV_EXCL_START
91  default: // catch non-enum values
92  return "unknown token";
93  // LCOV_EXCL_STOP
94  }
95  }
96 };
102 template<typename BasicJsonType, typename InputAdapterType>
103 class lexer : public lexer_base<BasicJsonType>
104 {
105  using number_integer_t = typename BasicJsonType::number_integer_t;
106  using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
107  using number_float_t = typename BasicJsonType::number_float_t;
108  using string_t = typename BasicJsonType::string_t;
109  using char_type = typename InputAdapterType::char_type;
110  using char_int_type = typename std::char_traits<char_type>::int_type;
111 
112  public:
114 
115  explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) noexcept
116  : ia(std::move(adapter))
117  , ignore_comments(ignore_comments_)
118  , decimal_point_char(static_cast<char_int_type>(get_decimal_point()))
119  {}
120 
121  // delete because of pointer members
122  lexer(const lexer&) = delete;
123  lexer(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
124  lexer& operator=(lexer&) = delete;
125  lexer& operator=(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
126  ~lexer() = default;
127 
128  private:
130  // locales
132 
134  JSON_HEDLEY_PURE
135  static char get_decimal_point() noexcept
136  {
137  const auto* loc = localeconv();
138  JSON_ASSERT(loc != nullptr);
139  return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point);
140  }
141 
143  // scan functions
145 
161  int get_codepoint()
162  {
163  // this function only makes sense after reading `\u`
164  JSON_ASSERT(current == 'u');
165  int codepoint = 0;
166 
167  const auto factors = { 12u, 8u, 4u, 0u };
168  for (const auto factor : factors)
169  {
170  get();
171 
172  if (current >= '0' && current <= '9')
173  {
174  codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor);
175  }
176  else if (current >= 'A' && current <= 'F')
177  {
178  codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor);
179  }
180  else if (current >= 'a' && current <= 'f')
181  {
182  codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor);
183  }
184  else
185  {
186  return -1;
187  }
188  }
189 
190  JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF);
191  return codepoint;
192  }
193 
209  bool next_byte_in_range(std::initializer_list<char_int_type> ranges)
210  {
211  JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6);
212  add(current);
213 
214  for (auto range = ranges.begin(); range != ranges.end(); ++range)
215  {
216  get();
217  if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range)))
218  {
219  add(current);
220  }
221  else
222  {
223  error_message = "invalid string: ill-formed UTF-8 byte";
224  return false;
225  }
226  }
227 
228  return true;
229  }
230 
246  token_type scan_string()
247  {
248  // reset token_buffer (ignore opening quote)
249  reset();
250 
251  // we entered the function by reading an open quote
252  JSON_ASSERT(current == '\"');
253 
254  while (true)
255  {
256  // get next character
257  switch (get())
258  {
259  // end of file while parsing string
260  case std::char_traits<char_type>::eof():
261  {
262  error_message = "invalid string: missing closing quote";
263  return token_type::parse_error;
264  }
265 
266  // closing quote
267  case '\"':
268  {
269  return token_type::value_string;
270  }
271 
272  // escapes
273  case '\\':
274  {
275  switch (get())
276  {
277  // quotation mark
278  case '\"':
279  add('\"');
280  break;
281  // reverse solidus
282  case '\\':
283  add('\\');
284  break;
285  // solidus
286  case '/':
287  add('/');
288  break;
289  // backspace
290  case 'b':
291  add('\b');
292  break;
293  // form feed
294  case 'f':
295  add('\f');
296  break;
297  // line feed
298  case 'n':
299  add('\n');
300  break;
301  // carriage return
302  case 'r':
303  add('\r');
304  break;
305  // tab
306  case 't':
307  add('\t');
308  break;
309 
310  // unicode escapes
311  case 'u':
312  {
313  const int codepoint1 = get_codepoint();
314  int codepoint = codepoint1; // start with codepoint1
315 
316  if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
317  {
318  error_message = "invalid string: '\\u' must be followed by 4 hex digits";
319  return token_type::parse_error;
320  }
321 
322  // check if code point is a high surrogate
323  if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
324  {
325  // expect next \uxxxx entry
326  if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u'))
327  {
328  const int codepoint2 = get_codepoint();
329 
330  if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
331  {
332  error_message = "invalid string: '\\u' must be followed by 4 hex digits";
333  return token_type::parse_error;
334  }
335 
336  // check if codepoint2 is a low surrogate
337  if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
338  {
339  // overwrite codepoint
340  codepoint = static_cast<int>(
341  // high surrogate occupies the most significant 22 bits
342  (static_cast<unsigned int>(codepoint1) << 10u)
343  // low surrogate occupies the least significant 15 bits
344  + static_cast<unsigned int>(codepoint2)
345  // there is still the 0xD800, 0xDC00 and 0x10000 noise
346  // in the result so we have to subtract with:
347  // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
348  - 0x35FDC00u);
349  }
350  else
351  {
352  error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
353  return token_type::parse_error;
354  }
355  }
356  else
357  {
358  error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
359  return token_type::parse_error;
360  }
361  }
362  else
363  {
364  if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))
365  {
366  error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
367  return token_type::parse_error;
368  }
369  }
370 
371  // result of the above calculation yields a proper codepoint
372  JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);
373 
374  // translate codepoint into bytes
375  if (codepoint < 0x80)
376  {
377  // 1-byte characters: 0xxxxxxx (ASCII)
378  add(static_cast<char_int_type>(codepoint));
379  }
380  else if (codepoint <= 0x7FF)
381  {
382  // 2-byte characters: 110xxxxx 10xxxxxx
383  add(static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
384  add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
385  }
386  else if (codepoint <= 0xFFFF)
387  {
388  // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
389  add(static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
390  add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
391  add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
392  }
393  else
394  {
395  // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
396  add(static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
397  add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
398  add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
399  add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
400  }
401 
402  break;
403  }
404 
405  // other characters after escape
406  default:
407  error_message = "invalid string: forbidden character after backslash";
408  return token_type::parse_error;
409  }
410 
411  break;
412  }
413 
414  // invalid control characters
415  case 0x00:
416  {
417  error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
418  return token_type::parse_error;
419  }
420 
421  case 0x01:
422  {
423  error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
424  return token_type::parse_error;
425  }
426 
427  case 0x02:
428  {
429  error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
430  return token_type::parse_error;
431  }
432 
433  case 0x03:
434  {
435  error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
436  return token_type::parse_error;
437  }
438 
439  case 0x04:
440  {
441  error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
442  return token_type::parse_error;
443  }
444 
445  case 0x05:
446  {
447  error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
448  return token_type::parse_error;
449  }
450 
451  case 0x06:
452  {
453  error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
454  return token_type::parse_error;
455  }
456 
457  case 0x07:
458  {
459  error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
460  return token_type::parse_error;
461  }
462 
463  case 0x08:
464  {
465  error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
466  return token_type::parse_error;
467  }
468 
469  case 0x09:
470  {
471  error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
472  return token_type::parse_error;
473  }
474 
475  case 0x0A:
476  {
477  error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
478  return token_type::parse_error;
479  }
480 
481  case 0x0B:
482  {
483  error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
484  return token_type::parse_error;
485  }
486 
487  case 0x0C:
488  {
489  error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
490  return token_type::parse_error;
491  }
492 
493  case 0x0D:
494  {
495  error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
496  return token_type::parse_error;
497  }
498 
499  case 0x0E:
500  {
501  error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
502  return token_type::parse_error;
503  }
504 
505  case 0x0F:
506  {
507  error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
508  return token_type::parse_error;
509  }
510 
511  case 0x10:
512  {
513  error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
514  return token_type::parse_error;
515  }
516 
517  case 0x11:
518  {
519  error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
520  return token_type::parse_error;
521  }
522 
523  case 0x12:
524  {
525  error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
526  return token_type::parse_error;
527  }
528 
529  case 0x13:
530  {
531  error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
532  return token_type::parse_error;
533  }
534 
535  case 0x14:
536  {
537  error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
538  return token_type::parse_error;
539  }
540 
541  case 0x15:
542  {
543  error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
544  return token_type::parse_error;
545  }
546 
547  case 0x16:
548  {
549  error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
550  return token_type::parse_error;
551  }
552 
553  case 0x17:
554  {
555  error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
556  return token_type::parse_error;
557  }
558 
559  case 0x18:
560  {
561  error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
562  return token_type::parse_error;
563  }
564 
565  case 0x19:
566  {
567  error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
568  return token_type::parse_error;
569  }
570 
571  case 0x1A:
572  {
573  error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
574  return token_type::parse_error;
575  }
576 
577  case 0x1B:
578  {
579  error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
580  return token_type::parse_error;
581  }
582 
583  case 0x1C:
584  {
585  error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
586  return token_type::parse_error;
587  }
588 
589  case 0x1D:
590  {
591  error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
592  return token_type::parse_error;
593  }
594 
595  case 0x1E:
596  {
597  error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
598  return token_type::parse_error;
599  }
600 
601  case 0x1F:
602  {
603  error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
604  return token_type::parse_error;
605  }
606 
607  // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace))
608  case 0x20:
609  case 0x21:
610  case 0x23:
611  case 0x24:
612  case 0x25:
613  case 0x26:
614  case 0x27:
615  case 0x28:
616  case 0x29:
617  case 0x2A:
618  case 0x2B:
619  case 0x2C:
620  case 0x2D:
621  case 0x2E:
622  case 0x2F:
623  case 0x30:
624  case 0x31:
625  case 0x32:
626  case 0x33:
627  case 0x34:
628  case 0x35:
629  case 0x36:
630  case 0x37:
631  case 0x38:
632  case 0x39:
633  case 0x3A:
634  case 0x3B:
635  case 0x3C:
636  case 0x3D:
637  case 0x3E:
638  case 0x3F:
639  case 0x40:
640  case 0x41:
641  case 0x42:
642  case 0x43:
643  case 0x44:
644  case 0x45:
645  case 0x46:
646  case 0x47:
647  case 0x48:
648  case 0x49:
649  case 0x4A:
650  case 0x4B:
651  case 0x4C:
652  case 0x4D:
653  case 0x4E:
654  case 0x4F:
655  case 0x50:
656  case 0x51:
657  case 0x52:
658  case 0x53:
659  case 0x54:
660  case 0x55:
661  case 0x56:
662  case 0x57:
663  case 0x58:
664  case 0x59:
665  case 0x5A:
666  case 0x5B:
667  case 0x5D:
668  case 0x5E:
669  case 0x5F:
670  case 0x60:
671  case 0x61:
672  case 0x62:
673  case 0x63:
674  case 0x64:
675  case 0x65:
676  case 0x66:
677  case 0x67:
678  case 0x68:
679  case 0x69:
680  case 0x6A:
681  case 0x6B:
682  case 0x6C:
683  case 0x6D:
684  case 0x6E:
685  case 0x6F:
686  case 0x70:
687  case 0x71:
688  case 0x72:
689  case 0x73:
690  case 0x74:
691  case 0x75:
692  case 0x76:
693  case 0x77:
694  case 0x78:
695  case 0x79:
696  case 0x7A:
697  case 0x7B:
698  case 0x7C:
699  case 0x7D:
700  case 0x7E:
701  case 0x7F:
702  {
703  add(current);
704  break;
705  }
706 
707  // U+0080..U+07FF: bytes C2..DF 80..BF
708  case 0xC2:
709  case 0xC3:
710  case 0xC4:
711  case 0xC5:
712  case 0xC6:
713  case 0xC7:
714  case 0xC8:
715  case 0xC9:
716  case 0xCA:
717  case 0xCB:
718  case 0xCC:
719  case 0xCD:
720  case 0xCE:
721  case 0xCF:
722  case 0xD0:
723  case 0xD1:
724  case 0xD2:
725  case 0xD3:
726  case 0xD4:
727  case 0xD5:
728  case 0xD6:
729  case 0xD7:
730  case 0xD8:
731  case 0xD9:
732  case 0xDA:
733  case 0xDB:
734  case 0xDC:
735  case 0xDD:
736  case 0xDE:
737  case 0xDF:
738  {
739  if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
740  {
741  return token_type::parse_error;
742  }
743  break;
744  }
745 
746  // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
747  case 0xE0:
748  {
749  if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
750  {
751  return token_type::parse_error;
752  }
753  break;
754  }
755 
756  // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
757  // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
758  case 0xE1:
759  case 0xE2:
760  case 0xE3:
761  case 0xE4:
762  case 0xE5:
763  case 0xE6:
764  case 0xE7:
765  case 0xE8:
766  case 0xE9:
767  case 0xEA:
768  case 0xEB:
769  case 0xEC:
770  case 0xEE:
771  case 0xEF:
772  {
773  if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
774  {
775  return token_type::parse_error;
776  }
777  break;
778  }
779 
780  // U+D000..U+D7FF: bytes ED 80..9F 80..BF
781  case 0xED:
782  {
783  if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
784  {
785  return token_type::parse_error;
786  }
787  break;
788  }
789 
790  // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
791  case 0xF0:
792  {
793  if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
794  {
795  return token_type::parse_error;
796  }
797  break;
798  }
799 
800  // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
801  case 0xF1:
802  case 0xF2:
803  case 0xF3:
804  {
805  if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
806  {
807  return token_type::parse_error;
808  }
809  break;
810  }
811 
812  // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
813  case 0xF4:
814  {
815  if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
816  {
817  return token_type::parse_error;
818  }
819  break;
820  }
821 
822  // remaining bytes (80..C1 and F5..FF) are ill-formed
823  default:
824  {
825  error_message = "invalid string: ill-formed UTF-8 byte";
826  return token_type::parse_error;
827  }
828  }
829  }
830  }
831 
836  bool scan_comment()
837  {
838  switch (get())
839  {
840  // single-line comments skip input until a newline or EOF is read
841  case '/':
842  {
843  while (true)
844  {
845  switch (get())
846  {
847  case '\n':
848  case '\r':
849  case std::char_traits<char_type>::eof():
850  case '\0':
851  return true;
852 
853  default:
854  break;
855  }
856  }
857  }
858 
859  // multi-line comments skip input until */ is read
860  case '*':
861  {
862  while (true)
863  {
864  switch (get())
865  {
866  case std::char_traits<char_type>::eof():
867  case '\0':
868  {
869  error_message = "invalid comment; missing closing '*/'";
870  return false;
871  }
872 
873  case '*':
874  {
875  switch (get())
876  {
877  case '/':
878  return true;
879 
880  default:
881  {
882  unget();
883  continue;
884  }
885  }
886  }
887 
888  default:
889  continue;
890  }
891  }
892  }
893 
894  // unexpected character after reading '/'
895  default:
896  {
897  error_message = "invalid comment; expecting '/' or '*' after '/'";
898  return false;
899  }
900  }
901  }
902 
903  JSON_HEDLEY_NON_NULL(2)
904  static void strtof(float& f, const char* str, char** endptr) noexcept
905  {
906  f = std::strtof(str, endptr);
907  }
908 
909  JSON_HEDLEY_NON_NULL(2)
910  static void strtof(double& f, const char* str, char** endptr) noexcept
911  {
912  f = std::strtod(str, endptr);
913  }
914 
915  JSON_HEDLEY_NON_NULL(2)
916  static void strtof(long double& f, const char* str, char** endptr) noexcept
917  {
918  f = std::strtold(str, endptr);
919  }
920 
961  token_type scan_number() // lgtm [cpp/use-of-goto]
962  {
963  // reset token_buffer to store the number's bytes
964  reset();
965 
966  // the type of the parsed number; initially set to unsigned; will be
967  // changed if minus sign, decimal point or exponent is read
968  token_type number_type = token_type::value_unsigned;
969 
970  // state (init): we just found out we need to scan a number
971  switch (current)
972  {
973  case '-':
974  {
975  add(current);
976  goto scan_number_minus;
977  }
978 
979  case '0':
980  {
981  add(current);
982  goto scan_number_zero;
983  }
984 
985  case '1':
986  case '2':
987  case '3':
988  case '4':
989  case '5':
990  case '6':
991  case '7':
992  case '8':
993  case '9':
994  {
995  add(current);
996  goto scan_number_any1;
997  }
998 
999  // all other characters are rejected outside scan_number()
1000  default: // LCOV_EXCL_LINE
1001  JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
1002  }
1003 
1004 scan_number_minus:
1005  // state: we just parsed a leading minus sign
1006  number_type = token_type::value_integer;
1007  switch (get())
1008  {
1009  case '0':
1010  {
1011  add(current);
1012  goto scan_number_zero;
1013  }
1014 
1015  case '1':
1016  case '2':
1017  case '3':
1018  case '4':
1019  case '5':
1020  case '6':
1021  case '7':
1022  case '8':
1023  case '9':
1024  {
1025  add(current);
1026  goto scan_number_any1;
1027  }
1028 
1029  default:
1030  {
1031  error_message = "invalid number; expected digit after '-'";
1032  return token_type::parse_error;
1033  }
1034  }
1035 
1036 scan_number_zero:
1037  // state: we just parse a zero (maybe with a leading minus sign)
1038  switch (get())
1039  {
1040  case '.':
1041  {
1042  add(decimal_point_char);
1043  goto scan_number_decimal1;
1044  }
1045 
1046  case 'e':
1047  case 'E':
1048  {
1049  add(current);
1050  goto scan_number_exponent;
1051  }
1052 
1053  default:
1054  goto scan_number_done;
1055  }
1056 
1057 scan_number_any1:
1058  // state: we just parsed a number 0-9 (maybe with a leading minus sign)
1059  switch (get())
1060  {
1061  case '0':
1062  case '1':
1063  case '2':
1064  case '3':
1065  case '4':
1066  case '5':
1067  case '6':
1068  case '7':
1069  case '8':
1070  case '9':
1071  {
1072  add(current);
1073  goto scan_number_any1;
1074  }
1075 
1076  case '.':
1077  {
1078  add(decimal_point_char);
1079  goto scan_number_decimal1;
1080  }
1081 
1082  case 'e':
1083  case 'E':
1084  {
1085  add(current);
1086  goto scan_number_exponent;
1087  }
1088 
1089  default:
1090  goto scan_number_done;
1091  }
1092 
1093 scan_number_decimal1:
1094  // state: we just parsed a decimal point
1095  number_type = token_type::value_float;
1096  switch (get())
1097  {
1098  case '0':
1099  case '1':
1100  case '2':
1101  case '3':
1102  case '4':
1103  case '5':
1104  case '6':
1105  case '7':
1106  case '8':
1107  case '9':
1108  {
1109  add(current);
1110  goto scan_number_decimal2;
1111  }
1112 
1113  default:
1114  {
1115  error_message = "invalid number; expected digit after '.'";
1116  return token_type::parse_error;
1117  }
1118  }
1119 
1120 scan_number_decimal2:
1121  // we just parsed at least one number after a decimal point
1122  switch (get())
1123  {
1124  case '0':
1125  case '1':
1126  case '2':
1127  case '3':
1128  case '4':
1129  case '5':
1130  case '6':
1131  case '7':
1132  case '8':
1133  case '9':
1134  {
1135  add(current);
1136  goto scan_number_decimal2;
1137  }
1138 
1139  case 'e':
1140  case 'E':
1141  {
1142  add(current);
1143  goto scan_number_exponent;
1144  }
1145 
1146  default:
1147  goto scan_number_done;
1148  }
1149 
1150 scan_number_exponent:
1151  // we just parsed an exponent
1152  number_type = token_type::value_float;
1153  switch (get())
1154  {
1155  case '+':
1156  case '-':
1157  {
1158  add(current);
1159  goto scan_number_sign;
1160  }
1161 
1162  case '0':
1163  case '1':
1164  case '2':
1165  case '3':
1166  case '4':
1167  case '5':
1168  case '6':
1169  case '7':
1170  case '8':
1171  case '9':
1172  {
1173  add(current);
1174  goto scan_number_any2;
1175  }
1176 
1177  default:
1178  {
1179  error_message =
1180  "invalid number; expected '+', '-', or digit after exponent";
1181  return token_type::parse_error;
1182  }
1183  }
1184 
1185 scan_number_sign:
1186  // we just parsed an exponent sign
1187  switch (get())
1188  {
1189  case '0':
1190  case '1':
1191  case '2':
1192  case '3':
1193  case '4':
1194  case '5':
1195  case '6':
1196  case '7':
1197  case '8':
1198  case '9':
1199  {
1200  add(current);
1201  goto scan_number_any2;
1202  }
1203 
1204  default:
1205  {
1206  error_message = "invalid number; expected digit after exponent sign";
1207  return token_type::parse_error;
1208  }
1209  }
1210 
1211 scan_number_any2:
1212  // we just parsed a number after the exponent or exponent sign
1213  switch (get())
1214  {
1215  case '0':
1216  case '1':
1217  case '2':
1218  case '3':
1219  case '4':
1220  case '5':
1221  case '6':
1222  case '7':
1223  case '8':
1224  case '9':
1225  {
1226  add(current);
1227  goto scan_number_any2;
1228  }
1229 
1230  default:
1231  goto scan_number_done;
1232  }
1233 
1234 scan_number_done:
1235  // unget the character after the number (we only read it to know that
1236  // we are done scanning a number)
1237  unget();
1238 
1239  char* endptr = nullptr; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
1240  errno = 0;
1241 
1242  // try to parse integers first and fall back to floats
1243  if (number_type == token_type::value_unsigned)
1244  {
1245  const auto x = std::strtoull(token_buffer.data(), &endptr, 10);
1246 
1247  // we checked the number format before
1248  JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1249 
1250  if (errno == 0)
1251  {
1252  value_unsigned = static_cast<number_unsigned_t>(x);
1253  if (value_unsigned == x)
1254  {
1255  return token_type::value_unsigned;
1256  }
1257  }
1258  }
1259  else if (number_type == token_type::value_integer)
1260  {
1261  const auto x = std::strtoll(token_buffer.data(), &endptr, 10);
1262 
1263  // we checked the number format before
1264  JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1265 
1266  if (errno == 0)
1267  {
1268  value_integer = static_cast<number_integer_t>(x);
1269  if (value_integer == x)
1270  {
1271  return token_type::value_integer;
1272  }
1273  }
1274  }
1275 
1276  // this code is reached if we parse a floating-point number or if an
1277  // integer conversion above failed
1278  strtof(value_float, token_buffer.data(), &endptr);
1279 
1280  // we checked the number format before
1281  JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1282 
1283  return token_type::value_float;
1284  }
1285 
1291  JSON_HEDLEY_NON_NULL(2)
1292  token_type scan_literal(const char_type* literal_text, const std::size_t length,
1293  token_type return_type)
1294  {
1295  JSON_ASSERT(std::char_traits<char_type>::to_char_type(current) == literal_text[0]);
1296  for (std::size_t i = 1; i < length; ++i)
1297  {
1298  if (JSON_HEDLEY_UNLIKELY(std::char_traits<char_type>::to_char_type(get()) != literal_text[i]))
1299  {
1300  error_message = "invalid literal";
1301  return token_type::parse_error;
1302  }
1303  }
1304  return return_type;
1305  }
1306 
1308  // input management
1310 
1312  void reset() noexcept
1313  {
1314  token_buffer.clear();
1315  token_string.clear();
1316  token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1317  }
1318 
1319  /*
1320  @brief get next character from the input
1321 
1322  This function provides the interface to the used input adapter. It does
1323  not throw in case the input reached EOF, but returns a
1324  `std::char_traits<char>::eof()` in that case. Stores the scanned characters
1325  for use in error messages.
1326 
1327  @return character read from the input
1328  */
1329  char_int_type get()
1330  {
1331  ++position.chars_read_total;
1332  ++position.chars_read_current_line;
1333 
1334  if (next_unget)
1335  {
1336  // just reset the next_unget variable and work with current
1337  next_unget = false;
1338  }
1339  else
1340  {
1341  current = ia.get_character();
1342  }
1343 
1344  if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1345  {
1346  token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1347  }
1348 
1349  if (current == '\n')
1350  {
1351  ++position.lines_read;
1352  position.chars_read_current_line = 0;
1353  }
1354 
1355  return current;
1356  }
1357 
1366  void unget()
1367  {
1368  next_unget = true;
1369 
1370  --position.chars_read_total;
1371 
1372  // in case we "unget" a newline, we have to also decrement the lines_read
1373  if (position.chars_read_current_line == 0)
1374  {
1375  if (position.lines_read > 0)
1376  {
1377  --position.lines_read;
1378  }
1379  }
1380  else
1381  {
1382  --position.chars_read_current_line;
1383  }
1384 
1385  if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1386  {
1387  JSON_ASSERT(!token_string.empty());
1388  token_string.pop_back();
1389  }
1390  }
1391 
1393  void add(char_int_type c)
1394  {
1395  token_buffer.push_back(static_cast<typename string_t::value_type>(c));
1396  }
1397 
1398  public:
1400  // value getters
1402 
1404  constexpr number_integer_t get_number_integer() const noexcept
1405  {
1406  return value_integer;
1407  }
1408 
1410  constexpr number_unsigned_t get_number_unsigned() const noexcept
1411  {
1412  return value_unsigned;
1413  }
1414 
1416  constexpr number_float_t get_number_float() const noexcept
1417  {
1418  return value_float;
1419  }
1420 
1422  string_t& get_string()
1423  {
1424  return token_buffer;
1425  }
1426 
1428  // diagnostics
1430 
1432  constexpr position_t get_position() const noexcept
1433  {
1434  return position;
1435  }
1436 
1440  std::string get_token_string() const
1441  {
1442  // escape control characters
1443  std::string result;
1444  for (const auto c : token_string)
1445  {
1446  if (static_cast<unsigned char>(c) <= '\x1F')
1447  {
1448  // escape control characters
1449  std::array<char, 9> cs{{}};
1450  (std::snprintf)(cs.data(), cs.size(), "<U+%.4X>", static_cast<unsigned char>(c)); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
1451  result += cs.data();
1452  }
1453  else
1454  {
1455  // add character as is
1456  result.push_back(static_cast<std::string::value_type>(c));
1457  }
1458  }
1459 
1460  return result;
1461  }
1462 
1464  JSON_HEDLEY_RETURNS_NON_NULL
1465  constexpr const char* get_error_message() const noexcept
1466  {
1467  return error_message;
1468  }
1469 
1471  // actual scanner
1473 
1478  bool skip_bom()
1479  {
1480  if (get() == 0xEF)
1481  {
1482  // check if we completely parse the BOM
1483  return get() == 0xBB && get() == 0xBF;
1484  }
1485 
1486  // the first character is not the beginning of the BOM; unget it to
1487  // process is later
1488  unget();
1489  return true;
1490  }
1491 
1492  void skip_whitespace()
1493  {
1494  do
1495  {
1496  get();
1497  }
1498  while (current == ' ' || current == '\t' || current == '\n' || current == '\r');
1499  }
1500 
1501  token_type scan()
1502  {
1503  // initially, skip the BOM
1504  if (position.chars_read_total == 0 && !skip_bom())
1505  {
1506  error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
1507  return token_type::parse_error;
1508  }
1509 
1510  // read next character and ignore whitespace
1511  skip_whitespace();
1512 
1513  // ignore comments
1514  while (ignore_comments && current == '/')
1515  {
1516  if (!scan_comment())
1517  {
1518  return token_type::parse_error;
1519  }
1520 
1521  // skip following whitespace
1522  skip_whitespace();
1523  }
1524 
1525  switch (current)
1526  {
1527  // structural characters
1528  case '[':
1529  return token_type::begin_array;
1530  case ']':
1531  return token_type::end_array;
1532  case '{':
1533  return token_type::begin_object;
1534  case '}':
1535  return token_type::end_object;
1536  case ':':
1537  return token_type::name_separator;
1538  case ',':
1539  return token_type::value_separator;
1540 
1541  // literals
1542  case 't':
1543  {
1544  std::array<char_type, 4> true_literal = {{char_type('t'), char_type('r'), char_type('u'), char_type('e')}};
1545  return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true);
1546  }
1547  case 'f':
1548  {
1549  std::array<char_type, 5> false_literal = {{char_type('f'), char_type('a'), char_type('l'), char_type('s'), char_type('e')}};
1550  return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false);
1551  }
1552  case 'n':
1553  {
1554  std::array<char_type, 4> null_literal = {{char_type('n'), char_type('u'), char_type('l'), char_type('l')}};
1555  return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null);
1556  }
1557 
1558  // string
1559  case '\"':
1560  return scan_string();
1561 
1562  // number
1563  case '-':
1564  case '0':
1565  case '1':
1566  case '2':
1567  case '3':
1568  case '4':
1569  case '5':
1570  case '6':
1571  case '7':
1572  case '8':
1573  case '9':
1574  return scan_number();
1575 
1576  // end of input (the null byte is needed when parsing from
1577  // string literals)
1578  case '\0':
1579  case std::char_traits<char_type>::eof():
1580  return token_type::end_of_input;
1581 
1582  // error
1583  default:
1584  error_message = "invalid literal";
1585  return token_type::parse_error;
1586  }
1587  }
1588 
1589  private:
// input adapter; get() reads characters from it via get_character()
1591  InputAdapterType ia;
1592 
// whether scan() skips comments (set once in the constructor)
1594  const bool ignore_comments = false;
1595 
// the current character, as last delivered by get()
1597  char_int_type current = std::char_traits<char_type>::eof();
1598 
// set by unget(); makes the next get() deliver `current` again
1600  bool next_unget = false;
1601 
// position counters updated by get() and unget()
1603  position_t position {};
1604 
// raw characters read for the current token (used by get_token_string())
1606  std::vector<char_type> token_string {};
1607 
// buffer for the value of the current string or number token
1609  string_t token_buffer {};
1610 
// message describing the most recent error; returned by get_error_message()
1612  const char* error_message = "";
1613 
1614  // number values
// results of scan_number(); which one is valid depends on the token type
1615  number_integer_t value_integer = 0;
1616  number_unsigned_t value_unsigned = 0;
1617  number_float_t value_float = 0;
1618 
// decimal point character of the C locale, cached by the constructor;
// scan_number() stores it in place of '.'
1620  const char_int_type decimal_point_char = '.';
1621 };
1622 } // namespace detail
1623 } // namespace nlohmann
Definition: lexer.hpp:27
token_type
token types for the parser
Definition: lexer.hpp:31
@ value_float
a floating-point number – use get_number_float() for actual value
@ begin_array
the character for array begin [
@ value_string
a string – use get_string() for actual value
@ end_array
the character for array end ]
@ uninitialized
indicating the scanner is uninitialized
@ parse_error
indicating a parse error
@ value_integer
a signed integer – use get_number_integer() for actual value
@ value_separator
the value separator ,
@ end_object
the character for object end }
@ begin_object
the character for object begin {
@ value_unsigned
an unsigned integer – use get_number_unsigned() for actual value
@ end_of_input
indicating the end of the input buffer
@ literal_or_value
a literal or the beginning of a value (only for diagnostics)
JSON_HEDLEY_RETURNS_NON_NULL static JSON_HEDLEY_CONST const char * token_type_name(const token_type t) noexcept
return the name of a token_type value (only used for error messages)
Definition: lexer.hpp:54
lexical analysis
Definition: lexer.hpp:104
string_t & get_string()
return current string value (implicitly resets the token; useful only once)
Definition: lexer.hpp:1422
bool skip_bom()
skip the UTF-8 byte order mark
Definition: lexer.hpp:1478
constexpr position_t get_position() const noexcept
return position of last read token
Definition: lexer.hpp:1432
constexpr number_integer_t get_number_integer() const noexcept
return integer value
Definition: lexer.hpp:1404
constexpr JSON_HEDLEY_RETURNS_NON_NULL const char * get_error_message() const noexcept
return syntax error message
Definition: lexer.hpp:1465
constexpr number_unsigned_t get_number_unsigned() const noexcept
return unsigned integer value
Definition: lexer.hpp:1410
constexpr number_float_t get_number_float() const noexcept
return floating-point value
Definition: lexer.hpp:1416
std::string get_token_string() const
return the last read token (for errors only).
Definition: lexer.hpp:1440
namespace for Niels Lohmann
Definition: adl_serializer.hpp:12
struct to capture the start position of the current token
Definition: position_t.hpp:11
std::size_t lines_read
the number of lines read
Definition: position_t.hpp:17
std::size_t chars_read_current_line
the number of characters read in the current line
Definition: position_t.hpp:15
std::size_t chars_read_total
the total number of characters read
Definition: position_t.hpp:13