Merge pull request #252 from LeszekSwirski/parse-error

lemire · web-flow · commit 0e7a10ad8091 · 2024-08-03T10:08:48.000-04:00
Record parse failure reason and location
diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
@@ -234,6 +234,25 @@ void loop_parse_if_eight_digits(const char*& p, const char* const pend, uint64_t
   }
 }
 
+enum class parse_error {  // Reason recorded when parse_number_string rejects its input.
+  no_error,  // Default value: no failure has been recorded.
+  // [JSON-only] The minus sign must be followed by an integer.
+  missing_integer_after_sign,
+  // A sign must be followed by an integer or dot.
+  missing_integer_or_dot_after_sign,
+  // [JSON-only] The integer part must not have leading zeros.
+  leading_zeros_in_integer_part,
+  // [JSON-only] The integer part must have at least one digit.
+  no_digits_in_integer_part,
+  // [JSON-only] If there is a decimal point, there must be digits in the
+  // fractional part.
+  no_digits_in_fractional_part,
+  // The mantissa must have at least one digit.
+  no_digits_in_mantissa,
+  // Scientific notation requires an exponential part.
+  missing_exponential_part,
+};
+
 template <typename UC>
 struct parsed_number_string_t {
   int64_t exponent{0};
@@ -245,11 +264,22 @@ struct parsed_number_string_t {
   // contains the range of the significant digits
   span<const UC> integer{};  // non-nullable
   span<const UC> fraction{}; // nullable
+  parse_error error{parse_error::no_error};
 };
 
 using byte_span = span<const char>;
 using parsed_number_string = parsed_number_string_t<char>;
 
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t<UC>
+report_parse_error(UC const* p, parse_error error) {  // Builds the result returned for a rejected input.
+  parsed_number_string_t<UC> answer;  // default-constructed; only the fields below are set
+  answer.valid = false;  // mark the parse as failed
+  answer.lastmatch = p;  // position associated with the failure, as chosen by the caller
+  answer.error = error;  // reason for the failure
+  return answer;
+}
+
 // Assuming that you use no more than 19 digits, this will
 // parse an ASCII string.
 template <typename UC>
@@ -269,15 +299,16 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
 #endif
     ++p;
     if (p == pend) {
-      return answer;
+        return report_parse_error<UC>(
+            p, parse_error::missing_integer_or_dot_after_sign);
     }
     if (fmt & FASTFLOAT_JSONFMT) {
       if (!is_integer(*p)) { // a sign must be followed by an integer
-        return answer;
+        return report_parse_error<UC>(p, parse_error::missing_integer_after_sign);
       }    
     } else {
       if (!is_integer(*p) && (*p != decimal_point)) { // a sign must be followed by an integer or the dot
-        return answer;
+        return report_parse_error<UC>(p, parse_error::missing_integer_or_dot_after_sign);
       }
     }
   }
@@ -297,8 +328,12 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
   answer.integer = span<const UC>(start_digits, size_t(digit_count));
   if (fmt & FASTFLOAT_JSONFMT) {
     // at least 1 digit in integer part, without leading zeros
-    if (digit_count == 0 || (start_digits[0] == UC('0') && digit_count > 1)) {
-      return answer;
+    if (digit_count == 0) {
+      return report_parse_error<UC>(p, parse_error::no_digits_in_integer_part);
+    }
+    if ((start_digits[0] == UC('0') && digit_count > 1)) {
+      return report_parse_error<UC>(start_digits,
+                                    parse_error::leading_zeros_in_integer_part);
     }
   }
 
@@ -323,11 +358,10 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
   if (fmt & FASTFLOAT_JSONFMT) {
     // at least 1 digit in fractional part
     if (has_decimal_point && exponent == 0) {
-      return answer;
+      return report_parse_error<UC>(p, parse_error::no_digits_in_fractional_part);
     }
-  } 
-  else if (digit_count == 0) { // we must have encountered at least one integer!
-    return answer;
+  } else if (digit_count == 0) {  // we must have encountered at least one integer!
+    return report_parse_error<UC>(p, parse_error::no_digits_in_mantissa);
   }
   int64_t exp_number = 0;            // explicit exponential part
   if ( ((fmt & chars_format::scientific) &&
@@ -350,8 +384,10 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
     }
     if ((p == pend) || !is_integer(*p)) {
       if(!(fmt & chars_format::fixed)) {
-        // We are in error.
-        return answer;
+        // The exponential part is invalid for scientific notation, so it must
+        // be a trailing token for fixed notation. However, fixed notation is
+        // disabled, so report a scientific notation error.
+        return report_parse_error<UC>(p, parse_error::missing_exponential_part);
       }
       // Otherwise, we will be ignoring the 'e'.
       p = location_of_e;
@@ -368,7 +404,9 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
     }
   } else {
     // If it scientific and not fixed, we have to bail out.
-    if((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) { return answer; }
+    if ((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) {
+      return report_parse_error<UC>(p, parse_error::missing_exponential_part);
+    }
   }
   answer.lastmatch = p;
   answer.valid = true;
diff --git a/tests/json_fmt.cpp b/tests/json_fmt.cpp
@@ -45,6 +45,15 @@ struct AcceptedValue {
   ExpectedResult expected;
 };
 
+struct RejectReason {  // Expected outcome when parsing a rejected input.
+  fast_float::parse_error error;  // expected value of the parse result's `error`
+  intptr_t location_offset;  // expected `lastmatch` offset from the start of the input
+};
+struct RejectedValue {  // One negative test case for JSON-format parsing.
+  std::string input;  // text handed to the parser; must be rejected
+  RejectReason reason;  // error and location the parser is expected to report
+};
+
 int main() {
   const std::vector<AcceptedValue> accept{
       {"-0.2", {-0.2, ""}},
@@ -55,8 +64,18 @@ int main() {
       {"1e", {1., "e"}},
       {"1e+", {1., "e+"}},
       {"inf", {std::numeric_limits<double>::infinity(), ""}}};
-  const std::vector<std::string> reject{"-.2", "00.02", "0.e+1", "00.e+1",
-                                        ".25", "+0.25", "inf",   "nan(snan)"};
+  const std::vector<RejectedValue> reject{
+      {"-.2", {fast_float::parse_error::missing_integer_after_sign, 1}},
+      {"00.02", {fast_float::parse_error::leading_zeros_in_integer_part, 0}},
+      {"0.e+1", {fast_float::parse_error::no_digits_in_fractional_part, 2}},
+      {"00.e+1", {fast_float::parse_error::leading_zeros_in_integer_part, 0}},
+      {".25", {fast_float::parse_error::no_digits_in_integer_part, 0}},
+      // The following inputs are invalid JSON from their very first character,
+      // so the whole input is treated as trailing junk and the reported error
+      // is the missing integer digits in the empty prefix, at offset 0.
+      {"+0.25", {fast_float::parse_error::no_digits_in_integer_part, 0}},
+      {"inf", {fast_float::parse_error::no_digits_in_integer_part, 0}},
+      {"nan(snan)", {fast_float::parse_error::no_digits_in_integer_part, 0}}};
 
   for (std::size_t i = 0; i < accept.size(); ++i)
   {
@@ -80,7 +99,7 @@ int main() {
 
   for (std::size_t i = 0; i < reject.size(); ++i)
   {
-    const auto& s = reject[i];
+    const auto& s = reject[i].input;
     double result;
     auto answer = fast_float::from_chars(s.data(), s.data() + s.size(), result, fast_float::chars_format::json);
     if (answer.ec == std::errc()) {
@@ -89,6 +108,31 @@ int main() {
     }
   }
 
+  for (std::size_t i = 0; i < reject.size(); ++i)
+  {
+    const auto& f = reject[i].input;
+    const auto& expected_reason = reject[i].reason;
+    auto answer = fast_float::parse_number_string(
+        f.data(), f.data() + f.size(),
+        fast_float::parse_options(fast_float::chars_format::json));
+    if (answer.valid) {
+      std::cerr << "json parse accepted invalid json " << f << std::endl;
+      return EXIT_FAILURE;
+    }
+    if (answer.error != expected_reason.error) {
+      std::cerr << "json parse failure had invalid error reason " << f
+                << std::endl;
+      return EXIT_FAILURE;
+    }
+    intptr_t error_location = answer.lastmatch - f.data();
+    if (error_location != expected_reason.location_offset) {
+      std::cerr << "json parse failure had invalid error location " << f
+                << " (expected " << expected_reason.location_offset << " got "
+                << error_location << ")" << std::endl;
+      return EXIT_FAILURE;
+    }
+  }
+
   if(main_readme() != EXIT_SUCCESS) { return EXIT_FAILURE; }
   if(main_readme2() != EXIT_SUCCESS) { return EXIT_FAILURE; }