fix: handle overflow during exponent computation

Alvaro-Kothe · Alvaro-Kothe · commit 9cf6dc5f24b3 · 2025-10-18T19:27:29.000-03:00
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -21,6 +21,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 
 #include <ctype.h>
 #include <float.h>
+#include <limits.h>
 #include <math.h>
 #include <stdbool.h>
 #include <stdlib.h>
@@ -1620,9 +1621,9 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
   }
 
   double number = 0.;
-  int exponent = 0;
-  int num_digits = 0;
-  int num_decimals = 0;
+  long int exponent = 0;
+  long int num_digits = 0;
+  long int num_decimals = 0;
 
   // Process string of digits.
   while (isdigit_ascii(*p)) {
@@ -1671,39 +1672,29 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
     if (maybe_int != NULL)
       *maybe_int = 0;
 
-    // Handle optional sign
-    negative = 0;
-    switch (*++p) {
-    case '-':
-      negative = 1;
-      PD_FALLTHROUGH; // Fall through to increment position.
-    case '+':
-      p++;
-      break;
-    }
+    // Move past the exponent marker ('e'/'E').
+    p++;
 
-    // Process string of digits.
-    num_digits = 0;
-    int n = 0;
-    while (num_digits < max_digits && isdigit_ascii(*p)) {
-      n = n * 10 + (*p - '0');
-      num_digits++;
-      p++;
+    char *endptr;
+    errno = 0;
+    long int n = strtol(p, &endptr, 10);
+
+    if (errno == ERANGE || checked_add(exponent, n, &exponent)) {
+      errno = 0;
+      exponent = n;
     }
 
-    if (negative)
-      exponent -= n;
-    else
-      exponent += n;
+    int num_digits = endptr - p;
 
     // If no digits after the 'e'/'E', un-consume it.
     if (num_digits == 0)
       p--;
+    else
+      p = endptr;
   }
 
   if (exponent > 308) {
-    *error = ERANGE;
-    return HUGE_VAL;
+    number = number < 0 ? -HUGE_VAL : HUGE_VAL;
   } else if (exponent > 0) {
     number *= e[exponent];
   } else if (exponent < -308) { // Subnormal
@@ -1718,9 +1709,6 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
     number /= e[-exponent];
   }
 
-  if (number == HUGE_VAL || number == -HUGE_VAL)
-    *error = ERANGE;
-
   if (skip_trailing) {
     // Skip trailing whitespace.
     while (isspace_ascii(*p))
diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py
@@ -16,7 +16,6 @@
 pytestmark = pytest.mark.filterwarnings(
     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
 )
-xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
 skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
 
 
@@ -42,40 +41,30 @@ def test_scientific_no_exponent(all_parsers_all_precisions):
 
 
 @pytest.mark.parametrize(
-    "neg_exp",
+    "value, expected_value",
     [
-        -617,
-        -100000,
-        -99999999999999999,
+        ("10E-617", 0.0),
+        ("10E-100000", 0.0),
+        ("-10E-100000", 0.0),
+        ("10e-99999999999", 0.0),
+        ("10e-999999999999", 0.0),
+        ("10e-9999999999999", 0.0),
+        ("10E999", np.inf),
+        ("-10e99999999999", -np.inf),
+        ("10e99999999999", np.inf),
+        ("10e999999999999", np.inf),
+        ("10e9999999999999", np.inf),
+        ("50060e8007123400", np.inf),
+        ("-50060e8007123400", -np.inf),
     ],
 )
-def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
-    # GH#38753
+def test_large_exponent(all_parsers_all_precisions, value, expected_value):
+    # GH#38753; GH#38794; GH#62740
     parser, precision = all_parsers_all_precisions
 
-    data = f"data\n10E{neg_exp}"
+    data = f"data\n{value}"
     result = parser.read_csv(StringIO(data), float_precision=precision)
-    expected = DataFrame({"data": [0.0]})
-    tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow  # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
-@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
-def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
-    # GH#38753
-    parser, precision = all_parsers_all_precisions
-    data = f"data\n10E{exp}"
-    result = parser.read_csv(StringIO(data), float_precision=precision)
-    if precision == "round_trip":
-        if exp == 999999999999999999 and is_platform_linux():
-            mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
-            request.applymarker(mark)
-
-        value = np.inf if exp > 0 else 0.0
-        expected = DataFrame({"data": [value]})
-    else:
-        expected = DataFrame({"data": [f"10E{exp}"]})
-
+    expected = DataFrame({"data": [expected_value]})
     tm.assert_frame_equal(result, expected)
 
 
@@ -104,3 +93,16 @@ def test_small_int_followed_by_float(
     expected = DataFrame({"data": [42.0, expected_value]})
 
     tm.assert_frame_equal(result, expected)
+
+@pytest.mark.parametrize(
+        "value",
+        ["81e31d04049863b72", "d81e31d04049863b72", "81e3104049863b72"]
+        )
+def test_invalid_float_number(all_parsers_all_precisions, value):
+    # GH#62617
+    parser, precision = all_parsers_all_precisions
+    data = f"h1,h2,h3\ndata1,{value},data3"
+
+    result = parser.read_csv(StringIO(data), float_precision=precision)
+    expected = DataFrame({"h1": ["data1"], "h2": [value], "h3": "data3"})
+    tm.assert_frame_equal(result, expected)