Skip to content

Commit 9cf6dc5

Browse files
committed
fix: handle overflow during exponent computation
1 parent ad42d8d commit 9cf6dc5

File tree

2 files changed

+48
-58
lines changed

2 files changed

+48
-58
lines changed

pandas/_libs/src/parser/tokenizer.c

Lines changed: 17 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
2121

2222
#include <ctype.h>
2323
#include <float.h>
24+
#include <limits.h>
2425
#include <math.h>
2526
#include <stdbool.h>
2627
#include <stdlib.h>
@@ -1620,9 +1621,9 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
16201621
}
16211622

16221623
double number = 0.;
1623-
int exponent = 0;
1624-
int num_digits = 0;
1625-
int num_decimals = 0;
1624+
long int exponent = 0;
1625+
long int num_digits = 0;
1626+
long int num_decimals = 0;
16261627

16271628
// Process string of digits.
16281629
while (isdigit_ascii(*p)) {
@@ -1671,39 +1672,29 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
16711672
if (maybe_int != NULL)
16721673
*maybe_int = 0;
16731674

1674-
// Handle optional sign
1675-
negative = 0;
1676-
switch (*++p) {
1677-
case '-':
1678-
negative = 1;
1679-
PD_FALLTHROUGH; // Fall through to increment position.
1680-
case '+':
1681-
p++;
1682-
break;
1683-
}
1675+
// move past scientific notation
1676+
p++;
16841677

1685-
// Process string of digits.
1686-
num_digits = 0;
1687-
int n = 0;
1688-
while (num_digits < max_digits && isdigit_ascii(*p)) {
1689-
n = n * 10 + (*p - '0');
1690-
num_digits++;
1691-
p++;
1678+
char *endptr;
1679+
errno = 0;
1680+
long int n = strtol(p, &endptr, 10);
1681+
1682+
if (errno == ERANGE || checked_add(exponent, n, &exponent)) {
1683+
errno = 0;
1684+
exponent = n;
16921685
}
16931686

1694-
if (negative)
1695-
exponent -= n;
1696-
else
1697-
exponent += n;
1687+
int num_digits = endptr - p;
16981688

16991689
// If no digits after the 'e'/'E', un-consume it.
17001690
if (num_digits == 0)
17011691
p--;
1692+
else
1693+
p = endptr;
17021694
}
17031695

17041696
if (exponent > 308) {
1705-
*error = ERANGE;
1706-
return HUGE_VAL;
1697+
number = number < 0 ? -HUGE_VAL : HUGE_VAL;
17071698
} else if (exponent > 0) {
17081699
number *= e[exponent];
17091700
} else if (exponent < -308) { // Subnormal
@@ -1718,9 +1709,6 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
17181709
number /= e[-exponent];
17191710
}
17201711

1721-
if (number == HUGE_VAL || number == -HUGE_VAL)
1722-
*error = ERANGE;
1723-
17241712
if (skip_trailing) {
17251713
// Skip trailing whitespace.
17261714
while (isspace_ascii(*p))

pandas/tests/io/parser/common/test_float.py

Lines changed: 31 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
pytestmark = pytest.mark.filterwarnings(
1717
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
1818
)
19-
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
2019
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
2120

2221

@@ -42,40 +41,30 @@ def test_scientific_no_exponent(all_parsers_all_precisions):
4241

4342

4443
@pytest.mark.parametrize(
45-
"neg_exp",
44+
"value, expected_value",
4645
[
47-
-617,
48-
-100000,
49-
-99999999999999999,
46+
("10E-617", 0.0),
47+
("10E-100000", 0.0),
48+
("-10E-100000", 0.0),
49+
("10e-99999999999", 0.0),
50+
("10e-999999999999", 0.0),
51+
("10e-9999999999999", 0.0),
52+
("10E999", np.inf),
53+
("-10e99999999999", -np.inf),
54+
("10e99999999999", np.inf),
55+
("10e999999999999", np.inf),
56+
("10e9999999999999", np.inf),
57+
("50060e8007123400", np.inf),
58+
("-50060e8007123400", -np.inf),
5059
],
5160
)
52-
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
53-
# GH#38753
61+
def test_large_exponent(all_parsers_all_precisions, value, expected_value):
62+
# GH#38753; GH#38794; GH#62740
5463
parser, precision = all_parsers_all_precisions
5564

56-
data = f"data\n10E{neg_exp}"
65+
data = f"data\n{value}"
5766
result = parser.read_csv(StringIO(data), float_precision=precision)
58-
expected = DataFrame({"data": [0.0]})
59-
tm.assert_frame_equal(result, expected)
60-
61-
62-
@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
63-
@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
64-
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
65-
# GH#38753
66-
parser, precision = all_parsers_all_precisions
67-
data = f"data\n10E{exp}"
68-
result = parser.read_csv(StringIO(data), float_precision=precision)
69-
if precision == "round_trip":
70-
if exp == 999999999999999999 and is_platform_linux():
71-
mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
72-
request.applymarker(mark)
73-
74-
value = np.inf if exp > 0 else 0.0
75-
expected = DataFrame({"data": [value]})
76-
else:
77-
expected = DataFrame({"data": [f"10E{exp}"]})
78-
67+
expected = DataFrame({"data": [expected_value]})
7968
tm.assert_frame_equal(result, expected)
8069

8170

@@ -104,3 +93,16 @@ def test_small_int_followed_by_float(
10493
expected = DataFrame({"data": [42.0, expected_value]})
10594

10695
tm.assert_frame_equal(result, expected)
96+
97+
@pytest.mark.parametrize(
98+
"value",
99+
["81e31d04049863b72", "d81e31d04049863b72", "81e3104049863b72"]
100+
)
101+
def test_invalid_float_number(all_parsers_all_precisions, value):
102+
# GH#62617
103+
parser, precision = all_parsers_all_precisions
104+
data = f"h1,h2,h3\ndata1,{value},data3"
105+
106+
result = parser.read_csv(StringIO(data), float_precision=precision)
107+
expected = DataFrame({"h1": ["data1"], "h2": [value], "h3": "data3"})
108+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)