Skip to content

Commit db31f6a

Browse files
BUG: make read_csv be able to read large floating numbers into float (#62542)
Co-authored-by: Matthew Roeschke <[email protected]>
1 parent d8b3ff3 commit db31f6a

File tree

3 files changed

+84
-0
lines changed

3 files changed

+84
-0
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1080,6 +1080,7 @@ I/O
10801080
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
10811081
- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
10821082
- Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as python integers. (:issue:`51295`)
1083+
- Bug in :meth:`read_csv` with ``engine="c"`` reading a column as strings when large float values are preceded by integers. Now reads the column as floats. (:issue:`51295`)
10831084
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
10841085
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
10851086
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)

pandas/_libs/parsers.pyx

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,6 +1070,10 @@ cdef class TextReader:
10701070
else:
10711071
col_res = None
10721072
for dt in self.dtype_cast_order:
1073+
if (dt.kind in "iu" and
1074+
self._column_has_float(i, start, end, na_filter, na_hashset)):
1075+
continue
1076+
10731077
try:
10741078
col_res, na_count = self._convert_with_dtype(
10751079
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
@@ -1347,6 +1351,58 @@ cdef class TextReader:
13471351
else:
13481352
return None
13491353

1354+
cdef bint _column_has_float(self, Py_ssize_t col,
                            int64_t start, int64_t end,
                            bint na_filter, kh_str_starts_t *na_hashset):
    """
    Check if the column contains any float number.

    Walks ``end - start`` words of column ``col`` beginning at row
    ``start``. When ``na_filter`` is set, words found in ``na_hashset``
    are skipped.

    A word is reported as float as soon as one of these is seen while
    scanning it left to right:

    * the parser's ``decimal`` character, or
    * ``e`` / ``E`` after at least one digit.

    Before the first digit only spaces and ``+`` / ``-`` are tolerated.
    Any other unexpected character means the word is not numeric, and
    the scan of the whole column stops with ``False``.

    Returns
    -------
    bint
        True if a float-looking word was found before any non-numeric
        word, False otherwise (including when every word is a plain
        integer or an NA value).
    """
    cdef:
        Py_ssize_t i, j, lines = end - start
        coliter_t it
        const char *word = NULL
        char ch
        # Loop-invariant: read the decimal separator once.
        char decimal = self.parser.decimal
        # Typed as a C boolean so the per-character loop does not
        # create or test Python objects.
        bint found_first_digit

    coliter_setup(&it, self.parser, col, start)

    for i in range(lines):
        COLITER_NEXT(it, word)

        if na_filter and kh_get_str_starts_item(na_hashset, word):
            continue

        found_first_digit = False
        j = 0
        # Words are NUL-terminated C strings.
        while word[j] != 0:
            ch = word[j]
            # Plain C comparisons are used instead of ``in`` on
            # ``const char *`` variables, which would go through
            # Python-level containment for every character.
            if ch == decimal:
                # The decimal check comes first, so it wins even when
                # no digit has been seen yet (e.g. ".5").
                return True
            elif b"0" <= ch <= b"9":
                found_first_digit = True
            elif not found_first_digit:
                if ch != b" " and ch != b"+" and ch != b"-":
                    # word isn't numeric
                    return False
            elif ch == b"e" or ch == b"E":
                # preceding chars indicate numeric and
                # current char indicates float
                return True
            else:
                # previous characters indicate numeric,
                # current character shows otherwise
                return False
            j += 1

    return False
13501406

13511407
# Factor out code common to TextReader.__dealloc__ and TextReader.close
13521408
# It cannot be a class method, since calling self.close() in __dealloc__

pandas/tests/io/parser/common/test_float.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,3 +77,30 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
7777
expected = DataFrame({"data": [f"10E{exp}"]})
7878

7979
tm.assert_frame_equal(result, expected)
80+
81+
82+
@pytest.mark.parametrize(
    "value, expected_value",
    [
        ("32.0", 32.0),
        ("32e0", 32.0),
        ("3.2e1", 32.0),
        ("3.2e80", 3.2e80),
        ("3.2e-80", 3.2e-80),
        ("18446744073709551616.0", float(1 << 64)),  # loses precision
        ("18446744073709551616.5", float(1 << 64)),  # loses precision
        ("36893488147419103232.3", float(1 << 65)),  # loses precision
    ],
)
def test_small_int_followed_by_float(
    all_parsers_all_precisions, value, expected_value, request
):
    """
    A column holding the integer ``42`` followed by a float-formatted
    ``value`` must come back as floats: ``[42.0, expected_value]``.
    """
    # GH#51295
    reader, float_precision = all_parsers_all_precisions

    # One header line, one small integer row, then the parametrized row.
    csv_text = "\n".join(("data", "42", value))
    expected = DataFrame({"data": [42.0, expected_value]})

    result = reader.read_csv(
        StringIO(csv_text), float_precision=float_precision
    )

    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)