Skip to content

Commit 21130b1

Browse files
authored
BUG: make read_csv read large integers (>64bits) as python integers with C engine (#62582)
1 parent 2560788 commit 21130b1

File tree

3 files changed

+72
-14
lines changed

3 files changed

+72
-14
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1079,6 +1079,7 @@ I/O
10791079
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
10801080
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
10811081
- Bug in :meth:`read_csv` where the order of the ``na_values`` causes inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)
1082+
- Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. They are now read as Python integers. (:issue:`51295`)
10821083
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
10831084
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
10841085
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)

pandas/_libs/parsers.pyx

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ from cpython.exc cimport (
2929
PyErr_Fetch,
3030
PyErr_Occurred,
3131
)
32+
from cpython.long cimport PyLong_FromString
3233
from cpython.object cimport PyObject
3334
from cpython.ref cimport (
3435
Py_INCREF,
@@ -1081,9 +1082,13 @@ cdef class TextReader:
10811082
np.dtype("object"), i, start, end, 0,
10821083
0, na_hashset, na_fset)
10831084
except OverflowError:
1084-
col_res, na_count = self._convert_with_dtype(
1085-
np.dtype("object"), i, start, end, na_filter,
1086-
0, na_hashset, na_fset)
1085+
try:
1086+
col_res, na_count = _try_pylong(self.parser, i, start,
1087+
end, na_filter, na_hashset)
1088+
except ValueError:
1089+
col_res, na_count = self._convert_with_dtype(
1090+
np.dtype("object"), i, start, end, 0,
1091+
0, na_hashset, na_fset)
10871092

10881093
if col_res is not None:
10891094
break
@@ -1873,6 +1878,36 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,
18731878

18741879
return 0
18751880

1881+
cdef _try_pylong(parser_t *parser, Py_ssize_t col,
1882+
int64_t line_start, int64_t line_end,
1883+
bint na_filter, kh_str_starts_t *na_hashset):
1884+
cdef:
1885+
int na_count = 0
1886+
Py_ssize_t lines
1887+
coliter_t it
1888+
const char *word = NULL
1889+
ndarray[object] result
1890+
object NA = na_values[np.object_]
1891+
1892+
lines = line_end - line_start
1893+
result = np.empty(lines, dtype=object)
1894+
coliter_setup(&it, parser, col, line_start)
1895+
1896+
for i in range(lines):
1897+
COLITER_NEXT(it, word)
1898+
if na_filter and kh_get_str_starts_item(na_hashset, word):
1899+
# in the hash table
1900+
na_count += 1
1901+
result[i] = NA
1902+
continue
1903+
1904+
py_int = PyLong_FromString(word, NULL, 10)
1905+
if py_int is None:
1906+
raise ValueError("Invalid integer ", word)
1907+
result[i] = py_int
1908+
1909+
return result, na_count
1910+
18761911

18771912
# -> tuple[ndarray[bool], int]
18781913
cdef _try_bool_flex(parser_t *parser, int64_t col,

pandas/tests/io/parser/common/test_ints.py

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -144,17 +144,22 @@ def test_int64_overflow(all_parsers, conv, request):
144144
if parser.engine == "pyarrow":
145145
mark = pytest.mark.xfail(reason="parses to float64")
146146
request.applymarker(mark)
147+
elif parser.engine == "python":
148+
mark = pytest.mark.xfail(
149+
reason="TODO: Python engine reads bigint as string"
150+
)
151+
request.applymarker(mark)
147152

148153
result = parser.read_csv(StringIO(data))
149154
expected = DataFrame(
150155
[
151-
"00013007854817840016671868",
152-
"00013007854817840016749251",
153-
"00013007854817840016754630",
154-
"00013007854817840016781876",
155-
"00013007854817840017028824",
156-
"00013007854817840017963235",
157-
"00013007854817840018860166",
156+
13007854817840016671868,
157+
13007854817840016749251,
158+
13007854817840016754630,
159+
13007854817840016781876,
160+
13007854817840017028824,
161+
13007854817840017963235,
162+
13007854817840018860166,
158163
],
159164
columns=["ID"],
160165
)
@@ -185,7 +190,7 @@ def test_int64_overflow(all_parsers, conv, request):
185190
)
186191
def test_int64_uint64_range(all_parsers, val):
187192
# These numbers fall right inside the int64-uint64
188-
# range, so they should be parsed as string.
193+
# range, so they should be parsed as integer.
189194
parser = all_parsers
190195
result = parser.read_csv(StringIO(str(val)), header=None)
191196

@@ -197,13 +202,30 @@ def test_int64_uint64_range(all_parsers, val):
197202
@pytest.mark.parametrize(
198203
"val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
199204
)
200-
def test_outside_int64_uint64_range(all_parsers, val):
205+
def test_outside_int64_uint64_range(all_parsers, val, request):
201206
# These numbers fall just outside the int64-uint64
202-
# range, so they should be parsed as string.
207+
# range, so they should be parsed as Python integers (object dtype).
203208
parser = all_parsers
209+
if parser.engine == "python":
210+
mark = pytest.mark.xfail(reason="TODO: Python engine reads bigint as string")
211+
request.applymarker(mark)
212+
204213
result = parser.read_csv(StringIO(str(val)), header=None)
205214

206-
expected = DataFrame([str(val)])
215+
expected = DataFrame([val])
216+
tm.assert_frame_equal(result, expected)
217+
218+
219+
@skip_pyarrow # CSV parse error: Empty CSV file or block
220+
@pytest.mark.parametrize(
221+
"val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
222+
)
223+
def test_outside_int64_uint64_range_follow_str(all_parsers, val):
224+
parser = all_parsers
225+
226+
result = parser.read_csv(StringIO(f"{val}\nabc"), header=None)
227+
228+
expected = DataFrame([str(val), "abc"])
207229
tm.assert_frame_equal(result, expected)
208230

209231

0 commit comments

Comments
 (0)