BUG: make read_csv read large integers (>64bits) as python integers with C engine (#62582)

Alvaro-Kothe · web-flow · commit 21130b13c5dd · 2025-10-05T12:35:42.000-07:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -1079,6 +1079,7 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
+- Bug in :meth:`read_csv` with ``engine="c"`` reading integers that do not fit in 64 bits as strings; they are now read as Python integers. (:issue:`51295`)
 - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -29,6 +29,7 @@ from cpython.exc cimport (
     PyErr_Fetch,
     PyErr_Occurred,
 )
+from cpython.long cimport PyLong_FromString
 from cpython.object cimport PyObject
 from cpython.ref cimport (
     Py_INCREF,
@@ -1081,9 +1082,13 @@ cdef class TextReader:
                         np.dtype("object"), i, start, end, 0,
                         0, na_hashset, na_fset)
                 except OverflowError:
-                    col_res, na_count = self._convert_with_dtype(
-                        np.dtype("object"), i, start, end, na_filter,
-                        0, na_hashset, na_fset)
+                    try:
+                        col_res, na_count = _try_pylong(self.parser, i, start,
+                                                        end, na_filter, na_hashset)
+                    except ValueError:
+                        col_res, na_count = self._convert_with_dtype(
+                            np.dtype("object"), i, start, end, 0,
+                            0, na_hashset, na_fset)
 
                 if col_res is not None:
                     break
@@ -1873,6 +1878,54 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,
 
     return 0
 
+# -> tuple[ndarray[object], int]
+cdef _try_pylong(parser_t *parser, Py_ssize_t col,
+                 int64_t line_start, int64_t line_end,
+                 bint na_filter, kh_str_starts_t *na_hashset):
+    """
+    Convert rows ``line_start`` (inclusive) to ``line_end`` (exclusive) of
+    column ``col`` to arbitrary-precision Python integers.
+
+    When ``na_filter`` is set, words found in ``na_hashset`` are replaced by
+    ``na_values[np.object_]`` and counted instead of being parsed.
+
+    Returns
+    -------
+    tuple[ndarray[object], int]
+        The converted values and the number of NA entries.
+
+    Raises
+    ------
+    ValueError
+        If a word cannot be parsed as a base-10 integer.
+    """
+    cdef:
+        int na_count = 0
+        Py_ssize_t i, lines
+        coliter_t it
+        const char *word = NULL
+        ndarray[object] result
+        object NA = na_values[np.object_]
+
+    lines = line_end - line_start
+    result = np.empty(lines, dtype=object)
+    coliter_setup(&it, parser, col, line_start)
+
+    for i in range(lines):
+        COLITER_NEXT(it, word)
+        if na_filter and kh_get_str_starts_item(na_hashset, word):
+            # in the hash table
+            na_count += 1
+            result[i] = NA
+            continue
+
+        # PyLong_FromString is cimported with an ``object`` return type, so a
+        # failed parse surfaces as a raised ValueError rather than a None
+        # result; no explicit check is needed here.
+        result[i] = PyLong_FromString(word, NULL, 10)
+
+    return result, na_count
+
 
 # -> tuple[ndarray[bool], int]
 cdef _try_bool_flex(parser_t *parser, int64_t col,
diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py
@@ -144,17 +144,22 @@ def test_int64_overflow(all_parsers, conv, request):
         if parser.engine == "pyarrow":
             mark = pytest.mark.xfail(reason="parses to float64")
             request.applymarker(mark)
+        elif parser.engine == "python":
+            mark = pytest.mark.xfail(
+                reason="TODO: Python engine reads bigint as string"
+            )
+            request.applymarker(mark)
 
         result = parser.read_csv(StringIO(data))
         expected = DataFrame(
             [
-                "00013007854817840016671868",
-                "00013007854817840016749251",
-                "00013007854817840016754630",
-                "00013007854817840016781876",
-                "00013007854817840017028824",
-                "00013007854817840017963235",
-                "00013007854817840018860166",
+                13007854817840016671868,
+                13007854817840016749251,
+                13007854817840016754630,
+                13007854817840016781876,
+                13007854817840017028824,
+                13007854817840017963235,
+                13007854817840018860166,
             ],
             columns=["ID"],
         )
@@ -185,7 +190,7 @@ def test_int64_overflow(all_parsers, conv, request):
 )
 def test_int64_uint64_range(all_parsers, val):
     # These numbers fall right inside the int64-uint64
-    # range, so they should be parsed as string.
+    # range, so they should be parsed as integers.
     parser = all_parsers
     result = parser.read_csv(StringIO(str(val)), header=None)
 
@@ -197,13 +202,30 @@ def test_int64_uint64_range(all_parsers, val):
 @pytest.mark.parametrize(
     "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
 )
-def test_outside_int64_uint64_range(all_parsers, val):
+def test_outside_int64_uint64_range(all_parsers, val, request):
     # These numbers fall just outside the int64-uint64
-    # range, so they should be parsed as string.
+    # range, so they should be parsed as Python integers (object dtype).
     parser = all_parsers
+    if parser.engine == "python":
+        mark = pytest.mark.xfail(reason="TODO: Python engine reads bigint as string")
+        request.applymarker(mark)
+
     result = parser.read_csv(StringIO(str(val)), header=None)
 
-    expected = DataFrame([str(val)])
+    expected = DataFrame([val])
+    tm.assert_frame_equal(result, expected)
+
+
+@skip_pyarrow  # CSV parse error: Empty CSV file or block
+@pytest.mark.parametrize(
+    "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
+)
+def test_outside_int64_uint64_range_follow_str(all_parsers, val):
+    parser = all_parsers
+
+    result = parser.read_csv(StringIO(f"{val}\nabc"), header=None)
+
+    expected = DataFrame([str(val), "abc"])
     tm.assert_frame_equal(result, expected)