BUG: make read_csv be able to read large floating numbers into float (#62542)

Alvaro-Kothe · mroeschke · web-flow · commit db31f6a38353 · 2025-10-06T15:13:38.000-07:00
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -1080,6 +1080,7 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
 - Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as python integers. (:issue:`51295`)
+- Bug in :meth:`read_csv` with ``engine="c"`` reading large floating-point numbers as strings when they are preceded by integers in the same column. Now reads them as floats. (:issue:`51295`)
 - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1070,6 +1070,10 @@ cdef class TextReader:
         else:
             col_res = None
             for dt in self.dtype_cast_order:
+                if (dt.kind in "iu" and
+                        self._column_has_float(i, start, end, na_filter, na_hashset)):
+                    continue
+
                 try:
                     col_res, na_count = self._convert_with_dtype(
                         dt, i, start, end, na_filter, 0, na_hashset, na_fset)
@@ -1347,6 +1351,58 @@ cdef class TextReader:
             else:
                 return None
 
+    cdef bint _column_has_float(self, Py_ssize_t col,
+                                int64_t start, int64_t end,
+                                bint na_filter, kh_str_starts_t *na_hashset):
+        """Check whether any of the ``end - start`` words read from column ``col`` looks like a float."""
+        cdef:
+            Py_ssize_t i, j, lines = end - start  # lines: number of words to scan
+            coliter_t it
+            const char *word = NULL
+            const char *ignored_chars = " +-"  # allowed before the first digit
+            const char *digits = "0123456789"
+            const char *float_indicating_chars = "eE"  # exponent markers
+            char null_byte = 0
+
+        coliter_setup(&it, self.parser, col, start)
+
+        for i in range(lines):
+            COLITER_NEXT(it, word)
+
+            if na_filter and kh_get_str_starts_item(na_hashset, word):
+                continue  # word matched na_hashset: it is not inspected
+
+            found_first_digit = False  # reset for every word
+            j = 0
+            while word[j] != null_byte:
+                if word[j] == self.parser.decimal:  # checked first, even before any digit
+                    return True
+                elif not found_first_digit and word[j] in ignored_chars:
+                    # leading space or sign: keep looking for the first digit
+                    pass
+                elif not found_first_digit and word[j] not in digits:
+                    # word isn't numeric; this ends the scan of the whole column
+                    return False
+                elif not found_first_digit and word[j] in digits:
+                    found_first_digit = True
+                elif word[j] in float_indicating_chars:
+                    # digits seen so far, so the word is numeric, and
+                    # an exponent marker makes it a float
+                    return True
+                elif word[j] not in digits:
+                    # digits followed by any other char (a trailing space included) ends
+                    # the scan of the whole column -- NOTE(review): confirm this is intended
+                    return False
+                elif word[j] in digits:
+                    # another digit: keep scanning this word
+                    pass
+                else:  # unreachable: the two branches above are exhaustive
+                    raise AssertionError(
+                            f"Unhandled case {word[j]=} {found_first_digit=}"
+                            )
+                j += 1
+
+        return False
 
 # Factor out code common to TextReader.__dealloc__ and TextReader.close
 # It cannot be a class method, since calling self.close() in __dealloc__
diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py
@@ -77,3 +77,30 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
         expected = DataFrame({"data": [f"10E{exp}"]})
 
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "value, expected_value",
+    [
+        ("32.0", 32.0),
+        ("32e0", 32.0),
+        ("3.2e1", 32.0),
+        ("3.2e80", 3.2e80),
+        ("3.2e-80", 3.2e-80),
+        ("18446744073709551616.0", float(1 << 64)),  # 2**64, exactly representable
+        ("18446744073709551616.5", float(1 << 64)),  # loses precision: .5 is dropped
+        ("36893488147419103232.3", float(1 << 65)),  # loses precision: .3 is dropped
+    ],
+)
+def test_small_int_followed_by_float(
+    all_parsers_all_precisions, value, expected_value, request
+):
+    # GH#51295: an integer row followed by a float row must be read as floats
+    parser, precision = all_parsers_all_precisions
+    data = f"""data
+    42
+    {value}"""
+    result = parser.read_csv(StringIO(data), float_precision=precision)
+    expected = DataFrame({"data": [42.0, expected_value]})  # 42 is expected as 42.0
+
+    tm.assert_frame_equal(result, expected)