diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 293f1cb6f5e79..c3377db457270 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1080,6 +1080,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as python integers. (:issue:`51295`) +- Bug in :meth:`read_csv` with ``engine="c"`` reading large floats as strings when they were preceded by integers in the same column. Now reads them as floats. (:issue:`51295`) - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. 
(:issue:`60210`)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index ca87fce555f75..442891949dfd2 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1070,6 +1070,10 @@ cdef class TextReader:
         else:
             col_res = None
             for dt in self.dtype_cast_order:
+                if (dt.kind in "iu" and
+                        self._column_has_float(i, start, end, na_filter, na_hashset)):
+                    continue
+
                 try:
                     col_res, na_count = self._convert_with_dtype(
                         dt, i, start, end, na_filter, 0, na_hashset, na_fset)
@@ -1347,6 +1351,55 @@ cdef class TextReader:
         else:
             return None
 
+
+    cdef bint _column_has_float(self, Py_ssize_t col,
+                                int64_t start, int64_t end,
+                                bint na_filter, kh_str_starts_t *na_hashset):
+        """
+        Check whether a column holds a float-looking word.
+
+        Scans ``end - start`` words of column ``col``, beginning at the
+        position set up for row ``start``; when ``na_filter`` is set, words
+        found in ``na_hashset`` are skipped. Each remaining word is read
+        character by character: reaching the parser's decimal character, or
+        an ``e``/``E`` after the first digit, returns True; reaching any other
+        non-digit (apart from blanks and signs before the first digit)
+        returns False. If no word triggers either outcome, the result is
+        False.
+        """
+        cdef:
+            Py_ssize_t i, j, lines = end - start
+            coliter_t it
+            const char *word = NULL
+            char ch
+            bint found_first_digit
+
+        coliter_setup(&it, self.parser, col, start)
+
+        for i in range(lines):
+            COLITER_NEXT(it, word)
+
+            if na_filter and kh_get_str_starts_item(na_hashset, word):
+                continue
+
+            found_first_digit = False
+            j = 0
+            while word[j] != 0:
+                ch = word[j]
+                if ch == self.parser.decimal:
+                    return True
+                elif ch in b"0123456789":
+                    found_first_digit = True
+                elif not found_first_digit:
+                    if ch not in b" +-":
+                        # non-digit before any digit: the word is not a number
+                        return False
+                else:
+                    # after the first digit only an exponent marker
+                    # indicates a float; anything else is not numeric
+                    return ch in b"eE"
+                j += 1
+
+        return False
 
 # Factor out code common to TextReader.__dealloc__ and TextReader.close
 #
It cannot be a class method, since calling self.close() in __dealloc__
diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py
index 598f397da686d..072294d34fb75 100644
--- a/pandas/tests/io/parser/common/test_float.py
+++ b/pandas/tests/io/parser/common/test_float.py
@@ -77,3 +77,31 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
         expected = DataFrame({"data": [f"10E{exp}"]})
 
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "value, expected_value",
+    [
+        ("32.0", 32.0),
+        ("32e0", 32.0),
+        ("3.2e1", 32.0),
+        ("3.2e80", 3.2e80),
+        ("3.2e-80", 3.2e-80),
+        ("18446744073709551616.0", float(1 << 64)),  # exactly 2**64
+        ("18446744073709551616.5", float(1 << 64)),  # loses precision
+        ("36893488147419103232.3", float(1 << 65)),  # loses precision
+    ],
+)
+def test_small_int_followed_by_float(all_parsers_all_precisions, value, expected_value):
+    # GH#51295: a float following a small integer; the expected column holds
+    # floats for both rows.
+    parser, precision = all_parsers_all_precisions
+    # NOTE(review): the literal's continuation lines carry leading spaces, so both
+    # values reach the parser with leading blanks - presumably intentional; confirm.
+    data = f"""data
+    42
+    {value}"""
+    result = parser.read_csv(StringIO(data), float_precision=precision)
+    expected = DataFrame({"data": [42.0, expected_value]})
+
+    tm.assert_frame_equal(result, expected)