From 1b6e3a2f8ecc69432e279bc5a03347b2d7f30d39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 30 Sep 2025 13:08:43 -0300 Subject: [PATCH 01/19] fix: make `read_csv` be able to read large numbers into float --- pandas/_libs/parsers.pyx | 7 +++--- pandas/tests/io/parser/common/test_float.py | 25 +++++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 91eddc3261164..560f993ff0484 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1081,10 +1081,9 @@ cdef class TextReader: np.dtype("object"), i, start, end, 0, 0, na_hashset, na_fset) except OverflowError: - col_res, na_count = self._convert_with_dtype( - np.dtype("object"), i, start, end, na_filter, - 0, na_hashset, na_fset) - + # Try other dtypes that can accommodate large numbers. + # (e.g. float and string) + pass if col_res is not None: break diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index 598f397da686d..be45d69b3d282 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -77,3 +77,28 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): expected = DataFrame({"data": [f"10E{exp}"]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "value, expected_value", + [ + ("18446744073709551616.5", (1 << 64) + 0.5), + ("18446744073709551616", float(1 << 64)), + ], + ids=["2pow64_plus_half", "2pow64"], +) +def test_small_int_big_number( + all_parsers_all_precisions, value, expected_value, request +): + # GH#51295 + parser, precision = all_parsers_all_precisions + if parser.engine == "python" and value == "18446744073709551616": + mark = pytest.mark.xfail(reason="Still need to work on Python parser") + request.applymarker(mark) + data = f"""data + 42 + {value}""" + result = parser.read_csv(StringIO(data), float_precision=precision) + expected = DataFrame({"data": [42.0, expected_value]}) + + tm.assert_frame_equal(result, expected) From 8908d8413896564ec9352fae082fab40be07b37b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 1 Oct 2025 23:31:09 -0300 Subject: [PATCH 02/19] docs(whatsnew): add entry in whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 91ce855f03b08..554aadd94d97b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1076,6 +1076,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) +- Bug in :meth:`read_csv` with ``engine="c"`` reading large numbers as a string. Now reads them as floats. (:issue:`51295`) - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) From 26866553fc64b3190869284255f7a4519d8073df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 2 Oct 2025 10:19:00 -0300 Subject: [PATCH 03/19] test: test explicit float --- pandas/tests/io/parser/common/test_float.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index be45d69b3d282..7ede1685fd1cc 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -82,19 +82,17 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): @pytest.mark.parametrize( "value, expected_value", [ + ("18446744073709551616.0", float(1 << 64)), ("18446744073709551616.5", (1 << 64) + 0.5), - ("18446744073709551616", float(1 << 64)), + ("36893488147419103232.3", (1 << 65) + 0.3), ], - ids=["2pow64_plus_half", "2pow64"], + ids=["2pow64_float", "2pow64_plus_half", "2pow65_plus_third"], ) -def test_small_int_big_number( +def test_small_int_followed_by_float( all_parsers_all_precisions, value, expected_value, request ): # GH#51295 parser, precision = all_parsers_all_precisions - if parser.engine == "python" and value == "18446744073709551616": - mark = pytest.mark.xfail(reason="Still need to work on Python parser") - request.applymarker(mark) data = f"""data 42 {value}""" From 4aa0ff1de6591cea20724b60855e657816d846b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 2 Oct 2025 11:47:25 -0300 Subject: [PATCH 04/19] hackish solution --- pandas/_libs/parsers.pyx | 43 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 560f993ff0484..db46b147bfc6d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1069,6 +1069,10 @@ cdef class TextReader: else: col_res = None for dt in self.dtype_cast_order: + if (dt.kind in "iu" and + self._column_has_float(i, start, end, na_filter, na_hashset)): + continue + try: col_res, na_count = self._convert_with_dtype( dt, i, start, end, na_filter, 0, na_hashset, na_fset) @@ -1081,9 +1085,9 @@ cdef class TextReader: np.dtype("object"), i, start, end, 0, 0, na_hashset, na_fset) except OverflowError: - # Try other dtypes that can accommodate large numbers. - # (e.g. float and string) - pass + col_res, na_count = self._convert_with_dtype( + np.dtype("object"), i, start, end, na_filter, + 0, na_hashset, na_fset) if col_res is not None: break @@ -1341,6 +1345,39 @@ cdef class TextReader: else: return None + cdef bint _column_has_float(self, int64_t col, + int64_t start, int64_t end, + bint na_filter, kh_str_starts_t *na_hashset): + """Check if the column contains any float number.""" + cdef: + Py_ssize_t i, lines = end - start + coliter_t it + const char *word = NULL + const char *ch + bint found_float = False + + coliter_setup(&it, self.parser, col, start) + + for i in range(lines): + COLITER_NEXT(it, word) + + if na_filter and kh_get_str_starts_item(na_hashset, word): + continue + + ch = word + while ch[0] != b"\0": + token_indicates_float = (ch[0] == self.parser.decimal + or ch[0] == b"e" + or ch[0] == b"E") + if token_indicates_float: + found_float = True + break + ch += 1 + + if found_float: + break + + return found_float # Factor out code common to TextReader.__dealloc__ and TextReader.close # It cannot be a class method, since calling self.close() in __dealloc__ From 706503f8c8c0a24ce7ddb35c6c433fe3d51d6841 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 2 Oct 2025 11:47:44 -0300 Subject: [PATCH 05/19] test: add small float test --- pandas/tests/io/parser/common/test_float.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index 7ede1685fd1cc..4d665c8e07c5e 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -82,11 +82,12 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): @pytest.mark.parametrize( "value, expected_value", [ + ("32.0", 32.0), ("18446744073709551616.0", float(1 << 64)), ("18446744073709551616.5", (1 << 64) + 0.5), ("36893488147419103232.3", (1 << 65) + 0.3), ], - ids=["2pow64_float", "2pow64_plus_half", "2pow65_plus_third"], + ids=["small_float", "2pow64_float", "2pow64_plus_half", "2pow65_plus_third"], ) def test_small_int_followed_by_float( all_parsers_all_precisions, value, expected_value, request From 544a25764348d68a48d3a20843dfc12396beca9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 2 Oct 2025 12:01:26 -0300 Subject: [PATCH 06/19] chore: return newline --- pandas/_libs/parsers.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index db46b147bfc6d..3180ba75024fe 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1088,6 +1088,7 @@ cdef class TextReader: col_res, na_count = self._convert_with_dtype( np.dtype("object"), i, start, end, na_filter, 0, na_hashset, na_fset) + if col_res is not None: break From 62b49f9a2525df9d215fc5f6746e76e47ccd111c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 2 Oct 2025 12:04:40 -0300 Subject: [PATCH 07/19] docs: update whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 554aadd94d97b..86f39b984958c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1076,7 +1076,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) -- Bug in :meth:`read_csv` with ``engine="c"`` reading large numbers as a string. Now reads them as floats. (:issue:`51295`) +- Bug in :meth:`read_csv` with ``engine="c"`` reading large float numbers with preceding integers as strings. Now reads them as floats. (:issue:`51295`) - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) From 6c03279142fd359c93dbd2f311819f85c8668f5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 2 Oct 2025 12:18:53 -0300 Subject: [PATCH 08/19] early return --- pandas/_libs/parsers.pyx | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 3180ba75024fe..518b3b952b4ed 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1355,7 +1355,6 @@ cdef class TextReader: coliter_t it const char *word = NULL const char *ch - bint found_float = False coliter_setup(&it, self.parser, col, start) @@ -1371,14 +1370,10 @@ cdef class TextReader: or ch[0] == b"e" or ch[0] == b"E") if token_indicates_float: - found_float = True - break + return True ch += 1 - if found_float: - break - - return found_float + return False # Factor out code common to TextReader.__dealloc__ and TextReader.close # It cannot be a class method, since calling self.close() in __dealloc__ From 6aa8398b8d718d31cd2aa9b77e497977789367bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 2 Oct 2025 13:09:05 -0300 Subject: [PATCH 09/19] test: add exponent tests --- pandas/tests/io/parser/common/test_float.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index 4d665c8e07c5e..a7d830afa4dd8 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -83,11 +83,22 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): "value, expected_value", [ ("32.0", 32.0), + ("3.2e1", 32.0), + ("3.2e80", 3.2e80), + ("3.2e-80", 3.2e-80), ("18446744073709551616.0", float(1 << 64)), ("18446744073709551616.5", (1 << 64) + 0.5), ("36893488147419103232.3", (1 << 65) + 0.3), ], - ids=["small_float", "2pow64_float", "2pow64_plus_half", "2pow65_plus_third"], + ids=[ + "small_float", + "small_float_exponent", + "big_exponent", + "small_exponent", + "2pow64_float", + "2pow64_plus_half", + "2pow65_plus_third", + ], ) def test_small_int_followed_by_float( all_parsers_all_precisions, value, expected_value, request From 44b68bc3764d79c6ce99237044bf0fa56d07e063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 2 Oct 2025 16:43:32 -0300 Subject: [PATCH 10/19] refactor: simplify code --- pandas/_libs/parsers.pyx | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 518b3b952b4ed..43e1a16ee19c9 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1354,7 +1354,6 @@ cdef class TextReader: Py_ssize_t i, lines = end - start coliter_t it const char *word = NULL - const char *ch coliter_setup(&it, self.parser, col, start) @@ -1364,14 +1363,8 @@ cdef class TextReader: if na_filter and kh_get_str_starts_item(na_hashset, word): continue - ch = word - while ch[0] != b"\0": - token_indicates_float = (ch[0] == self.parser.decimal - or ch[0] == b"e" - or ch[0] == b"E") - if token_indicates_float: - return True - ch += 1 + if self.parser.decimal in word or b"e" in word or b"E" in word: + return True return False From e16c71b68155d0bd84e7b03143d4b26b2a6c8ac4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 2 Oct 2025 16:56:13 -0300 Subject: [PATCH 11/19] test: remove ids and add new test with no decimal separator --- pandas/tests/io/parser/common/test_float.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index a7d830afa4dd8..4897624d058c8 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -83,6 +83,7 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): "value, expected_value", [ ("32.0", 32.0), + ("32e0", 32.0), ("3.2e1", 32.0), ("3.2e80", 3.2e80), ("3.2e-80", 3.2e-80), @@ -90,15 +91,6 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): ("18446744073709551616.5", (1 << 64) + 0.5), ("36893488147419103232.3", (1 << 65) + 0.3), ], - ids=[ - "small_float", - "small_float_exponent", - "big_exponent", - "small_exponent", - "2pow64_float", - "2pow64_plus_half", - "2pow65_plus_third", - ], ) def test_small_int_followed_by_float( all_parsers_all_precisions, value, expected_value, request From fc010972e35168cccb12566127c6bd23009a61b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 2 Oct 2025 19:10:33 -0300 Subject: [PATCH 12/19] chore: add comments making explicit about precision loss --- pandas/tests/io/parser/common/test_float.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index 4897624d058c8..072294d34fb75 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -87,9 +87,9 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): ("3.2e1", 32.0), ("3.2e80", 3.2e80), ("3.2e-80", 3.2e-80), - ("18446744073709551616.0", float(1 << 64)), - ("18446744073709551616.5", (1 << 64) + 0.5), - ("36893488147419103232.3", (1 << 65) + 0.3), + ("18446744073709551616.0", float(1 << 64)), # loses precision + ("18446744073709551616.5", float(1 << 64)), # loses precision + ("36893488147419103232.3", float(1 << 65)), # loses precision ], ) def test_small_int_followed_by_float( From 380e6ec2b079b831f1d766c79d572a7200e7a5fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Fri, 3 Oct 2025 14:58:08 -0300 Subject: [PATCH 13/19] Update pandas/_libs/parsers.pyx Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_libs/parsers.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 43e1a16ee19c9..f01eda093ff3b 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1346,7 +1346,7 @@ cdef class TextReader: else: return None - cdef bint _column_has_float(self, int64_t col, + cdef bint _column_has_float(self, Py_ssize_t col, int64_t start, int64_t end, bint na_filter, kh_str_starts_t *na_hashset): """Check if the column contains any float number.""" From 6b4cedb72131bb5fbb50979d53d99366fcd04cce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Fri, 3 Oct 2025 16:33:08 -0300 Subject: [PATCH 14/19] fix: early return on non numeric chars --- pandas/_libs/parsers.pyx | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index f01eda093ff3b..326a83f0d4671 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1357,14 +1357,33 @@ cdef class TextReader: coliter_setup(&it, self.parser, col, start) + ignored_chars = b" +-" + digits = b"0123456789" + float_indicating_chars = {self.parser.decimal, b"e", b"E"} + for i in range(lines): COLITER_NEXT(it, word) if na_filter and kh_get_str_starts_item(na_hashset, word): continue - if self.parser.decimal in word or b"e" in word or b"E" in word: - return True + found_first_digit = False + for c in word: + if not found_first_digit and c in ignored_chars: + continue + elif not found_first_digit and c not in digits: + # word isn't numeric + return False + elif not found_first_digit: + found_first_digit = True + elif c in float_indicating_chars: + # preceding chars indicates numeric and + # current char indicates float + return True + elif c not in digits: + # previous characters indicates numeric + # current character shows otherwise + return False return False From b3519c16460c2507e40511a30be649b446a00306 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 4 Oct 2025 12:27:30 -0300 Subject: [PATCH 15/19] perf: improve performance marginally Verification performance overhead reduced from 2.83 to 2.47 --- pandas/_libs/parsers.pyx | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 326a83f0d4671..d94244fdaf1e8 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1351,7 +1351,7 @@ cdef class TextReader: bint na_filter, kh_str_starts_t *na_hashset): """Check if the column contains any float number.""" cdef: - Py_ssize_t i, lines = end - start + Py_ssize_t i, j, lines = end - start coliter_t it const char *word = NULL @@ -1368,22 +1368,32 @@ cdef class TextReader: continue found_first_digit = False - for c in word: - if not found_first_digit and c in ignored_chars: - continue - elif not found_first_digit and c not in digits: + j = 0 + while word[j] != b"\0": + if not found_first_digit and word[j] in ignored_chars: + # no-op + pass + elif not found_first_digit and word[j] not in digits: # word isn't numeric return False - elif not found_first_digit: + elif not found_first_digit and word[j] in digits: found_first_digit = True - elif c in float_indicating_chars: + elif word[j] in float_indicating_chars: # preceding chars indicates numeric and # current char indicates float return True - elif c not in digits: + elif word[j] not in digits: # previous characters indicates numeric # current character shows otherwise return False + elif word[j] in digits: + # no-op + pass + else: + raise AssertionError( + f"Unhandled case {word[j]=} {found_first_digit=}" + ) + j += 1 return False From 2e0af7a742111bb4f9e659d27a2c1bc333e96750 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 4 Oct 2025 12:54:47 -0300 Subject: [PATCH 16/19] perf: early return on decimal char --- pandas/_libs/parsers.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index d94244fdaf1e8..7e231bb3e5c5e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1359,7 +1359,7 @@ cdef class TextReader: ignored_chars = b" +-" digits = b"0123456789" - float_indicating_chars = {self.parser.decimal, b"e", b"E"} + float_indicating_chars = b"eE" for i in range(lines): COLITER_NEXT(it, word) @@ -1370,7 +1370,9 @@ cdef class TextReader: found_first_digit = False j = 0 while word[j] != b"\0": - if not found_first_digit and word[j] in ignored_chars: + if word[j] == self.parser.decimal: + return True + elif not found_first_digit and word[j] in ignored_chars: # no-op pass elif not found_first_digit and word[j] not in digits: From c7ddf17ea7dcfc60a34fd183eb7ab92b9259b08e Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Mon, 6 Oct 2025 15:22:28 -0300 Subject: [PATCH 17/19] fix: move variables declarations to cdef --- pandas/_libs/parsers.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b278615ad4b80..2be42df3a538c 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1359,13 +1359,13 @@ cdef class TextReader: Py_ssize_t i, j, lines = end - start coliter_t it const char *word = NULL + const char *ignored_chars = " +-" + const char *digits = "0123456789" + const char *float_indicating_chars = "eE" + char null_byte = '\0' coliter_setup(&it, self.parser, col, start) - ignored_chars = b" +-" - digits = b"0123456789" - float_indicating_chars = b"eE" - for i in range(lines): COLITER_NEXT(it, word) @@ -1374,7 +1374,7 @@ cdef class TextReader: found_first_digit = False j = 0 - while word[j] != b"\0": + while word[j] != null_byte: if word[j] == self.parser.decimal: return True elif not found_first_digit and word[j] in ignored_chars: From a7ab94170ca5056c80f5c945992a59e2a7ce0f13 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Mon, 6 Oct 2025 15:30:50 -0300 Subject: [PATCH 18/19] chore: use double quote in cython --- pandas/_libs/parsers.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 2be42df3a538c..d0b3263b201a9 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1362,7 +1362,7 @@ cdef class TextReader: const char *ignored_chars = " +-" const char *digits = "0123456789" const char *float_indicating_chars = "eE" - char null_byte = '\0' + char null_byte = "\0" coliter_setup(&it, self.parser, col, start) From 8b5740136761984deb0921113de54d66b8ce9f96 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Mon, 6 Oct 2025 15:41:33 -0300 Subject: [PATCH 19/19] chore: workaround double quote lint --- pandas/_libs/parsers.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index d0b3263b201a9..442891949dfd2 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1362,7 +1362,7 @@ cdef class TextReader: const char *ignored_chars = " +-" const char *digits = "0123456789" const char *float_indicating_chars = "eE" - char null_byte = "\0" + char null_byte = 0 coliter_setup(&it, self.parser, col, start)