From be21b2e58a6f897ea8989894fda8abc24edbbeeb Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Tue, 7 Oct 2025 23:28:30 -0300 Subject: [PATCH 1/9] perf: verify for float numbers during tokenization --- .../_libs/include/pandas/parser/pd_parser.h | 18 +-- .../_libs/include/pandas/parser/tokenizer.h | 8 +- pandas/_libs/parsers.pyx | 105 ++++++------------ pandas/_libs/src/parser/tokenizer.c | 57 ++++++++-- 4 files changed, 93 insertions(+), 95 deletions(-) diff --git a/pandas/_libs/include/pandas/parser/pd_parser.h b/pandas/_libs/include/pandas/parser/pd_parser.h index 543839b5d75bf..b64664614fbbc 100644 --- a/pandas/_libs/include/pandas/parser/pd_parser.h +++ b/pandas/_libs/include/pandas/parser/pd_parser.h @@ -37,8 +37,8 @@ typedef struct { int (*parser_trim_buffers)(parser_t *); int (*tokenize_all_rows)(parser_t *, const char *); int (*tokenize_nrows)(parser_t *, size_t, const char *); - int64_t (*str_to_int64)(const char *, int64_t, int64_t, int *, char); - uint64_t (*str_to_uint64)(uint_state *, const char *, int64_t, uint64_t, + int64_t (*str_to_int64)(const char *, char, int64_t, int64_t, int *, char); + uint64_t (*str_to_uint64)(uint_state *, const char *, char, int64_t, uint64_t, int *, char); double (*xstrtod)(const char *, char **, char, char, char, int, int *, int *); double (*precise_xstrtod)(const char *, char **, char, char, char, int, int *, @@ -87,12 +87,14 @@ static PandasParser_CAPI *PandasParserAPI = NULL; PandasParserAPI->tokenize_all_rows((self), (encoding_errors)) #define tokenize_nrows(self, nrows, encoding_errors) \ PandasParserAPI->tokenize_nrows((self), (nrows), (encoding_errors)) -#define str_to_int64(p_item, int_min, int_max, error, t_sep) \ - PandasParserAPI->str_to_int64((p_item), (int_min), (int_max), (error), \ - (t_sep)) -#define str_to_uint64(state, p_item, int_max, uint_max, error, t_sep) \ - PandasParserAPI->str_to_uint64((state), (p_item), (int_max), (uint_max), \ - (error), (t_sep)) +#define str_to_int64(p_item, 
decimal_separator, int_min, int_max, error, \ + t_sep) \ + PandasParserAPI->str_to_int64((p_item), (decimal_separator), (int_min), \ + (int_max), (error), (t_sep)) +#define str_to_uint64(state, p_item, decimal_separator, int_max, uint_max, \ + error, t_sep) \ + PandasParserAPI->str_to_uint64((state), (p_item), (decimal_separator), \ + (int_max), (uint_max), (error), (t_sep)) #define xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int) \ PandasParserAPI->xstrtod((p), (q), (decimal), (sci), (tsep), \ (skip_trailing), (error), (maybe_int)) diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h index 209f375a5bf6c..b6200c0032dba 100644 --- a/pandas/_libs/include/pandas/parser/tokenizer.h +++ b/pandas/_libs/include/pandas/parser/tokenizer.h @@ -17,6 +17,7 @@ See LICENSE for the license #define ERROR_NO_DIGITS 1 #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 +#define ERROR_IS_FLOAT 4 #include @@ -208,10 +209,11 @@ void uint_state_init(uint_state *self); int uint64_conflict(uint_state *self); -uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, +uint64_t str_to_uint64(uint_state *state, const char *p_item, + char decimal_separator, int64_t int_max, uint64_t uint_max, int *error, char tsep); -int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, - int *error, char tsep); +int64_t str_to_int64(const char *p_item, char decimal_separator, + int64_t int_min, int64_t int_max, int *error, char tsep); double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); double precise_xstrtod(const char *p, char **q, char decimal, char sci, diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 442891949dfd2..bb46a7ff3f1e8 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h": SKIP_LINE FINISHED - enum: 
ERROR_OVERFLOW + enum: ERROR_OVERFLOW, ERROR_IS_FLOAT ctypedef enum BadLineHandleMethod: ERROR, @@ -281,10 +281,11 @@ cdef extern from "pandas/parser/pd_parser.h": int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil - int64_t str_to_int64(char *p_item, int64_t int_min, + int64_t str_to_int64(char *p_item, char decimal_separator, int64_t int_min, int64_t int_max, int *error, char tsep) nogil - uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max, - uint64_t uint_max, int *error, char tsep) nogil + uint64_t str_to_uint64(uint_state *state, char *p_item, char decimal_separator, + int64_t int_max, uint64_t uint_max, + int *error, char tsep) nogil double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, @@ -1070,21 +1071,28 @@ cdef class TextReader: else: col_res = None for dt in self.dtype_cast_order: - if (dt.kind in "iu" and - self._column_has_float(i, start, end, na_filter, na_hashset)): - continue - try: col_res, na_count = self._convert_with_dtype( dt, i, start, end, na_filter, 0, na_hashset, na_fset) - except ValueError: - # This error is raised from trying to convert to uint64, - # and we discover that we cannot convert to any numerical - # dtype successfully. As a result, we leave the data - # column AS IS with object dtype. - col_res, na_count = self._convert_with_dtype( - np.dtype("object"), i, start, end, 0, - 0, na_hashset, na_fset) + except ValueError as e: + if str(e) == "Number is float": + try: + col_res, na_count = self._convert_with_dtype( + np.dtype("float64"), i, start, end, 0, + 0, na_hashset, na_fset) + except ValueError: + col_res, na_count = self._convert_with_dtype( + np.dtype("object"), i, start, end, 0, + 0, na_hashset, na_fset) + + else: + # This error is raised from trying to convert to uint64, + # and we discover that we cannot convert to any numerical + # dtype successfully. 
As a result, we leave the data + # column AS IS with object dtype. + col_res, na_count = self._convert_with_dtype( + np.dtype("object"), i, start, end, 0, + 0, na_hashset, na_fset) except OverflowError: try: col_res, na_count = _try_pylong(self.parser, i, start, @@ -1351,59 +1359,6 @@ cdef class TextReader: else: return None - cdef bint _column_has_float(self, Py_ssize_t col, - int64_t start, int64_t end, - bint na_filter, kh_str_starts_t *na_hashset): - """Check if the column contains any float number.""" - cdef: - Py_ssize_t i, j, lines = end - start - coliter_t it - const char *word = NULL - const char *ignored_chars = " +-" - const char *digits = "0123456789" - const char *float_indicating_chars = "eE" - char null_byte = 0 - - coliter_setup(&it, self.parser, col, start) - - for i in range(lines): - COLITER_NEXT(it, word) - - if na_filter and kh_get_str_starts_item(na_hashset, word): - continue - - found_first_digit = False - j = 0 - while word[j] != null_byte: - if word[j] == self.parser.decimal: - return True - elif not found_first_digit and word[j] in ignored_chars: - # no-op - pass - elif not found_first_digit and word[j] not in digits: - # word isn't numeric - return False - elif not found_first_digit and word[j] in digits: - found_first_digit = True - elif word[j] in float_indicating_chars: - # preceding chars indicates numeric and - # current char indicates float - return True - elif word[j] not in digits: - # previous characters indicates numeric - # current character shows otherwise - return False - elif word[j] in digits: - # no-op - pass - else: - raise AssertionError( - f"Unhandled case {word[j]=} {found_first_digit=}" - ) - j += 1 - - return False - # Factor out code common to TextReader.__dealloc__ and TextReader.close # It cannot be a class method, since calling self.close() in __dealloc__ # which causes a class attribute lookup and violates best practices @@ -1822,6 +1777,8 @@ cdef _try_uint64(parser_t *parser, int64_t col, if error == 
ERROR_OVERFLOW: # Can't get the word variable raise OverflowError("Overflow") + elif error == ERROR_IS_FLOAT: + raise ValueError("Number is float") return None if uint64_conflict(&state): @@ -1855,14 +1812,14 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col, data[i] = 0 continue - data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX, + data[i] = str_to_uint64(state, word, parser.decimal, INT64_MAX, UINT64_MAX, &error, parser.thousands) if error != 0: return error else: for i in range(lines): COLITER_NEXT(it, word) - data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX, + data[i] = str_to_uint64(state, word, parser.decimal, INT64_MAX, UINT64_MAX, &error, parser.thousands) if error != 0: return error @@ -1892,6 +1849,8 @@ cdef _try_int64(parser_t *parser, int64_t col, if error == ERROR_OVERFLOW: # Can't get the word variable raise OverflowError("Overflow") + elif error == ERROR_IS_FLOAT: + raise ValueError("Number is float") return None, None return result, na_count @@ -1920,14 +1879,14 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col, data[i] = NA continue - data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, + data[i] = str_to_int64(word, parser.decimal, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: return error else: for i in range(lines): COLITER_NEXT(it, word) - data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, + data[i] = str_to_int64(word, parser.decimal, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: return error diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 61e96fc835e4d..de1ad4454f294 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1834,8 +1834,8 @@ int uint64_conflict(uint_state *self) { return self->seen_uint && (self->seen_sint || self->seen_null); } -int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, - int *error, char tsep) { +int64_t str_to_int64(const char *p_item, char 
decimal_separator, + int64_t int_min, int64_t int_max, int *error, char tsep) { const char *p = p_item; // Skip leading spaces. while (isspace_ascii(*p)) { @@ -1879,7 +1879,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, d = *++p; } else { *error = ERROR_OVERFLOW; - return 0; + break; } } } else { @@ -1890,7 +1890,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, d = *++p; } else { *error = ERROR_OVERFLOW; - return 0; + break; } } } @@ -1917,7 +1917,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } else { *error = ERROR_OVERFLOW; - return 0; + break; } } } else { @@ -1929,12 +1929,25 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } else { *error = ERROR_OVERFLOW; - return 0; + break; } } } } + if (*error == ERROR_OVERFLOW) { + // advance digits + while (*p != '\0' && isdigit_ascii(*p)) { + p++; + } + + // check if is float + if (*p == decimal_separator || *p == 'e' || *p == 'E') { + *error = ERROR_IS_FLOAT; + } + return 0; + } + // Skip trailing spaces. while (isspace_ascii(*p)) { ++p; @@ -1942,7 +1955,11 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, // Did we use up all the characters? if (*p) { - *error = ERROR_INVALID_CHARS; + if (*p == decimal_separator || *p == 'e' || *p == 'E') { + *error = ERROR_IS_FLOAT; + } else { + *error = ERROR_INVALID_CHARS; + } return 0; } @@ -1950,7 +1967,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, return number; } -uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, +uint64_t str_to_uint64(uint_state *state, const char *p_item, + char decimal_separator, int64_t int_max, uint64_t uint_max, int *error, char tsep) { const char *p = p_item; // Skip leading spaces. 
@@ -1997,7 +2015,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } else { *error = ERROR_OVERFLOW; - return 0; + break; } } } else { @@ -2009,11 +2027,24 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } else { *error = ERROR_OVERFLOW; - return 0; + break; } } } + if (*error == ERROR_OVERFLOW) { + // advance digits + while (*p != '\0' && isdigit_ascii(*p)) { + p++; + } + + // check if is float + if (*p == decimal_separator || *p == 'e' || *p == 'E') { + *error = ERROR_IS_FLOAT; + } + return 0; + } + // Skip trailing spaces. while (isspace_ascii(*p)) { ++p; @@ -2021,7 +2052,11 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, // Did we use up all the characters? if (*p) { - *error = ERROR_INVALID_CHARS; + if (*p == decimal_separator || *p == 'e' || *p == 'E') { + *error = ERROR_IS_FLOAT; + } else { + *error = ERROR_INVALID_CHARS; + } return 0; } From fc10a5f487d48a839c8a694437a3069e1739133a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 8 Oct 2025 11:55:00 -0300 Subject: [PATCH 2/9] fix: try other dtypes instead of skipping to float64 --- pandas/_libs/parsers.pyx | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index bb46a7ff3f1e8..de33f6b10109d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1070,21 +1070,18 @@ cdef class TextReader: return self._string_convert(i, start, end, na_filter, na_hashset) else: col_res = None + maybe_int = True for dt in self.dtype_cast_order: + if not maybe_int and dt.kind in "iu": + continue + try: col_res, na_count = self._convert_with_dtype( dt, i, start, end, na_filter, 0, na_hashset, na_fset) except ValueError as e: if str(e) == "Number is float": - try: - col_res, na_count = self._convert_with_dtype( - np.dtype("float64"), i, start, end, 0, - 0, na_hashset, na_fset) - except ValueError: - col_res, 
na_count = self._convert_with_dtype( - np.dtype("object"), i, start, end, 0, - 0, na_hashset, na_fset) - + maybe_int = False + continue else: # This error is raised from trying to convert to uint64, # and we discover that we cannot convert to any numerical From ab2fab8d489d96a8f2c9985a5f05070dc613ef16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 8 Oct 2025 12:36:12 -0300 Subject: [PATCH 3/9] fix: don't throw error when casting is expected --- pandas/_libs/parsers.pyx | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index de33f6b10109d..5f9ddd7dbb4a0 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1059,7 +1059,7 @@ cdef class TextReader: if col_dtype is not None: col_res, na_count = self._convert_with_dtype( col_dtype, i, start, end, na_filter, - 1, na_hashset, na_fset) + 1, na_hashset, na_fset, False) # Fallback on the parse (e.g. we requested int dtype, # but its actually a float). @@ -1077,7 +1077,7 @@ cdef class TextReader: try: col_res, na_count = self._convert_with_dtype( - dt, i, start, end, na_filter, 0, na_hashset, na_fset) + dt, i, start, end, na_filter, 0, na_hashset, na_fset, True) except ValueError as e: if str(e) == "Number is float": maybe_int = False @@ -1089,7 +1089,7 @@ cdef class TextReader: # column AS IS with object dtype. 
col_res, na_count = self._convert_with_dtype( np.dtype("object"), i, start, end, 0, - 0, na_hashset, na_fset) + 0, na_hashset, na_fset, False) except OverflowError: try: col_res, na_count = _try_pylong(self.parser, i, start, @@ -1097,7 +1097,7 @@ cdef class TextReader: except ValueError: col_res, na_count = self._convert_with_dtype( np.dtype("object"), i, start, end, 0, - 0, na_hashset, na_fset) + 0, na_hashset, na_fset, False) if col_res is not None: break @@ -1145,7 +1145,7 @@ cdef class TextReader: bint na_filter, bint user_dtype, kh_str_starts_t *na_hashset, - set na_fset): + set na_fset, bint raise_on_float): if isinstance(dtype, CategoricalDtype): # TODO: I suspect that _categorical_convert could be # optimized when dtype is an instance of CategoricalDtype @@ -1186,14 +1186,14 @@ cdef class TextReader: elif dtype.kind in "iu": try: - result, na_count = _try_int64(self.parser, i, start, - end, na_filter, na_hashset) + result, na_count = _try_int64(self.parser, i, start, end, + na_filter, na_hashset, raise_on_float) if user_dtype and na_count is not None: if na_count > 0: raise ValueError(f"Integer column has NA values in column {i}") except OverflowError: result = _try_uint64(self.parser, i, start, end, - na_filter, na_hashset) + na_filter, na_hashset, raise_on_float) na_count = 0 if result is not None and dtype != "int64": @@ -1752,7 +1752,8 @@ cdef int _try_double_nogil(parser_t *parser, cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_starts_t *na_hashset): + bint na_filter, kh_str_starts_t *na_hashset, + bint raise_on_float): cdef: int error Py_ssize_t lines @@ -1774,9 +1775,10 @@ cdef _try_uint64(parser_t *parser, int64_t col, if error == ERROR_OVERFLOW: # Can't get the word variable raise OverflowError("Overflow") - elif error == ERROR_IS_FLOAT: + elif raise_on_float and error == ERROR_IS_FLOAT: raise ValueError("Number is float") - return None + elif not raise_on_float or error != 
ERROR_IS_FLOAT: + return None, None if uint64_conflict(&state): raise ValueError("Cannot convert to numerical dtype") @@ -1826,7 +1828,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col, cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_starts_t *na_hashset): + bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_float): cdef: int error, na_count = 0 Py_ssize_t lines @@ -1846,9 +1848,10 @@ cdef _try_int64(parser_t *parser, int64_t col, if error == ERROR_OVERFLOW: # Can't get the word variable raise OverflowError("Overflow") - elif error == ERROR_IS_FLOAT: + elif raise_on_float and error == ERROR_IS_FLOAT: raise ValueError("Number is float") - return None, None + elif not raise_on_float or error != ERROR_IS_FLOAT: + return None, None return result, na_count From 7e8033d694456285313118ece947e77e87bea2da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 8 Oct 2025 12:58:40 -0300 Subject: [PATCH 4/9] fix: fix tuple error --- pandas/_libs/parsers.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 5f9ddd7dbb4a0..ffa7a48e4f87f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1778,7 +1778,7 @@ cdef _try_uint64(parser_t *parser, int64_t col, elif raise_on_float and error == ERROR_IS_FLOAT: raise ValueError("Number is float") elif not raise_on_float or error != ERROR_IS_FLOAT: - return None, None + return None if uint64_conflict(&state): raise ValueError("Cannot convert to numerical dtype") From 5219386cace91c898b2fedd13d41c46c6272d3ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 8 Oct 2025 15:25:24 -0300 Subject: [PATCH 5/9] fix: remove decimal_separator argument --- .../_libs/include/pandas/parser/pd_parser.h | 18 ++++++------- .../_libs/include/pandas/parser/tokenizer.h | 8 +++--- pandas/_libs/parsers.pyx | 25 +++++++++-------- 
pandas/_libs/src/parser/tokenizer.c | 27 +++++++------------ 4 files changed, 32 insertions(+), 46 deletions(-) diff --git a/pandas/_libs/include/pandas/parser/pd_parser.h b/pandas/_libs/include/pandas/parser/pd_parser.h index b64664614fbbc..543839b5d75bf 100644 --- a/pandas/_libs/include/pandas/parser/pd_parser.h +++ b/pandas/_libs/include/pandas/parser/pd_parser.h @@ -37,8 +37,8 @@ typedef struct { int (*parser_trim_buffers)(parser_t *); int (*tokenize_all_rows)(parser_t *, const char *); int (*tokenize_nrows)(parser_t *, size_t, const char *); - int64_t (*str_to_int64)(const char *, char, int64_t, int64_t, int *, char); - uint64_t (*str_to_uint64)(uint_state *, const char *, char, int64_t, uint64_t, + int64_t (*str_to_int64)(const char *, int64_t, int64_t, int *, char); + uint64_t (*str_to_uint64)(uint_state *, const char *, int64_t, uint64_t, int *, char); double (*xstrtod)(const char *, char **, char, char, char, int, int *, int *); double (*precise_xstrtod)(const char *, char **, char, char, char, int, int *, @@ -87,14 +87,12 @@ static PandasParser_CAPI *PandasParserAPI = NULL; PandasParserAPI->tokenize_all_rows((self), (encoding_errors)) #define tokenize_nrows(self, nrows, encoding_errors) \ PandasParserAPI->tokenize_nrows((self), (nrows), (encoding_errors)) -#define str_to_int64(p_item, decimal_separator, int_min, int_max, error, \ - t_sep) \ - PandasParserAPI->str_to_int64((p_item), (decimal_separator), (int_min), \ - (int_max), (error), (t_sep)) -#define str_to_uint64(state, p_item, decimal_separator, int_max, uint_max, \ - error, t_sep) \ - PandasParserAPI->str_to_uint64((state), (p_item), (decimal_separator), \ - (int_max), (uint_max), (error), (t_sep)) +#define str_to_int64(p_item, int_min, int_max, error, t_sep) \ + PandasParserAPI->str_to_int64((p_item), (int_min), (int_max), (error), \ + (t_sep)) +#define str_to_uint64(state, p_item, int_max, uint_max, error, t_sep) \ + PandasParserAPI->str_to_uint64((state), (p_item), (int_max), (uint_max), \ + 
(error), (t_sep)) #define xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int) \ PandasParserAPI->xstrtod((p), (q), (decimal), (sci), (tsep), \ (skip_trailing), (error), (maybe_int)) diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h index b6200c0032dba..209f375a5bf6c 100644 --- a/pandas/_libs/include/pandas/parser/tokenizer.h +++ b/pandas/_libs/include/pandas/parser/tokenizer.h @@ -17,7 +17,6 @@ See LICENSE for the license #define ERROR_NO_DIGITS 1 #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 -#define ERROR_IS_FLOAT 4 #include @@ -209,11 +208,10 @@ void uint_state_init(uint_state *self); int uint64_conflict(uint_state *self); -uint64_t str_to_uint64(uint_state *state, const char *p_item, - char decimal_separator, int64_t int_max, +uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep); -int64_t str_to_int64(const char *p_item, char decimal_separator, - int64_t int_min, int64_t int_max, int *error, char tsep); +int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, + int *error, char tsep); double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); double precise_xstrtod(const char *p, char **q, char decimal, char sci, diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index ffa7a48e4f87f..785be76eb0545 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h": SKIP_LINE FINISHED - enum: ERROR_OVERFLOW, ERROR_IS_FLOAT + enum: ERROR_OVERFLOW, ERROR_INVALID_CHARS ctypedef enum BadLineHandleMethod: ERROR, @@ -281,11 +281,10 @@ cdef extern from "pandas/parser/pd_parser.h": int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil - int64_t 
str_to_int64(char *p_item, char decimal_separator, int64_t int_min, + int64_t str_to_int64(char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) nogil - uint64_t str_to_uint64(uint_state *state, char *p_item, char decimal_separator, - int64_t int_max, uint64_t uint_max, - int *error, char tsep) nogil + uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max, + uint64_t uint_max, int *error, char tsep) nogil double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, @@ -1775,9 +1774,9 @@ cdef _try_uint64(parser_t *parser, int64_t col, if error == ERROR_OVERFLOW: # Can't get the word variable raise OverflowError("Overflow") - elif raise_on_float and error == ERROR_IS_FLOAT: + elif raise_on_float and error == ERROR_INVALID_CHARS: raise ValueError("Number is float") - elif not raise_on_float or error != ERROR_IS_FLOAT: + elif not raise_on_float or error != ERROR_INVALID_CHARS: return None if uint64_conflict(&state): @@ -1811,14 +1810,14 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col, data[i] = 0 continue - data[i] = str_to_uint64(state, word, parser.decimal, INT64_MAX, UINT64_MAX, + data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX, &error, parser.thousands) if error != 0: return error else: for i in range(lines): COLITER_NEXT(it, word) - data[i] = str_to_uint64(state, word, parser.decimal, INT64_MAX, UINT64_MAX, + data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX, &error, parser.thousands) if error != 0: return error @@ -1848,9 +1847,9 @@ cdef _try_int64(parser_t *parser, int64_t col, if error == ERROR_OVERFLOW: # Can't get the word variable raise OverflowError("Overflow") - elif raise_on_float and error == ERROR_IS_FLOAT: + elif raise_on_float and error == ERROR_INVALID_CHARS: raise ValueError("Number is float") - elif not raise_on_float or error != ERROR_IS_FLOAT: + elif not raise_on_float or error != ERROR_INVALID_CHARS: return None, None return result, na_count @@ 
-1879,14 +1878,14 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col, data[i] = NA continue - data[i] = str_to_int64(word, parser.decimal, INT64_MIN, INT64_MAX, + data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: return error else: for i in range(lines): COLITER_NEXT(it, word) - data[i] = str_to_int64(word, parser.decimal, INT64_MIN, INT64_MAX, + data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: return error diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index de1ad4454f294..059aa945b32bf 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1834,8 +1834,8 @@ int uint64_conflict(uint_state *self) { return self->seen_uint && (self->seen_sint || self->seen_null); } -int64_t str_to_int64(const char *p_item, char decimal_separator, - int64_t int_min, int64_t int_max, int *error, char tsep) { +int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, + int *error, char tsep) { const char *p = p_item; // Skip leading spaces. while (isspace_ascii(*p)) { @@ -1942,8 +1942,8 @@ int64_t str_to_int64(const char *p_item, char decimal_separator, } // check if is float - if (*p == decimal_separator || *p == 'e' || *p == 'E') { - *error = ERROR_IS_FLOAT; + if (*p != '\0') { + *error = ERROR_INVALID_CHARS; } return 0; } @@ -1955,11 +1955,7 @@ int64_t str_to_int64(const char *p_item, char decimal_separator, // Did we use up all the characters? 
if (*p) { - if (*p == decimal_separator || *p == 'e' || *p == 'E') { - *error = ERROR_IS_FLOAT; - } else { - *error = ERROR_INVALID_CHARS; - } + *error = ERROR_INVALID_CHARS; return 0; } @@ -1967,8 +1963,7 @@ int64_t str_to_int64(const char *p_item, char decimal_separator, return number; } -uint64_t str_to_uint64(uint_state *state, const char *p_item, - char decimal_separator, int64_t int_max, +uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) { const char *p = p_item; // Skip leading spaces. @@ -2039,8 +2034,8 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, } // check if is float - if (*p == decimal_separator || *p == 'e' || *p == 'E') { - *error = ERROR_IS_FLOAT; + if (*p != '\0') { + *error = ERROR_INVALID_CHARS; } return 0; } @@ -2052,11 +2047,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, // Did we use up all the characters? if (*p) { - if (*p == decimal_separator || *p == 'e' || *p == 'E') { - *error = ERROR_IS_FLOAT; - } else { - *error = ERROR_INVALID_CHARS; - } + *error = ERROR_INVALID_CHARS; return 0; } From 4ff07e3b5ea09bf15c5bcc6ede75c97d326ee7aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 8 Oct 2025 15:35:31 -0300 Subject: [PATCH 6/9] fix: early return on overflow, but still check next chars --- pandas/_libs/src/parser/tokenizer.c | 55 ++++++++++++----------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 059aa945b32bf..2dfa58a460efc 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1834,6 +1834,17 @@ int uint64_conflict(uint_state *self) { return self->seen_uint && (self->seen_sint || self->seen_null); } +static inline void check_for_invalid_char(const char *p_item, int *error) { + while (*p_item != '\0' && isdigit_ascii(*p_item)) { + p_item++; + } + + // check if reached 
the end of string after consuming all digits + if (*p_item != '\0') { + *error = ERROR_INVALID_CHARS; + } +} + int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { const char *p = p_item; @@ -1879,7 +1890,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, d = *++p; } else { *error = ERROR_OVERFLOW; - break; + check_for_invalid_char(p, error); + return 0; } } } else { @@ -1890,7 +1902,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, d = *++p; } else { *error = ERROR_OVERFLOW; - break; + check_for_invalid_char(p, error); + return 0; } } } @@ -1917,7 +1930,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } else { *error = ERROR_OVERFLOW; - break; + check_for_invalid_char(p, error); + return 0; } } } else { @@ -1929,25 +1943,13 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } else { *error = ERROR_OVERFLOW; - break; + check_for_invalid_char(p, error); + return 0; } } } } - if (*error == ERROR_OVERFLOW) { - // advance digits - while (*p != '\0' && isdigit_ascii(*p)) { - p++; - } - - // check if is float - if (*p != '\0') { - *error = ERROR_INVALID_CHARS; - } - return 0; - } - // Skip trailing spaces. while (isspace_ascii(*p)) { ++p; @@ -2010,7 +2012,8 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } else { *error = ERROR_OVERFLOW; - break; + check_for_invalid_char(p, error); + return 0; } } } else { @@ -2022,24 +2025,12 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } else { *error = ERROR_OVERFLOW; - break; + check_for_invalid_char(p, error); + return 0; } } } - if (*error == ERROR_OVERFLOW) { - // advance digits - while (*p != '\0' && isdigit_ascii(*p)) { - p++; - } - - // check if is float - if (*p != '\0') { - *error = ERROR_INVALID_CHARS; - } - return 0; - } - // Skip trailing spaces. 
while (isspace_ascii(*p)) { ++p; From c7fc2927bb282c0fa5e4a50ce73f017a51483664 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 8 Oct 2025 16:18:45 -0300 Subject: [PATCH 7/9] fix: don't flag int with trailing whitespace as invalid --- pandas/_libs/src/parser/tokenizer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 2dfa58a460efc..e8d794ab8935c 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1839,6 +1839,10 @@ static inline void check_for_invalid_char(const char *p_item, int *error) { p_item++; } + while (*p_item != '\0' && isspace_ascii(*p_item)) { + ++p_item; + } + // check if reached the end of string after consuming all digits if (*p_item != '\0') { *error = ERROR_INVALID_CHARS; From 4c8d77041e05a5fcb582ba1fbe6e5c7cca343b03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 8 Oct 2025 16:23:00 -0300 Subject: [PATCH 8/9] chore: better error message --- pandas/_libs/parsers.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 785be76eb0545..68027019e4c3f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1078,7 +1078,7 @@ cdef class TextReader: col_res, na_count = self._convert_with_dtype( dt, i, start, end, na_filter, 0, na_hashset, na_fset, True) except ValueError as e: - if str(e) == "Number is float": + if str(e) == "Number is not int": maybe_int = False continue else: @@ -1775,7 +1775,7 @@ cdef _try_uint64(parser_t *parser, int64_t col, # Can't get the word variable raise OverflowError("Overflow") elif raise_on_float and error == ERROR_INVALID_CHARS: - raise ValueError("Number is float") + raise ValueError("Number is not int") elif not raise_on_float or error != ERROR_INVALID_CHARS: return None @@ -1848,7 +1848,7 @@ cdef _try_int64(parser_t *parser, int64_t col, # Can't get the 
word variable raise OverflowError("Overflow") elif raise_on_float and error == ERROR_INVALID_CHARS: - raise ValueError("Number is float") + raise ValueError("Number is not int") elif not raise_on_float or error != ERROR_INVALID_CHARS: return None, None From 35f075a03341e4372a251cc99da0dc3e683c8a1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 8 Oct 2025 18:21:55 -0300 Subject: [PATCH 9/9] docs: document function to check for invalid character --- pandas/_libs/src/parser/tokenizer.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index e8d794ab8935c..1561c16e4fd6c 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1834,7 +1834,28 @@ int uint64_conflict(uint_state *self) { return self->seen_uint && (self->seen_sint || self->seen_null); } -static inline void check_for_invalid_char(const char *p_item, int *error) { +/** + * @brief Validates that a string contains only numeric digits and optional + * trailing whitespace. + * + * This function is used after an integer overflow, + * where it checks the rest of the string for a non-numeric character, + * while also ignoring trailing whitespace. + * + * Pure integer overflows during CSV parsing are converted to PyLongObjects, + * while, if any invalid character is found, it skips integer + * parsing and tries other conversion methods. + * + * @param p_item Pointer to the string to validate for numeric format + * @param error Pointer to indicate error code. + * Set to ERROR_INVALID_CHARS if an invalid character is found. + * + * @return Pointer to the position in the string where validation stopped. + * - If valid: terminates at the null terminator. + * - If invalid: points to the first invalid character encountered. 
+ */ +static inline const char *check_for_invalid_char(const char *p_item, + int *error) { while (*p_item != '\0' && isdigit_ascii(*p_item)) { p_item++; } @@ -1847,6 +1868,8 @@ static inline void check_for_invalid_char(const char *p_item, int *error) { if (*p_item != '\0') { *error = ERROR_INVALID_CHARS; } + + return p_item; } int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,