From be21b2e58a6f897ea8989894fda8abc24edbbeeb Mon Sep 17 00:00:00 2001
From: Alvaro-Kothe <kothe65@gmail.com>
Date: Tue, 7 Oct 2025 23:28:30 -0300
Subject: [PATCH 1/9] perf: check for float numbers during tokenization

---
 .../_libs/include/pandas/parser/pd_parser.h   |  18 +--
 .../_libs/include/pandas/parser/tokenizer.h   |   8 +-
 pandas/_libs/parsers.pyx                      | 105 ++++++------------
 pandas/_libs/src/parser/tokenizer.c           |  57 ++++++++--
 4 files changed, 93 insertions(+), 95 deletions(-)

diff --git a/pandas/_libs/include/pandas/parser/pd_parser.h b/pandas/_libs/include/pandas/parser/pd_parser.h
index 543839b5d75bf..b64664614fbbc 100644
--- a/pandas/_libs/include/pandas/parser/pd_parser.h
+++ b/pandas/_libs/include/pandas/parser/pd_parser.h
@@ -37,8 +37,8 @@ typedef struct {
   int (*parser_trim_buffers)(parser_t *);
   int (*tokenize_all_rows)(parser_t *, const char *);
   int (*tokenize_nrows)(parser_t *, size_t, const char *);
-  int64_t (*str_to_int64)(const char *, int64_t, int64_t, int *, char);
-  uint64_t (*str_to_uint64)(uint_state *, const char *, int64_t, uint64_t,
+  int64_t (*str_to_int64)(const char *, char, int64_t, int64_t, int *, char);
+  uint64_t (*str_to_uint64)(uint_state *, const char *, char, int64_t, uint64_t,
                             int *, char);
   double (*xstrtod)(const char *, char **, char, char, char, int, int *, int *);
   double (*precise_xstrtod)(const char *, char **, char, char, char, int, int *,
@@ -87,12 +87,14 @@ static PandasParser_CAPI *PandasParserAPI = NULL;
   PandasParserAPI->tokenize_all_rows((self), (encoding_errors))
 #define tokenize_nrows(self, nrows, encoding_errors)                           \
   PandasParserAPI->tokenize_nrows((self), (nrows), (encoding_errors))
-#define str_to_int64(p_item, int_min, int_max, error, t_sep)                   \
-  PandasParserAPI->str_to_int64((p_item), (int_min), (int_max), (error),       \
-                                (t_sep))
-#define str_to_uint64(state, p_item, int_max, uint_max, error, t_sep)          \
-  PandasParserAPI->str_to_uint64((state), (p_item), (int_max), (uint_max),     \
-                                 (error), (t_sep))
+#define str_to_int64(p_item, decimal_separator, int_min, int_max, error,       \
+                     t_sep)                                                    \
+  PandasParserAPI->str_to_int64((p_item), (decimal_separator), (int_min),      \
+                                (int_max), (error), (t_sep))
+#define str_to_uint64(state, p_item, decimal_separator, int_max, uint_max,     \
+                      error, t_sep)                                            \
+  PandasParserAPI->str_to_uint64((state), (p_item), (decimal_separator),       \
+                                 (int_max), (uint_max), (error), (t_sep))
 #define xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int)     \
   PandasParserAPI->xstrtod((p), (q), (decimal), (sci), (tsep),                 \
                            (skip_trailing), (error), (maybe_int))
diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h
index 209f375a5bf6c..b6200c0032dba 100644
--- a/pandas/_libs/include/pandas/parser/tokenizer.h
+++ b/pandas/_libs/include/pandas/parser/tokenizer.h
@@ -17,6 +17,7 @@ See LICENSE for the license
 #define ERROR_NO_DIGITS 1
 #define ERROR_OVERFLOW 2
 #define ERROR_INVALID_CHARS 3
+#define ERROR_IS_FLOAT 4
 
 #include <stdint.h>
 
@@ -208,10 +209,11 @@ void uint_state_init(uint_state *self);
 
 int uint64_conflict(uint_state *self);
 
-uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
+uint64_t str_to_uint64(uint_state *state, const char *p_item,
+                       char decimal_separator, int64_t int_max,
                        uint64_t uint_max, int *error, char tsep);
-int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
-                     int *error, char tsep);
+int64_t str_to_int64(const char *p_item, char decimal_separator,
+                     int64_t int_min, int64_t int_max, int *error, char tsep);
 double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
                int skip_trailing, int *error, int *maybe_int);
 double precise_xstrtod(const char *p, char **q, char decimal, char sci,
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 442891949dfd2..bb46a7ff3f1e8 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h":
         SKIP_LINE
         FINISHED
 
-    enum: ERROR_OVERFLOW
+    enum: ERROR_OVERFLOW, ERROR_IS_FLOAT
 
     ctypedef enum BadLineHandleMethod:
         ERROR,
@@ -281,10 +281,11 @@ cdef extern from "pandas/parser/pd_parser.h":
     int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil
     int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil
 
-    int64_t str_to_int64(char *p_item, int64_t int_min,
+    int64_t str_to_int64(char *p_item, char decimal_separator, int64_t int_min,
                          int64_t int_max, int *error, char tsep) nogil
-    uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
-                           uint64_t uint_max, int *error, char tsep) nogil
+    uint64_t str_to_uint64(uint_state *state, char *p_item, char decimal_separator,
+                           int64_t int_max, uint64_t uint_max,
+                           int *error, char tsep) nogil
 
     double xstrtod(const char *p, char **q, char decimal,
                    char sci, char tsep, int skip_trailing,
@@ -1070,21 +1071,28 @@ cdef class TextReader:
         else:
             col_res = None
             for dt in self.dtype_cast_order:
-                if (dt.kind in "iu" and
-                        self._column_has_float(i, start, end, na_filter, na_hashset)):
-                    continue
-
                 try:
                     col_res, na_count = self._convert_with_dtype(
                         dt, i, start, end, na_filter, 0, na_hashset, na_fset)
-                except ValueError:
-                    # This error is raised from trying to convert to uint64,
-                    # and we discover that we cannot convert to any numerical
-                    # dtype successfully. As a result, we leave the data
-                    # column AS IS with object dtype.
-                    col_res, na_count = self._convert_with_dtype(
-                        np.dtype("object"), i, start, end, 0,
-                        0, na_hashset, na_fset)
+                except ValueError as e:
+                    if str(e) == "Number is float":
+                        try:
+                            col_res, na_count = self._convert_with_dtype(
+                                np.dtype("float64"), i, start, end, 0,
+                                0, na_hashset, na_fset)
+                        except ValueError:
+                            col_res, na_count = self._convert_with_dtype(
+                                np.dtype("object"), i, start, end, 0,
+                                0, na_hashset, na_fset)
+
+                    else:
+                        # This error is raised from trying to convert to uint64,
+                        # and we discover that we cannot convert to any numerical
+                        # dtype successfully. As a result, we leave the data
+                        # column AS IS with object dtype.
+                        col_res, na_count = self._convert_with_dtype(
+                            np.dtype("object"), i, start, end, 0,
+                            0, na_hashset, na_fset)
                 except OverflowError:
                     try:
                         col_res, na_count = _try_pylong(self.parser, i, start,
@@ -1351,59 +1359,6 @@ cdef class TextReader:
             else:
                 return None
 
-    cdef bint _column_has_float(self, Py_ssize_t col,
-                                int64_t start, int64_t end,
-                                bint na_filter, kh_str_starts_t *na_hashset):
-        """Check if the column contains any float number."""
-        cdef:
-            Py_ssize_t i, j, lines = end - start
-            coliter_t it
-            const char *word = NULL
-            const char *ignored_chars = " +-"
-            const char *digits = "0123456789"
-            const char *float_indicating_chars = "eE"
-            char null_byte = 0
-
-        coliter_setup(&it, self.parser, col, start)
-
-        for i in range(lines):
-            COLITER_NEXT(it, word)
-
-            if na_filter and kh_get_str_starts_item(na_hashset, word):
-                continue
-
-            found_first_digit = False
-            j = 0
-            while word[j] != null_byte:
-                if word[j] == self.parser.decimal:
-                    return True
-                elif not found_first_digit and word[j] in ignored_chars:
-                    # no-op
-                    pass
-                elif not found_first_digit and word[j] not in digits:
-                    # word isn't numeric
-                    return False
-                elif not found_first_digit and word[j] in digits:
-                    found_first_digit = True
-                elif word[j] in float_indicating_chars:
-                    # preceding chars indicates numeric and
-                    # current char indicates float
-                    return True
-                elif word[j] not in digits:
-                    # previous characters indicates numeric
-                    # current character shows otherwise
-                    return False
-                elif word[j] in digits:
-                    # no-op
-                    pass
-                else:
-                    raise AssertionError(
-                            f"Unhandled case {word[j]=} {found_first_digit=}"
-                            )
-                j += 1
-
-        return False
-
 # Factor out code common to TextReader.__dealloc__ and TextReader.close
 # It cannot be a class method, since calling self.close() in __dealloc__
 # which causes a class attribute lookup and violates best practices
@@ -1822,6 +1777,8 @@ cdef _try_uint64(parser_t *parser, int64_t col,
         if error == ERROR_OVERFLOW:
             # Can't get the word variable
             raise OverflowError("Overflow")
+        elif error == ERROR_IS_FLOAT:
+            raise ValueError("Number is float")
         return None
 
     if uint64_conflict(&state):
@@ -1855,14 +1812,14 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
                 data[i] = 0
                 continue
 
-            data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
+            data[i] = str_to_uint64(state, word, parser.decimal, INT64_MAX, UINT64_MAX,
                                     &error, parser.thousands)
             if error != 0:
                 return error
     else:
         for i in range(lines):
             COLITER_NEXT(it, word)
-            data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
+            data[i] = str_to_uint64(state, word, parser.decimal, INT64_MAX, UINT64_MAX,
                                     &error, parser.thousands)
             if error != 0:
                 return error
@@ -1892,6 +1849,8 @@ cdef _try_int64(parser_t *parser, int64_t col,
         if error == ERROR_OVERFLOW:
             # Can't get the word variable
             raise OverflowError("Overflow")
+        elif error == ERROR_IS_FLOAT:
+            raise ValueError("Number is float")
         return None, None
 
     return result, na_count
@@ -1920,14 +1879,14 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,
                 data[i] = NA
                 continue
 
-            data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
+            data[i] = str_to_int64(word, parser.decimal, INT64_MIN, INT64_MAX,
                                    &error, parser.thousands)
             if error != 0:
                 return error
     else:
         for i in range(lines):
             COLITER_NEXT(it, word)
-            data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
+            data[i] = str_to_int64(word, parser.decimal, INT64_MIN, INT64_MAX,
                                    &error, parser.thousands)
             if error != 0:
                 return error
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 61e96fc835e4d..de1ad4454f294 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1834,8 +1834,8 @@ int uint64_conflict(uint_state *self) {
   return self->seen_uint && (self->seen_sint || self->seen_null);
 }
 
-int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
-                     int *error, char tsep) {
+int64_t str_to_int64(const char *p_item, char decimal_separator,
+                     int64_t int_min, int64_t int_max, int *error, char tsep) {
   const char *p = p_item;
   // Skip leading spaces.
   while (isspace_ascii(*p)) {
@@ -1879,7 +1879,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
           d = *++p;
         } else {
           *error = ERROR_OVERFLOW;
-          return 0;
+          break;
         }
       }
     } else {
@@ -1890,7 +1890,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
           d = *++p;
         } else {
           *error = ERROR_OVERFLOW;
-          return 0;
+          break;
         }
       }
     }
@@ -1917,7 +1917,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
 
         } else {
           *error = ERROR_OVERFLOW;
-          return 0;
+          break;
         }
       }
     } else {
@@ -1929,12 +1929,25 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
 
         } else {
           *error = ERROR_OVERFLOW;
-          return 0;
+          break;
         }
       }
     }
   }
 
+  if (*error == ERROR_OVERFLOW) {
+    // advance digits
+    while (*p != '\0' && isdigit_ascii(*p)) {
+      p++;
+    }
+
+    // check if is float
+    if (*p == decimal_separator || *p == 'e' || *p == 'E') {
+      *error = ERROR_IS_FLOAT;
+    }
+    return 0;
+  }
+
   // Skip trailing spaces.
   while (isspace_ascii(*p)) {
     ++p;
@@ -1942,7 +1955,11 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
 
   // Did we use up all the characters?
   if (*p) {
-    *error = ERROR_INVALID_CHARS;
+    if (*p == decimal_separator || *p == 'e' || *p == 'E') {
+      *error = ERROR_IS_FLOAT;
+    } else {
+      *error = ERROR_INVALID_CHARS;
+    }
     return 0;
   }
 
@@ -1950,7 +1967,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   return number;
 }
 
-uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
+uint64_t str_to_uint64(uint_state *state, const char *p_item,
+                       char decimal_separator, int64_t int_max,
                        uint64_t uint_max, int *error, char tsep) {
   const char *p = p_item;
   // Skip leading spaces.
@@ -1997,7 +2015,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
 
       } else {
         *error = ERROR_OVERFLOW;
-        return 0;
+        break;
       }
     }
   } else {
@@ -2009,11 +2027,24 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
 
       } else {
         *error = ERROR_OVERFLOW;
-        return 0;
+        break;
       }
     }
   }
 
+  if (*error == ERROR_OVERFLOW) {
+    // advance digits
+    while (*p != '\0' && isdigit_ascii(*p)) {
+      p++;
+    }
+
+    // check if is float
+    if (*p == decimal_separator || *p == 'e' || *p == 'E') {
+      *error = ERROR_IS_FLOAT;
+    }
+    return 0;
+  }
+
   // Skip trailing spaces.
   while (isspace_ascii(*p)) {
     ++p;
@@ -2021,7 +2052,11 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
 
   // Did we use up all the characters?
   if (*p) {
-    *error = ERROR_INVALID_CHARS;
+    if (*p == decimal_separator || *p == 'e' || *p == 'E') {
+      *error = ERROR_IS_FLOAT;
+    } else {
+      *error = ERROR_INVALID_CHARS;
+    }
     return 0;
   }
 

From fc10a5f487d48a839c8a694437a3069e1739133a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 8 Oct 2025 11:55:00 -0300
Subject: [PATCH 2/9] fix: try other dtypes instead of skipping to float64

---
 pandas/_libs/parsers.pyx | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index bb46a7ff3f1e8..de33f6b10109d 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1070,21 +1070,18 @@ cdef class TextReader:
             return self._string_convert(i, start, end, na_filter, na_hashset)
         else:
             col_res = None
+            maybe_int = True
             for dt in self.dtype_cast_order:
+                if not maybe_int and dt.kind in "iu":
+                    continue
+
                 try:
                     col_res, na_count = self._convert_with_dtype(
                         dt, i, start, end, na_filter, 0, na_hashset, na_fset)
                 except ValueError as e:
                     if str(e) == "Number is float":
-                        try:
-                            col_res, na_count = self._convert_with_dtype(
-                                np.dtype("float64"), i, start, end, 0,
-                                0, na_hashset, na_fset)
-                        except ValueError:
-                            col_res, na_count = self._convert_with_dtype(
-                                np.dtype("object"), i, start, end, 0,
-                                0, na_hashset, na_fset)
-
+                        maybe_int = False
+                        continue
                     else:
                         # This error is raised from trying to convert to uint64,
                         # and we discover that we cannot convert to any numerical

From ab2fab8d489d96a8f2c9985a5f05070dc613ef16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 8 Oct 2025 12:36:12 -0300
Subject: [PATCH 3/9] fix: don't throw error when casting is expected

---
 pandas/_libs/parsers.pyx | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index de33f6b10109d..5f9ddd7dbb4a0 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1059,7 +1059,7 @@ cdef class TextReader:
         if col_dtype is not None:
             col_res, na_count = self._convert_with_dtype(
                 col_dtype, i, start, end, na_filter,
-                1, na_hashset, na_fset)
+                1, na_hashset, na_fset, False)
 
             # Fallback on the parse (e.g. we requested int dtype,
             # but its actually a float).
@@ -1077,7 +1077,7 @@ cdef class TextReader:
 
                 try:
                     col_res, na_count = self._convert_with_dtype(
-                        dt, i, start, end, na_filter, 0, na_hashset, na_fset)
+                        dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
                 except ValueError as e:
                     if str(e) == "Number is float":
                         maybe_int = False
@@ -1089,7 +1089,7 @@ cdef class TextReader:
                         # column AS IS with object dtype.
                         col_res, na_count = self._convert_with_dtype(
                             np.dtype("object"), i, start, end, 0,
-                            0, na_hashset, na_fset)
+                            0, na_hashset, na_fset, False)
                 except OverflowError:
                     try:
                         col_res, na_count = _try_pylong(self.parser, i, start,
@@ -1097,7 +1097,7 @@ cdef class TextReader:
                     except ValueError:
                         col_res, na_count = self._convert_with_dtype(
                             np.dtype("object"), i, start, end, 0,
-                            0, na_hashset, na_fset)
+                            0, na_hashset, na_fset, False)
 
                 if col_res is not None:
                     break
@@ -1145,7 +1145,7 @@ cdef class TextReader:
                              bint na_filter,
                              bint user_dtype,
                              kh_str_starts_t *na_hashset,
-                             set na_fset):
+                             set na_fset, bint raise_on_float):
         if isinstance(dtype, CategoricalDtype):
             # TODO: I suspect that _categorical_convert could be
             # optimized when dtype is an instance of CategoricalDtype
@@ -1186,14 +1186,14 @@ cdef class TextReader:
 
         elif dtype.kind in "iu":
             try:
-                result, na_count = _try_int64(self.parser, i, start,
-                                              end, na_filter, na_hashset)
+                result, na_count = _try_int64(self.parser, i, start, end,
+                                              na_filter, na_hashset, raise_on_float)
                 if user_dtype and na_count is not None:
                     if na_count > 0:
                         raise ValueError(f"Integer column has NA values in column {i}")
             except OverflowError:
                 result = _try_uint64(self.parser, i, start, end,
-                                     na_filter, na_hashset)
+                                     na_filter, na_hashset, raise_on_float)
                 na_count = 0
 
             if result is not None and dtype != "int64":
@@ -1752,7 +1752,8 @@ cdef int _try_double_nogil(parser_t *parser,
 
 cdef _try_uint64(parser_t *parser, int64_t col,
                  int64_t line_start, int64_t line_end,
-                 bint na_filter, kh_str_starts_t *na_hashset):
+                 bint na_filter, kh_str_starts_t *na_hashset,
+                 bint raise_on_float):
     cdef:
         int error
         Py_ssize_t lines
@@ -1774,9 +1775,10 @@ cdef _try_uint64(parser_t *parser, int64_t col,
         if error == ERROR_OVERFLOW:
             # Can't get the word variable
             raise OverflowError("Overflow")
-        elif error == ERROR_IS_FLOAT:
+        elif raise_on_float and error == ERROR_IS_FLOAT:
             raise ValueError("Number is float")
-        return None
+        elif not raise_on_float or error != ERROR_IS_FLOAT:
+            return None, None
 
     if uint64_conflict(&state):
         raise ValueError("Cannot convert to numerical dtype")
@@ -1826,7 +1828,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
 
 cdef _try_int64(parser_t *parser, int64_t col,
                 int64_t line_start, int64_t line_end,
-                bint na_filter, kh_str_starts_t *na_hashset):
+                bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_float):
     cdef:
         int error, na_count = 0
         Py_ssize_t lines
@@ -1846,9 +1848,10 @@ cdef _try_int64(parser_t *parser, int64_t col,
         if error == ERROR_OVERFLOW:
             # Can't get the word variable
             raise OverflowError("Overflow")
-        elif error == ERROR_IS_FLOAT:
+        elif raise_on_float and error == ERROR_IS_FLOAT:
             raise ValueError("Number is float")
-        return None, None
+        elif not raise_on_float or error != ERROR_IS_FLOAT:
+            return None, None
 
     return result, na_count
 

From 7e8033d694456285313118ece947e77e87bea2da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 8 Oct 2025 12:58:40 -0300
Subject: [PATCH 4/9] fix: return None instead of a tuple from _try_uint64

---
 pandas/_libs/parsers.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 5f9ddd7dbb4a0..ffa7a48e4f87f 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1778,7 +1778,7 @@ cdef _try_uint64(parser_t *parser, int64_t col,
         elif raise_on_float and error == ERROR_IS_FLOAT:
             raise ValueError("Number is float")
         elif not raise_on_float or error != ERROR_IS_FLOAT:
-            return None, None
+            return None
 
     if uint64_conflict(&state):
         raise ValueError("Cannot convert to numerical dtype")

From 5219386cace91c898b2fedd13d41c46c6272d3ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 8 Oct 2025 15:25:24 -0300
Subject: [PATCH 5/9] fix: remove decimal_separator argument

---
 .../_libs/include/pandas/parser/pd_parser.h   | 18 ++++++-------
 .../_libs/include/pandas/parser/tokenizer.h   |  8 +++---
 pandas/_libs/parsers.pyx                      | 25 +++++++++--------
 pandas/_libs/src/parser/tokenizer.c           | 27 +++++++------------
 4 files changed, 32 insertions(+), 46 deletions(-)

diff --git a/pandas/_libs/include/pandas/parser/pd_parser.h b/pandas/_libs/include/pandas/parser/pd_parser.h
index b64664614fbbc..543839b5d75bf 100644
--- a/pandas/_libs/include/pandas/parser/pd_parser.h
+++ b/pandas/_libs/include/pandas/parser/pd_parser.h
@@ -37,8 +37,8 @@ typedef struct {
   int (*parser_trim_buffers)(parser_t *);
   int (*tokenize_all_rows)(parser_t *, const char *);
   int (*tokenize_nrows)(parser_t *, size_t, const char *);
-  int64_t (*str_to_int64)(const char *, char, int64_t, int64_t, int *, char);
-  uint64_t (*str_to_uint64)(uint_state *, const char *, char, int64_t, uint64_t,
+  int64_t (*str_to_int64)(const char *, int64_t, int64_t, int *, char);
+  uint64_t (*str_to_uint64)(uint_state *, const char *, int64_t, uint64_t,
                             int *, char);
   double (*xstrtod)(const char *, char **, char, char, char, int, int *, int *);
   double (*precise_xstrtod)(const char *, char **, char, char, char, int, int *,
@@ -87,14 +87,12 @@ static PandasParser_CAPI *PandasParserAPI = NULL;
   PandasParserAPI->tokenize_all_rows((self), (encoding_errors))
 #define tokenize_nrows(self, nrows, encoding_errors)                           \
   PandasParserAPI->tokenize_nrows((self), (nrows), (encoding_errors))
-#define str_to_int64(p_item, decimal_separator, int_min, int_max, error,       \
-                     t_sep)                                                    \
-  PandasParserAPI->str_to_int64((p_item), (decimal_separator), (int_min),      \
-                                (int_max), (error), (t_sep))
-#define str_to_uint64(state, p_item, decimal_separator, int_max, uint_max,     \
-                      error, t_sep)                                            \
-  PandasParserAPI->str_to_uint64((state), (p_item), (decimal_separator),       \
-                                 (int_max), (uint_max), (error), (t_sep))
+#define str_to_int64(p_item, int_min, int_max, error, t_sep)                   \
+  PandasParserAPI->str_to_int64((p_item), (int_min), (int_max), (error),       \
+                                (t_sep))
+#define str_to_uint64(state, p_item, int_max, uint_max, error, t_sep)          \
+  PandasParserAPI->str_to_uint64((state), (p_item), (int_max), (uint_max),     \
+                                 (error), (t_sep))
 #define xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int)     \
   PandasParserAPI->xstrtod((p), (q), (decimal), (sci), (tsep),                 \
                            (skip_trailing), (error), (maybe_int))
diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h
index b6200c0032dba..209f375a5bf6c 100644
--- a/pandas/_libs/include/pandas/parser/tokenizer.h
+++ b/pandas/_libs/include/pandas/parser/tokenizer.h
@@ -17,7 +17,6 @@ See LICENSE for the license
 #define ERROR_NO_DIGITS 1
 #define ERROR_OVERFLOW 2
 #define ERROR_INVALID_CHARS 3
-#define ERROR_IS_FLOAT 4
 
 #include <stdint.h>
 
@@ -209,11 +208,10 @@ void uint_state_init(uint_state *self);
 
 int uint64_conflict(uint_state *self);
 
-uint64_t str_to_uint64(uint_state *state, const char *p_item,
-                       char decimal_separator, int64_t int_max,
+uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
                        uint64_t uint_max, int *error, char tsep);
-int64_t str_to_int64(const char *p_item, char decimal_separator,
-                     int64_t int_min, int64_t int_max, int *error, char tsep);
+int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
+                     int *error, char tsep);
 double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
                int skip_trailing, int *error, int *maybe_int);
 double precise_xstrtod(const char *p, char **q, char decimal, char sci,
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index ffa7a48e4f87f..785be76eb0545 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h":
         SKIP_LINE
         FINISHED
 
-    enum: ERROR_OVERFLOW, ERROR_IS_FLOAT
+    enum: ERROR_OVERFLOW, ERROR_INVALID_CHARS
 
     ctypedef enum BadLineHandleMethod:
         ERROR,
@@ -281,11 +281,10 @@ cdef extern from "pandas/parser/pd_parser.h":
     int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil
     int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil
 
-    int64_t str_to_int64(char *p_item, char decimal_separator, int64_t int_min,
+    int64_t str_to_int64(char *p_item, int64_t int_min,
                          int64_t int_max, int *error, char tsep) nogil
-    uint64_t str_to_uint64(uint_state *state, char *p_item, char decimal_separator,
-                           int64_t int_max, uint64_t uint_max,
-                           int *error, char tsep) nogil
+    uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
+                           uint64_t uint_max, int *error, char tsep) nogil
 
     double xstrtod(const char *p, char **q, char decimal,
                    char sci, char tsep, int skip_trailing,
@@ -1775,9 +1774,9 @@ cdef _try_uint64(parser_t *parser, int64_t col,
         if error == ERROR_OVERFLOW:
             # Can't get the word variable
             raise OverflowError("Overflow")
-        elif raise_on_float and error == ERROR_IS_FLOAT:
+        elif raise_on_float and error == ERROR_INVALID_CHARS:
             raise ValueError("Number is float")
-        elif not raise_on_float or error != ERROR_IS_FLOAT:
+        elif not raise_on_float or error != ERROR_INVALID_CHARS:
             return None
 
     if uint64_conflict(&state):
@@ -1811,14 +1810,14 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
                 data[i] = 0
                 continue
 
-            data[i] = str_to_uint64(state, word, parser.decimal, INT64_MAX, UINT64_MAX,
+            data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
                                     &error, parser.thousands)
             if error != 0:
                 return error
     else:
         for i in range(lines):
             COLITER_NEXT(it, word)
-            data[i] = str_to_uint64(state, word, parser.decimal, INT64_MAX, UINT64_MAX,
+            data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
                                     &error, parser.thousands)
             if error != 0:
                 return error
@@ -1848,9 +1847,9 @@ cdef _try_int64(parser_t *parser, int64_t col,
         if error == ERROR_OVERFLOW:
             # Can't get the word variable
             raise OverflowError("Overflow")
-        elif raise_on_float and error == ERROR_IS_FLOAT:
+        elif raise_on_float and error == ERROR_INVALID_CHARS:
             raise ValueError("Number is float")
-        elif not raise_on_float or error != ERROR_IS_FLOAT:
+        elif not raise_on_float or error != ERROR_INVALID_CHARS:
             return None, None
 
     return result, na_count
@@ -1879,14 +1878,14 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,
                 data[i] = NA
                 continue
 
-            data[i] = str_to_int64(word, parser.decimal, INT64_MIN, INT64_MAX,
+            data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                    &error, parser.thousands)
             if error != 0:
                 return error
     else:
         for i in range(lines):
             COLITER_NEXT(it, word)
-            data[i] = str_to_int64(word, parser.decimal, INT64_MIN, INT64_MAX,
+            data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                    &error, parser.thousands)
             if error != 0:
                 return error
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index de1ad4454f294..059aa945b32bf 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1834,8 +1834,8 @@ int uint64_conflict(uint_state *self) {
   return self->seen_uint && (self->seen_sint || self->seen_null);
 }
 
-int64_t str_to_int64(const char *p_item, char decimal_separator,
-                     int64_t int_min, int64_t int_max, int *error, char tsep) {
+int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
+                     int *error, char tsep) {
   const char *p = p_item;
   // Skip leading spaces.
   while (isspace_ascii(*p)) {
@@ -1942,8 +1942,8 @@ int64_t str_to_int64(const char *p_item, char decimal_separator,
     }
 
     // check if is float
-    if (*p == decimal_separator || *p == 'e' || *p == 'E') {
-      *error = ERROR_IS_FLOAT;
+    if (*p != '\0') {
+      *error = ERROR_INVALID_CHARS;
     }
     return 0;
   }
@@ -1955,11 +1955,7 @@ int64_t str_to_int64(const char *p_item, char decimal_separator,
 
   // Did we use up all the characters?
   if (*p) {
-    if (*p == decimal_separator || *p == 'e' || *p == 'E') {
-      *error = ERROR_IS_FLOAT;
-    } else {
-      *error = ERROR_INVALID_CHARS;
-    }
+    *error = ERROR_INVALID_CHARS;
     return 0;
   }
 
@@ -1967,8 +1963,7 @@ int64_t str_to_int64(const char *p_item, char decimal_separator,
   return number;
 }
 
-uint64_t str_to_uint64(uint_state *state, const char *p_item,
-                       char decimal_separator, int64_t int_max,
+uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
                        uint64_t uint_max, int *error, char tsep) {
   const char *p = p_item;
   // Skip leading spaces.
@@ -2039,8 +2034,8 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item,
     }
 
     // check if is float
-    if (*p == decimal_separator || *p == 'e' || *p == 'E') {
-      *error = ERROR_IS_FLOAT;
+    if (*p != '\0') {
+      *error = ERROR_INVALID_CHARS;
     }
     return 0;
   }
@@ -2052,11 +2047,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item,
 
   // Did we use up all the characters?
   if (*p) {
-    if (*p == decimal_separator || *p == 'e' || *p == 'E') {
-      *error = ERROR_IS_FLOAT;
-    } else {
-      *error = ERROR_INVALID_CHARS;
-    }
+    *error = ERROR_INVALID_CHARS;
     return 0;
   }
 

From 4ff07e3b5ea09bf15c5bcc6ede75c97d326ee7aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 8 Oct 2025 15:35:31 -0300
Subject: [PATCH 6/9] fix: early return on overflow, but still check next chars

---
 pandas/_libs/src/parser/tokenizer.c | 55 ++++++++++++-----------------
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 059aa945b32bf..2dfa58a460efc 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1834,6 +1834,17 @@ int uint64_conflict(uint_state *self) {
   return self->seen_uint && (self->seen_sint || self->seen_null);
 }
 
+static inline void check_for_invalid_char(const char *p_item, int *error) {
+  while (*p_item != '\0' && isdigit_ascii(*p_item)) {
+    p_item++;
+  }
+
+  // check if reached the end of string after consuming all digits
+  if (*p_item != '\0') {
+    *error = ERROR_INVALID_CHARS;
+  }
+}
+
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep) {
   const char *p = p_item;
@@ -1879,7 +1890,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
           d = *++p;
         } else {
           *error = ERROR_OVERFLOW;
-          break;
+          check_for_invalid_char(p, error);
+          return 0;
         }
       }
     } else {
@@ -1890,7 +1902,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
           d = *++p;
         } else {
           *error = ERROR_OVERFLOW;
-          break;
+          check_for_invalid_char(p, error);
+          return 0;
         }
       }
     }
@@ -1917,7 +1930,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
 
         } else {
           *error = ERROR_OVERFLOW;
-          break;
+          check_for_invalid_char(p, error);
+          return 0;
         }
       }
     } else {
@@ -1929,25 +1943,13 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
 
         } else {
           *error = ERROR_OVERFLOW;
-          break;
+          check_for_invalid_char(p, error);
+          return 0;
         }
       }
     }
   }
 
-  if (*error == ERROR_OVERFLOW) {
-    // advance digits
-    while (*p != '\0' && isdigit_ascii(*p)) {
-      p++;
-    }
-
-    // check if is float
-    if (*p != '\0') {
-      *error = ERROR_INVALID_CHARS;
-    }
-    return 0;
-  }
-
   // Skip trailing spaces.
   while (isspace_ascii(*p)) {
     ++p;
@@ -2010,7 +2012,8 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
 
       } else {
         *error = ERROR_OVERFLOW;
-        break;
+        check_for_invalid_char(p, error);
+        return 0;
       }
     }
   } else {
@@ -2022,24 +2025,12 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
 
       } else {
         *error = ERROR_OVERFLOW;
-        break;
+        check_for_invalid_char(p, error);
+        return 0;
       }
     }
   }
 
-  if (*error == ERROR_OVERFLOW) {
-    // advance digits
-    while (*p != '\0' && isdigit_ascii(*p)) {
-      p++;
-    }
-
-    // check if is float
-    if (*p != '\0') {
-      *error = ERROR_INVALID_CHARS;
-    }
-    return 0;
-  }
-
   // Skip trailing spaces.
   while (isspace_ascii(*p)) {
     ++p;

From c7fc2927bb282c0fa5e4a50ce73f017a51483664 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 8 Oct 2025 16:18:45 -0300
Subject: [PATCH 7/9] fix: don't flag int with trailing whitespace as invalid

---
 pandas/_libs/src/parser/tokenizer.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 2dfa58a460efc..e8d794ab8935c 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1839,6 +1839,10 @@ static inline void check_for_invalid_char(const char *p_item, int *error) {
     p_item++;
   }
 
+  while (*p_item != '\0' && isspace_ascii(*p_item)) {
+    ++p_item;
+  }
+
   // check if reached the end of string after consuming all digits
   if (*p_item != '\0') {
     *error = ERROR_INVALID_CHARS;

From 4c8d77041e05a5fcb582ba1fbe6e5c7cca343b03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 8 Oct 2025 16:23:00 -0300
Subject: [PATCH 8/9] chore: better error message

---
 pandas/_libs/parsers.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 785be76eb0545..68027019e4c3f 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1078,7 +1078,7 @@ cdef class TextReader:
                     col_res, na_count = self._convert_with_dtype(
                         dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
                 except ValueError as e:
-                    if str(e) == "Number is float":
+                    if str(e) == "Number is not int":
                         maybe_int = False
                         continue
                     else:
@@ -1775,7 +1775,7 @@ cdef _try_uint64(parser_t *parser, int64_t col,
             # Can't get the word variable
             raise OverflowError("Overflow")
         elif raise_on_float and error == ERROR_INVALID_CHARS:
-            raise ValueError("Number is float")
+            raise ValueError("Number is not int")
         elif not raise_on_float or error != ERROR_INVALID_CHARS:
             return None
 
@@ -1848,7 +1848,7 @@ cdef _try_int64(parser_t *parser, int64_t col,
             # Can't get the word variable
             raise OverflowError("Overflow")
         elif raise_on_float and error == ERROR_INVALID_CHARS:
-            raise ValueError("Number is float")
+            raise ValueError("Number is not int")
         elif not raise_on_float or error != ERROR_INVALID_CHARS:
             return None, None
 

From 35f075a03341e4372a251cc99da0dc3e683c8a1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 8 Oct 2025 18:21:55 -0300
Subject: [PATCH 9/9] docs: document function to check for invalid character

---
 pandas/_libs/src/parser/tokenizer.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index e8d794ab8935c..1561c16e4fd6c 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1834,7 +1834,28 @@ int uint64_conflict(uint_state *self) {
   return self->seen_uint && (self->seen_sint || self->seen_null);
 }
 
-static inline void check_for_invalid_char(const char *p_item, int *error) {
+/**
+ * @brief Validates that a string contains only numeric digits and optional
+ * trailing whitespace.
+ *
+ * This function is used after an integer overflow,
+ * where it checks the rest of the string for a non-numeric character,
+ * while also ignoring trailing whitespace.
+ *
+ * Pure integer overflows during CSV parsing are converted to PyLongObjects,
+ * while, if any invalid character is found, it skips integer
+ * parsing and tries other conversion methods.
+ *
+ * @param p_item Pointer to the string to validate for numeric format
+ * @param error Pointer to indicate error code.
+ *        Set to ERROR_INVALID_CHARS if an invalid character is found.
+ *
+ * @return Pointer to the position in the string where validation stopped.
+ *         - If valid: points to the null terminator.
+ *         - If invalid: points to the first invalid character encountered.
+ */
+static inline const char *check_for_invalid_char(const char *p_item,
+                                                 int *error) {
   while (*p_item != '\0' && isdigit_ascii(*p_item)) {
     p_item++;
   }
@@ -1847,6 +1868,8 @@ static inline void check_for_invalid_char(const char *p_item, int *error) {
   if (*p_item != '\0') {
     *error = ERROR_INVALID_CHARS;
   }
+
+  return p_item;
 }
 
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,