refactor: add TokenizerError enum

Alvaro-Kothe · Alvaro-Kothe · commit 2e5a47ceaf7a · 2025-10-09T16:51:19.000-03:00
diff --git a/pandas/_libs/include/pandas/parser/pd_parser.h b/pandas/_libs/include/pandas/parser/pd_parser.h
@@ -37,9 +37,10 @@ typedef struct {
   int (*parser_trim_buffers)(parser_t *);
   int (*tokenize_all_rows)(parser_t *, const char *);
   int (*tokenize_nrows)(parser_t *, size_t, const char *);
-  int64_t (*str_to_int64)(const char *, int64_t, int64_t, int *, char);
+  int64_t (*str_to_int64)(const char *, int64_t, int64_t, TokenizerError *,
+                          char);
   uint64_t (*str_to_uint64)(uint_state *, const char *, int64_t, uint64_t,
-                            int *, char);
+                            TokenizerError *, char);
   double (*xstrtod)(const char *, char **, char, char, char, int, int *, int *);
   double (*precise_xstrtod)(const char *, char **, char, char, char, int, int *,
                             int *);
diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h
@@ -14,10 +14,6 @@ See LICENSE for the license
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
 
-#define ERROR_NO_DIGITS 1
-#define ERROR_OVERFLOW 2
-#define ERROR_INVALID_CHARS 3
-
 #include <stdint.h>
 
 #define STREAM_INIT_SIZE 32
@@ -50,6 +46,13 @@ See LICENSE for the license
  *        duplication of some file I/O.
  */
 
+typedef enum { // status written to *error by str_to_int64/str_to_uint64
+  TOKENIZER_OK,        // 0: conversion succeeded, no error
+  ERROR_NO_DIGITS,     // 1: presumably no digits in the input (TODO confirm)
+  ERROR_OVERFLOW,      // 2: presumably value out of range (TODO confirm)
+  ERROR_INVALID_CHARS, // 3: input contains characters that are not digits
+} TokenizerError;
+
 typedef enum {
   START_RECORD,
   START_FIELD,
@@ -209,9 +212,9 @@ void uint_state_init(uint_state *self);
 int uint64_conflict(uint_state *self);
 
 uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
-                       uint64_t uint_max, int *error, char tsep);
+                       uint64_t uint_max, TokenizerError *error, char tsep);
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
-                     int *error, char tsep);
+                     TokenizerError *error, char tsep);
 double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
                int skip_trailing, int *error, int *maybe_int);
 double precise_xstrtod(const char *p, char **q, char decimal, char sci,
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -149,7 +149,10 @@ cdef extern from "pandas/parser/tokenizer.h":
         SKIP_LINE
         FINISHED
 
-    enum: ERROR_OVERFLOW, ERROR_INVALID_CHARS
+    ctypedef enum TokenizerError:
+        TOKENIZER_OK,
+        ERROR_OVERFLOW,
+        ERROR_INVALID_CHARS
 
     ctypedef enum BadLineHandleMethod:
         ERROR,
@@ -282,9 +285,9 @@ cdef extern from "pandas/parser/pd_parser.h":
     int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil
 
     int64_t str_to_int64(char *p_item, int64_t int_min,
-                         int64_t int_max, int *error, char tsep) nogil
+                         int64_t int_max, TokenizerError *error, char tsep) nogil
     uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
-                           uint64_t uint_max, int *error, char tsep) nogil
+                           uint64_t uint_max, TokenizerError *error, char tsep) nogil
 
     double xstrtod(const char *p, char **q, char decimal,
                    char sci, char tsep, int skip_trailing,
@@ -1794,7 +1797,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
                            const kh_str_starts_t *na_hashset,
                            uint64_t *data, uint_state *state) nogil:
     cdef:
-        int error
+        TokenizerError error = TOKENIZER_OK
         Py_ssize_t i, lines = line_end - line_start
         coliter_t it
         const char *word = NULL
@@ -1829,7 +1832,8 @@ cdef _try_int64(parser_t *parser, int64_t col,
                 int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_float):
     cdef:
-        int error, na_count = 0
+        TokenizerError error = TOKENIZER_OK
+        int na_count = 0
         Py_ssize_t lines
         coliter_t it
         int64_t *data
@@ -1855,13 +1859,13 @@ cdef _try_int64(parser_t *parser, int64_t col,
     return result, na_count
 
 
-cdef int _try_int64_nogil(parser_t *parser, int64_t col,
-                          int64_t line_start,
-                          int64_t line_end, bint na_filter,
-                          const kh_str_starts_t *na_hashset, int64_t NA,
-                          int64_t *data, int *na_count) nogil:
+cdef TokenizerError _try_int64_nogil(parser_t *parser, int64_t col,
+                                     int64_t line_start,
+                                     int64_t line_end, bint na_filter,
+                                     const kh_str_starts_t *na_hashset, int64_t NA,
+                                     int64_t *data, int *na_count) nogil:
     cdef:
-        int error
+        TokenizerError error = TOKENIZER_OK
         Py_ssize_t i, lines = line_end - line_start
         coliter_t it
         const char *word = NULL
@@ -1880,17 +1884,17 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,
 
             data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                    &error, parser.thousands)
-            if error != 0:
+            if error != TOKENIZER_OK:
                 return error
     else:
         for i in range(lines):
             COLITER_NEXT(it, word)
             data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                    &error, parser.thousands)
-            if error != 0:
+            if error != TOKENIZER_OK:
                 return error
 
-    return 0
+    return error
 
 cdef _try_pylong(parser_t *parser, Py_ssize_t col,
                  int64_t line_start, int64_t line_end,
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -1849,7 +1849,7 @@ int uint64_conflict(uint_state *self) {
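- * @return Integer 0 if the remainder of the string contains only digits,
- *         otherwise returns the error code for [ERROR_INVALID_CHARS].
+ * @return TOKENIZER_OK if the remainder of the string contains only digits,
+ *         otherwise ERROR_INVALID_CHARS.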
  */
-static inline int check_for_invalid_char(const char *p_item) {
+static inline TokenizerError check_for_invalid_char(const char *p_item) {
   while (*p_item != '\0' && isdigit_ascii(*p_item)) {
     p_item++;
   }
@@ -1859,11 +1859,11 @@ static inline int check_for_invalid_char(const char *p_item) {
     return ERROR_INVALID_CHARS;
   }
 
-  return 0;
+  return TOKENIZER_OK;
 }
 
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
-                     int *error, char tsep) {
+                     TokenizerError *error, char tsep) {
   const char *p = p_item;
   // Skip leading spaces.
   while (isspace_ascii(*p)) {
@@ -1990,12 +1990,12 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     return 0;
   }
 
-  *error = 0;
+  *error = TOKENIZER_OK;
   return number;
 }
 
 uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
-                       uint64_t uint_max, int *error, char tsep) {
+                       uint64_t uint_max, TokenizerError *error, char tsep) {
   const char *p = p_item;
   // Skip leading spaces.
   while (isspace_ascii(*p)) {
@@ -2005,7 +2005,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   // Handle sign.
   if (*p == '-') {
     state->seen_sint = 1;
-    *error = 0;
+    *error = TOKENIZER_OK;
     return 0;
   } else if (*p == '+') {
     p++;
@@ -2081,6 +2081,6 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     state->seen_uint = 1;
   }
 
-  *error = 0;
+  *error = TOKENIZER_OK;
   return number;
 }