Skip to content

Commit 2e5a47c

Browse files
committed
refactor: add TokenizerError enum
1 parent cf0a26d commit 2e5a47c

File tree

4 files changed: 37 additions and 29 deletions.

pandas/_libs/include/pandas/parser/pd_parser.h

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -37,9 +37,10 @@ typedef struct {
3737
int (*parser_trim_buffers)(parser_t *);
3838
int (*tokenize_all_rows)(parser_t *, const char *);
3939
int (*tokenize_nrows)(parser_t *, size_t, const char *);
40-
int64_t (*str_to_int64)(const char *, int64_t, int64_t, int *, char);
40+
int64_t (*str_to_int64)(const char *, int64_t, int64_t, TokenizerError *,
41+
char);
4142
uint64_t (*str_to_uint64)(uint_state *, const char *, int64_t, uint64_t,
42-
int *, char);
43+
TokenizerError *, char);
4344
double (*xstrtod)(const char *, char **, char, char, char, int, int *, int *);
4445
double (*precise_xstrtod)(const char *, char **, char, char, char, int, int *,
4546
int *);

pandas/_libs/include/pandas/parser/tokenizer.h

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -14,10 +14,6 @@ See LICENSE for the license
1414
#define PY_SSIZE_T_CLEAN
1515
#include <Python.h>
1616

17-
#define ERROR_NO_DIGITS 1
18-
#define ERROR_OVERFLOW 2
19-
#define ERROR_INVALID_CHARS 3
20-
2117
#include <stdint.h>
2218

2319
#define STREAM_INIT_SIZE 32
@@ -50,6 +46,13 @@ See LICENSE for the license
5046
* duplication of some file I/O.
5147
*/
5248

49+
typedef enum {
50+
TOKENIZER_OK,
51+
ERROR_NO_DIGITS,
52+
ERROR_OVERFLOW,
53+
ERROR_INVALID_CHARS,
54+
} TokenizerError;
55+
5356
typedef enum {
5457
START_RECORD,
5558
START_FIELD,
@@ -209,9 +212,9 @@ void uint_state_init(uint_state *self);
209212
int uint64_conflict(uint_state *self);
210213

211214
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
212-
uint64_t uint_max, int *error, char tsep);
215+
uint64_t uint_max, TokenizerError *error, char tsep);
213216
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
214-
int *error, char tsep);
217+
TokenizerError *error, char tsep);
215218
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
216219
int skip_trailing, int *error, int *maybe_int);
217220
double precise_xstrtod(const char *p, char **q, char decimal, char sci,

pandas/_libs/parsers.pyx

Lines changed: 18 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -149,7 +149,10 @@ cdef extern from "pandas/parser/tokenizer.h":
149149
SKIP_LINE
150150
FINISHED
151151

152-
enum: ERROR_OVERFLOW, ERROR_INVALID_CHARS
152+
ctypedef enum TokenizerError:
153+
TOKENIZER_OK,
154+
ERROR_OVERFLOW,
155+
ERROR_INVALID_CHARS
153156

154157
ctypedef enum BadLineHandleMethod:
155158
ERROR,
@@ -282,9 +285,9 @@ cdef extern from "pandas/parser/pd_parser.h":
282285
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil
283286

284287
int64_t str_to_int64(char *p_item, int64_t int_min,
285-
int64_t int_max, int *error, char tsep) nogil
288+
int64_t int_max, TokenizerError *error, char tsep) nogil
286289
uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
287-
uint64_t uint_max, int *error, char tsep) nogil
290+
uint64_t uint_max, TokenizerError *error, char tsep) nogil
288291

289292
double xstrtod(const char *p, char **q, char decimal,
290293
char sci, char tsep, int skip_trailing,
@@ -1794,7 +1797,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
17941797
const kh_str_starts_t *na_hashset,
17951798
uint64_t *data, uint_state *state) nogil:
17961799
cdef:
1797-
int error
1800+
TokenizerError error = TOKENIZER_OK
17981801
Py_ssize_t i, lines = line_end - line_start
17991802
coliter_t it
18001803
const char *word = NULL
@@ -1829,7 +1832,8 @@ cdef _try_int64(parser_t *parser, int64_t col,
18291832
int64_t line_start, int64_t line_end,
18301833
bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_float):
18311834
cdef:
1832-
int error, na_count = 0
1835+
TokenizerError error = TOKENIZER_OK
1836+
int na_count = 0
18331837
Py_ssize_t lines
18341838
coliter_t it
18351839
int64_t *data
@@ -1855,13 +1859,13 @@ cdef _try_int64(parser_t *parser, int64_t col,
18551859
return result, na_count
18561860

18571861

1858-
cdef int _try_int64_nogil(parser_t *parser, int64_t col,
1859-
int64_t line_start,
1860-
int64_t line_end, bint na_filter,
1861-
const kh_str_starts_t *na_hashset, int64_t NA,
1862-
int64_t *data, int *na_count) nogil:
1862+
cdef TokenizerError _try_int64_nogil(parser_t *parser, int64_t col,
1863+
int64_t line_start,
1864+
int64_t line_end, bint na_filter,
1865+
const kh_str_starts_t *na_hashset, int64_t NA,
1866+
int64_t *data, int *na_count) nogil:
18631867
cdef:
1864-
int error
1868+
TokenizerError error = TOKENIZER_OK
18651869
Py_ssize_t i, lines = line_end - line_start
18661870
coliter_t it
18671871
const char *word = NULL
@@ -1880,17 +1884,17 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,
18801884

18811885
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
18821886
&error, parser.thousands)
1883-
if error != 0:
1887+
if error != TOKENIZER_OK:
18841888
return error
18851889
else:
18861890
for i in range(lines):
18871891
COLITER_NEXT(it, word)
18881892
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
18891893
&error, parser.thousands)
1890-
if error != 0:
1894+
if error != TOKENIZER_OK:
18911895
return error
18921896

1893-
return 0
1897+
return error
18941898

18951899
cdef _try_pylong(parser_t *parser, Py_ssize_t col,
18961900
int64_t line_start, int64_t line_end,

pandas/_libs/src/parser/tokenizer.c

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1849,7 +1849,7 @@ int uint64_conflict(uint_state *self) {
18491849
* @return TOKENIZER_OK if the remainder of the string contains only digits,
18501850
* otherwise returns the error code for [ERROR_INVALID_CHARS].
18511851
*/
1852-
static inline int check_for_invalid_char(const char *p_item) {
1852+
static inline TokenizerError check_for_invalid_char(const char *p_item) {
18531853
while (*p_item != '\0' && isdigit_ascii(*p_item)) {
18541854
p_item++;
18551855
}
@@ -1859,11 +1859,11 @@ static inline int check_for_invalid_char(const char *p_item) {
18591859
return ERROR_INVALID_CHARS;
18601860
}
18611861

1862-
return 0;
1862+
return TOKENIZER_OK;
18631863
}
18641864

18651865
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
1866-
int *error, char tsep) {
1866+
TokenizerError *error, char tsep) {
18671867
const char *p = p_item;
18681868
// Skip leading spaces.
18691869
while (isspace_ascii(*p)) {
@@ -1990,12 +1990,12 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
19901990
return 0;
19911991
}
19921992

1993-
*error = 0;
1993+
*error = TOKENIZER_OK;
19941994
return number;
19951995
}
19961996

19971997
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
1998-
uint64_t uint_max, int *error, char tsep) {
1998+
uint64_t uint_max, TokenizerError *error, char tsep) {
19991999
const char *p = p_item;
20002000
// Skip leading spaces.
20012001
while (isspace_ascii(*p)) {
@@ -2005,7 +2005,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
20052005
// Handle sign.
20062006
if (*p == '-') {
20072007
state->seen_sint = 1;
2008-
*error = 0;
2008+
*error = TOKENIZER_OK;
20092009
return 0;
20102010
} else if (*p == '+') {
20112011
p++;
@@ -2081,6 +2081,6 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
20812081
state->seen_uint = 1;
20822082
}
20832083

2084-
*error = 0;
2084+
*error = TOKENIZER_OK;
20852085
return number;
20862086
}

0 commit comments

Comments
 (0)