Skip to content
18 changes: 10 additions & 8 deletions pandas/_libs/include/pandas/parser/pd_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ typedef struct {
int (*parser_trim_buffers)(parser_t *);
int (*tokenize_all_rows)(parser_t *, const char *);
int (*tokenize_nrows)(parser_t *, size_t, const char *);
int64_t (*str_to_int64)(const char *, int64_t, int64_t, int *, char);
uint64_t (*str_to_uint64)(uint_state *, const char *, int64_t, uint64_t,
int64_t (*str_to_int64)(const char *, char, int64_t, int64_t, int *, char);
uint64_t (*str_to_uint64)(uint_state *, const char *, char, int64_t, uint64_t,
int *, char);
double (*xstrtod)(const char *, char **, char, char, char, int, int *, int *);
double (*precise_xstrtod)(const char *, char **, char, char, char, int, int *,
Expand Down Expand Up @@ -87,12 +87,14 @@ static PandasParser_CAPI *PandasParserAPI = NULL;
PandasParserAPI->tokenize_all_rows((self), (encoding_errors))
/* Call-through macros: each one forwards to the function pointer of the same
 * name in the PandasParserAPI table, parenthesising every argument. */
#define tokenize_nrows(self, nrows, encoding_errors) \
PandasParserAPI->tokenize_nrows((self), (nrows), (encoding_errors))
/* NOTE(review): str_to_int64 and str_to_uint64 are each defined twice in this
 * view, first without and then with a decimal_separator argument. This looks
 * like the removed and added sides of a diff; confirm that only one pair
 * exists in the real header. */
#define str_to_int64(p_item, int_min, int_max, error, t_sep) \
PandasParserAPI->str_to_int64((p_item), (int_min), (int_max), (error), \
(t_sep))
#define str_to_uint64(state, p_item, int_max, uint_max, error, t_sep) \
PandasParserAPI->str_to_uint64((state), (p_item), (int_max), (uint_max), \
(error), (t_sep))
/* Variants that additionally forward decimal_separator, placed directly after
 * p_item to match the function-pointer signatures that take an extra char in
 * that position. */
#define str_to_int64(p_item, decimal_separator, int_min, int_max, error, \
t_sep) \
PandasParserAPI->str_to_int64((p_item), (decimal_separator), (int_min), \
(int_max), (error), (t_sep))
#define str_to_uint64(state, p_item, decimal_separator, int_max, uint_max, \
error, t_sep) \
PandasParserAPI->str_to_uint64((state), (p_item), (decimal_separator), \
(int_max), (uint_max), (error), (t_sep))
/* Forwards all eight arguments unchanged to PandasParserAPI->xstrtod. */
#define xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int) \
PandasParserAPI->xstrtod((p), (q), (decimal), (sci), (tsep), \
(skip_trailing), (error), (maybe_int))
Expand Down
8 changes: 5 additions & 3 deletions pandas/_libs/include/pandas/parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ See LICENSE for the license
/* Non-zero error codes. The parsers.pyx callers pass an `int *error` to
 * str_to_int64 / str_to_uint64, treat 0 as success, and compare the result
 * against ERROR_OVERFLOW and ERROR_IS_FLOAT.
 * NOTE(review): the exact conditions that produce each code live in the
 * tokenizer implementation, which is not visible here; confirm there. */
#define ERROR_NO_DIGITS 1
#define ERROR_OVERFLOW 2
#define ERROR_INVALID_CHARS 3
#define ERROR_IS_FLOAT 4

#include <stdint.h>

Expand Down Expand Up @@ -208,10 +209,11 @@ void uint_state_init(uint_state *self);

int uint64_conflict(uint_state *self);

uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
uint64_t str_to_uint64(uint_state *state, const char *p_item,
char decimal_separator, int64_t int_max,
uint64_t uint_max, int *error, char tsep);
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep);
int64_t str_to_int64(const char *p_item, char decimal_separator,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should add this argument to the int conversion functions - that repurposes these functions in a way that's not really clear.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed it. Additionally, I am no longer checking explicitly if it's a float, just assigning the error code for invalid char.

int64_t int_min, int64_t int_max, int *error, char tsep);
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
int skip_trailing, int *error, int *maybe_int);
double precise_xstrtod(const char *p, char **q, char decimal, char sci,
Expand Down
123 changes: 41 additions & 82 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h":
SKIP_LINE
FINISHED

enum: ERROR_OVERFLOW
enum: ERROR_OVERFLOW, ERROR_IS_FLOAT

ctypedef enum BadLineHandleMethod:
ERROR,
Expand Down Expand Up @@ -281,10 +281,11 @@ cdef extern from "pandas/parser/pd_parser.h":
int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil

int64_t str_to_int64(char *p_item, int64_t int_min,
int64_t str_to_int64(char *p_item, char decimal_separator, int64_t int_min,
int64_t int_max, int *error, char tsep) nogil
uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep) nogil
uint64_t str_to_uint64(uint_state *state, char *p_item, char decimal_separator,
int64_t int_max, uint64_t uint_max,
int *error, char tsep) nogil

double xstrtod(const char *p, char **q, char decimal,
char sci, char tsep, int skip_trailing,
Expand Down Expand Up @@ -1058,7 +1059,7 @@ cdef class TextReader:
if col_dtype is not None:
col_res, na_count = self._convert_with_dtype(
col_dtype, i, start, end, na_filter,
1, na_hashset, na_fset)
1, na_hashset, na_fset, False)

# Fallback on the parse (e.g. we requested int dtype,
# but its actually a float).
Expand All @@ -1069,30 +1070,34 @@ cdef class TextReader:
return self._string_convert(i, start, end, na_filter, na_hashset)
else:
col_res = None
maybe_int = True
for dt in self.dtype_cast_order:
if (dt.kind in "iu" and
self._column_has_float(i, start, end, na_filter, na_hashset)):
if not maybe_int and dt.kind in "iu":
continue

try:
col_res, na_count = self._convert_with_dtype(
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
except ValueError:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
except ValueError as e:
if str(e) == "Number is float":
maybe_int = False
continue
else:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset, False)
except OverflowError:
try:
col_res, na_count = _try_pylong(self.parser, i, start,
end, na_filter, na_hashset)
except ValueError:
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
0, na_hashset, na_fset, False)

if col_res is not None:
break
Expand Down Expand Up @@ -1140,7 +1145,7 @@ cdef class TextReader:
bint na_filter,
bint user_dtype,
kh_str_starts_t *na_hashset,
set na_fset):
set na_fset, bint raise_on_float):
if isinstance(dtype, CategoricalDtype):
# TODO: I suspect that _categorical_convert could be
# optimized when dtype is an instance of CategoricalDtype
Expand Down Expand Up @@ -1181,14 +1186,14 @@ cdef class TextReader:

elif dtype.kind in "iu":
try:
result, na_count = _try_int64(self.parser, i, start,
end, na_filter, na_hashset)
result, na_count = _try_int64(self.parser, i, start, end,
na_filter, na_hashset, raise_on_float)
if user_dtype and na_count is not None:
if na_count > 0:
raise ValueError(f"Integer column has NA values in column {i}")
except OverflowError:
result = _try_uint64(self.parser, i, start, end,
na_filter, na_hashset)
na_filter, na_hashset, raise_on_float)
na_count = 0

if result is not None and dtype != "int64":
Expand Down Expand Up @@ -1351,59 +1356,6 @@ cdef class TextReader:
else:
return None

cdef bint _column_has_float(self, Py_ssize_t col,
int64_t start, int64_t end,
bint na_filter, kh_str_starts_t *na_hashset):
"""
Scan the words of one column and report whether a float-looking word is found.

Words are visited in order for ``end - start`` rows beginning at ``start``.
The scan stops at the first word that decides the answer:

* returns True as soon as a character equal to ``self.parser.decimal`` is
  seen, or an ``e``/``E`` is seen after at least one digit;
* returns False as soon as any other non-digit character is seen (apart
  from leading spaces, ``+`` and ``-``), which means later words are not
  inspected;
* returns False if every word consists only of digits and ignored
  leading characters.

When ``na_filter`` is set, words found in ``na_hashset`` are skipped.
"""
cdef:
Py_ssize_t i, j, lines = end - start
coliter_t it
const char *word = NULL
const char *ignored_chars = " +-"
const char *digits = "0123456789"
const char *float_indicating_chars = "eE"
char null_byte = 0

coliter_setup(&it, self.parser, col, start)

for i in range(lines):
COLITER_NEXT(it, word)

# NA markers say nothing about the numeric type of the column.
if na_filter and kh_get_str_starts_item(na_hashset, word):
continue

found_first_digit = False
j = 0
# Walk the word one character at a time up to its terminating null byte.
while word[j] != null_byte:
# The decimal check comes first and is unconditional, so it fires
# even when no digit has been seen yet.
if word[j] == self.parser.decimal:
return True
elif not found_first_digit and word[j] in ignored_chars:
# no-op
pass
elif not found_first_digit and word[j] not in digits:
# word isn't numeric
return False
elif not found_first_digit and word[j] in digits:
found_first_digit = True
elif word[j] in float_indicating_chars:
# preceding chars indicates numeric and
# current char indicates float
return True
elif word[j] not in digits:
# previous characters indicates numeric
# current character shows otherwise
return False
elif word[j] in digits:
# no-op
pass
else:
# Defensive guard: the branches above appear to cover every
# combination of character class and found_first_digit.
raise AssertionError(
f"Unhandled case {word[j]=} {found_first_digit=}"
)
j += 1

# No word contained a decimal separator or an exponent marker.
return False

# Factor out code common to TextReader.__dealloc__ and TextReader.close
# It cannot be a class method, since calling self.close() in __dealloc__
# which causes a class attribute lookup and violates best practices
Expand Down Expand Up @@ -1800,7 +1752,8 @@ cdef int _try_double_nogil(parser_t *parser,

cdef _try_uint64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset,
bint raise_on_float):
cdef:
int error
Py_ssize_t lines
Expand All @@ -1822,7 +1775,10 @@ cdef _try_uint64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None
elif raise_on_float and error == ERROR_IS_FLOAT:
raise ValueError("Number is float")
elif not raise_on_float or error != ERROR_IS_FLOAT:
return None

if uint64_conflict(&state):
raise ValueError("Cannot convert to numerical dtype")
Expand Down Expand Up @@ -1855,14 +1811,14 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
data[i] = 0
continue

data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
data[i] = str_to_uint64(state, word, parser.decimal, INT64_MAX, UINT64_MAX,
&error, parser.thousands)
if error != 0:
return error
else:
for i in range(lines):
COLITER_NEXT(it, word)
data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
data[i] = str_to_uint64(state, word, parser.decimal, INT64_MAX, UINT64_MAX,
&error, parser.thousands)
if error != 0:
return error
Expand All @@ -1872,7 +1828,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,

cdef _try_int64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_float):
cdef:
int error, na_count = 0
Py_ssize_t lines
Expand All @@ -1892,7 +1848,10 @@ cdef _try_int64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None, None
elif raise_on_float and error == ERROR_IS_FLOAT:
raise ValueError("Number is float")
elif not raise_on_float or error != ERROR_IS_FLOAT:
return None, None

return result, na_count

Expand Down Expand Up @@ -1920,14 +1879,14 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,
data[i] = NA
continue

data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
data[i] = str_to_int64(word, parser.decimal, INT64_MIN, INT64_MAX,
&error, parser.thousands)
if error != 0:
return error
else:
for i in range(lines):
COLITER_NEXT(it, word)
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
data[i] = str_to_int64(word, parser.decimal, INT64_MIN, INT64_MAX,
&error, parser.thousands)
if error != 0:
return error
Expand Down
Loading
Loading