-
-
Notifications
You must be signed in to change notification settings - Fork 19.1k
BUG: make `read_csv` be able to read large floating numbers into float
#62542
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1b6e3a2
8908d84
2686655
4aa0ff1
706503f
544a257
62b49f9
6c03279
6aa8398
44b68bc
e16c71b
fc01097
93c94b5
380e6ec
6b4cedb
b3519c1
2e0af7a
0973086
c7ddf17
a7ab941
8b57401
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1070,6 +1070,10 @@ cdef class TextReader: | |
else: | ||
col_res = None | ||
for dt in self.dtype_cast_order: | ||
if (dt.kind in "iu" and | ||
self._column_has_float(i, start, end, na_filter, na_hashset)): | ||
continue | ||
|
||
try: | ||
col_res, na_count = self._convert_with_dtype( | ||
dt, i, start, end, na_filter, 0, na_hashset, na_fset) | ||
|
@@ -1347,6 +1351,58 @@ cdef class TextReader: | |
else: | ||
return None | ||
|
||
cdef bint _column_has_float(self, Py_ssize_t col, | ||
int64_t start, int64_t end, | ||
bint na_filter, kh_str_starts_t *na_hashset): | ||
"""Check if the column contains any float number.""" | ||
cdef: | ||
Py_ssize_t i, j, lines = end - start | ||
coliter_t it | ||
const char *word = NULL | ||
const char *ignored_chars = " +-" | ||
const char *digits = "0123456789" | ||
const char *float_indicating_chars = "eE" | ||
char null_byte = 0 | ||
|
||
coliter_setup(&it, self.parser, col, start) | ||
|
||
for i in range(lines): | ||
COLITER_NEXT(it, word) | ||
|
||
if na_filter and kh_get_str_starts_item(na_hashset, word): | ||
continue | ||
|
||
found_first_digit = False | ||
j = 0 | ||
while word[j] != null_byte: | ||
if word[j] == self.parser.decimal: | ||
return True | ||
elif not found_first_digit and word[j] in ignored_chars: | ||
# no-op | ||
pass | ||
elif not found_first_digit and word[j] not in digits: | ||
# word isn't numeric | ||
return False | ||
elif not found_first_digit and word[j] in digits: | ||
[Review comment] `<ctype.h>` provides an [identifier lost in extraction; presumably `isdigit`]. [Reply] There's also [identifier lost in extraction]. |
||
found_first_digit = True | ||
elif word[j] in float_indicating_chars: | ||
# preceding chars indicates numeric and | ||
# current char indicates float | ||
return True | ||
elif word[j] not in digits: | ||
# previous characters indicates numeric | ||
# current character shows otherwise | ||
return False | ||
elif word[j] in digits: | ||
# no-op | ||
pass | ||
else: | ||
raise AssertionError( | ||
f"Unhandled case {word[j]=} {found_first_digit=}" | ||
) | ||
j += 1 | ||
|
||
return False | ||
|
||
# Factor out code common to TextReader.__dealloc__ and TextReader.close | ||
# It cannot be a class method, since calling self.close() in __dealloc__ | ||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Had to use `= 0` instead of `= '\0'` because cython-lint doesn't let me use single quotes, and it doesn't compile with double quotes.