@@ -358,7 +358,7 @@ cdef class TextReader:
358358 int64_t leading_cols, table_width
359359 object delimiter # bytes or str
360360 object converters
361- object na_values
361+ object na_values # dict[hashable, set[str]] | list[str]
362362 list header # list[list[non-negative integers]]
363363 object index_col
364364 object skiprows
@@ -390,8 +390,8 @@ cdef class TextReader:
390390 usecols = None ,
391391 on_bad_lines = ERROR,
392392 bint na_filter = True ,
393- na_values = None ,
394- na_fvalues = None ,
393+ na_values = None , # dict[ hashable , set[str]] | set[str]
394+ na_fvalues = None , # dict[ hashable , set[float]] | set[float]
395395 bint keep_default_na = True ,
396396 true_values = None ,
397397 false_values = None ,
@@ -486,9 +486,17 @@ cdef class TextReader:
486486
487487 self .delimiter = delimiter
488488
489+ # na_fvalues is created from user-provided na_value in _clean_na_values
490+ # which ensures that either
491+ # a) na_values is set[str] and na_fvalues is set[float]
492+ # b) na_values is dict[Hashable, set[str]] and
493+ # na_fvalues is dict[Hashable, set[float]]
494+ # (tests for this case are in test_na_values.py)
495+ if not isinstance (na_values, dict ):
496+ # i.e. it must be a set
497+ na_values = list (na_values)
498+
489499 self .na_values = na_values
490- if na_fvalues is None :
491- na_fvalues = set ()
492500 self .na_fvalues = na_fvalues
493501
494502 self .true_values = _maybe_encode(true_values) + _true_values
@@ -929,7 +937,8 @@ cdef class TextReader:
929937 int nused
930938 kh_str_starts_t *na_hashset = NULL
931939 int64_t start , end
932- object name , na_flist , col_dtype = None
940+ object name , col_dtype = None
941+ set na_fset
933942 bint na_filter = 0
934943 int64_t num_cols
935944 dict results
@@ -1021,18 +1030,15 @@ cdef class TextReader:
10211030 results[i] = _apply_converter(conv, self .parser, i, start, end)
10221031 continue
10231032
1024- # Collect the list of NaN values associated with the column.
1033+ # Collect the set of NaN values associated with the column.
10251034 # If we aren't supposed to do that, or none are collected,
10261035 # we set `na_filter` to `0` (`1` otherwise).
1027- na_flist = set ()
1036+ na_fset = set ()
10281037
10291038 if self .na_filter:
1030- na_list, na_flist = self ._get_na_list(i, name)
1031- if na_list is None :
1032- na_filter = 0
1033- else :
1034- na_filter = 1
1035- na_hashset = kset_from_list(na_list)
1039+ na_list, na_fset = self ._get_na_list(i, name)
1040+ na_filter = 1
1041+ na_hashset = kset_from_list(na_list)
10361042 else :
10371043 na_filter = 0
10381044
@@ -1041,7 +1047,7 @@ cdef class TextReader:
10411047 try :
10421048 col_res, na_count = self ._convert_tokens(
10431049 i, start, end, name, na_filter, na_hashset,
1044- na_flist , col_dtype)
1050+ na_fset , col_dtype)
10451051 finally :
10461052 # gh-21353
10471053 #
@@ -1075,12 +1081,12 @@ cdef class TextReader:
10751081 cdef _convert_tokens(self , Py_ssize_t i, int64_t start,
10761082 int64_t end, object name, bint na_filter,
10771083 kh_str_starts_t * na_hashset,
1078- object na_flist , object col_dtype):
1084+ set na_fset , object col_dtype):
10791085
10801086 if col_dtype is not None :
10811087 col_res, na_count = self ._convert_with_dtype(
10821088 col_dtype, i, start, end, na_filter,
1083- 1 , na_hashset, na_flist )
1089+ 1 , na_hashset, na_fset )
10841090
10851091 # Fallback on the parse (e.g. we requested int dtype,
10861092 # but its actually a float).
@@ -1094,19 +1100,19 @@ cdef class TextReader:
10941100 for dt in self .dtype_cast_order:
10951101 try :
10961102 col_res, na_count = self ._convert_with_dtype(
1097- dt, i, start, end, na_filter, 0 , na_hashset, na_flist )
1103+ dt, i, start, end, na_filter, 0 , na_hashset, na_fset )
10981104 except ValueError :
10991105 # This error is raised from trying to convert to uint64,
11001106 # and we discover that we cannot convert to any numerical
11011107 # dtype successfully. As a result, we leave the data
11021108 # column AS IS with object dtype.
11031109 col_res, na_count = self ._convert_with_dtype(
11041110 np.dtype(" object" ), i, start, end, 0 ,
1105- 0 , na_hashset, na_flist )
1111+ 0 , na_hashset, na_fset )
11061112 except OverflowError :
11071113 col_res, na_count = self ._convert_with_dtype(
11081114 np.dtype(" object" ), i, start, end, na_filter,
1109- 0 , na_hashset, na_flist )
1115+ 0 , na_hashset, na_fset )
11101116
11111117 if col_res is not None :
11121118 break
@@ -1154,7 +1160,7 @@ cdef class TextReader:
11541160 bint na_filter,
11551161 bint user_dtype,
11561162 kh_str_starts_t * na_hashset,
1157- object na_flist ):
1163+ set na_fset ):
11581164 if isinstance (dtype, CategoricalDtype):
11591165 # TODO: I suspect that _categorical_convert could be
11601166 # optimized when dtype is an instance of CategoricalDtype
@@ -1212,7 +1218,7 @@ cdef class TextReader:
12121218
12131219 elif dtype.kind == " f" :
12141220 result, na_count = _try_double(self .parser, i, start, end,
1215- na_filter, na_hashset, na_flist )
1221+ na_filter, na_hashset, na_fset )
12161222
12171223 if result is not None and dtype != " float64" :
12181224 result = result.astype(dtype)
@@ -1272,10 +1278,6 @@ cdef class TextReader:
12721278 return self .converters.get(i)
12731279
12741280 cdef _get_na_list(self , Py_ssize_t i, name):
1275- # Note: updates self.na_values, self.na_fvalues
1276- if self .na_values is None :
1277- return None , set ()
1278-
12791281 if isinstance (self .na_values, dict ):
12801282 key = None
12811283 values = None
@@ -1300,11 +1302,6 @@ cdef class TextReader:
13001302
13011303 return _ensure_encoded(values), fvalues
13021304 else :
1303- if not isinstance (self .na_values, list ):
1304- self .na_values = list (self .na_values)
1305- if not isinstance (self .na_fvalues, set ):
1306- self .na_fvalues = set (self .na_fvalues)
1307-
13081305 return _ensure_encoded(self .na_values), self .na_fvalues
13091306
13101307 cdef _free_na_set(self , kh_str_starts_t * table):
@@ -1622,27 +1619,27 @@ cdef:
16221619# -> tuple[ndarray[float64_t], int] | tuple[None, None]
16231620cdef _try_double(parser_t * parser, int64_t col,
16241621 int64_t line_start, int64_t line_end,
1625- bint na_filter, kh_str_starts_t * na_hashset, object na_flist ):
1622+ bint na_filter, kh_str_starts_t * na_hashset, set na_fset ):
16261623 cdef:
16271624 int error, na_count = 0
16281625 Py_ssize_t lines
16291626 float64_t * data
16301627 float64_t NA = na_values[np.float64]
1631- kh_float64_t * na_fset
1628+ kh_float64_t * na_fhashset
16321629 ndarray[float64_t] result
1633- bint use_na_flist = len (na_flist ) > 0
1630+ bint use_na_flist = len (na_fset ) > 0
16341631
16351632 lines = line_end - line_start
16361633 result = np.empty(lines, dtype = np.float64)
16371634 data = < float64_t * > result.data
1638- na_fset = kset_float64_from_list(na_flist )
1635+ na_fhashset = kset_float64_from_set(na_fset )
16391636 with nogil:
16401637 error = _try_double_nogil(parser, parser.double_converter,
16411638 col, line_start, line_end,
16421639 na_filter, na_hashset, use_na_flist,
1643- na_fset , NA, data, & na_count)
1640+ na_fhashset , NA, data, & na_count)
16441641
1645- kh_destroy_float64(na_fset )
1642+ kh_destroy_float64(na_fhashset )
16461643 if error != 0 :
16471644 return None , None
16481645 return result, na_count
@@ -1655,7 +1652,7 @@ cdef int _try_double_nogil(parser_t *parser,
16551652 int64_t col, int64_t line_start, int64_t line_end,
16561653 bint na_filter, kh_str_starts_t * na_hashset,
16571654 bint use_na_flist,
1658- const kh_float64_t * na_flist ,
1655+ const kh_float64_t * na_fhashset ,
16591656 float64_t NA, float64_t * data,
16601657 int * na_count) nogil:
16611658 cdef:
@@ -1694,8 +1691,8 @@ cdef int _try_double_nogil(parser_t *parser,
16941691 else :
16951692 return 1
16961693 if use_na_flist:
1697- k64 = kh_get_float64(na_flist , data[0 ])
1698- if k64 != na_flist .n_buckets:
1694+ k64 = kh_get_float64(na_fhashset , data[0 ])
1695+ if k64 != na_fhashset .n_buckets:
16991696 na_count[0 ] += 1
17001697 data[0 ] = NA
17011698 data += 1
@@ -1977,7 +1974,7 @@ cdef kh_str_starts_t* kset_from_list(list values) except NULL:
19771974 return table
19781975
19791976
1980- cdef kh_float64_t* kset_float64_from_list( values) except NULL :
1977+ cdef kh_float64_t* kset_float64_from_set( set values) except NULL :
19811978 # caller takes responsibility for freeing the hash table
19821979 cdef:
19831980 kh_float64_t * table
0 commit comments