Skip to content

Commit 3541537

Browse files
committed
Move some set parse_date_cols to python parser
1 parent f3ad4d5 commit 3541537

File tree

3 files changed

+145
-137
lines changed

3 files changed

+145
-137
lines changed

pandas/io/parsers/base_parser.py

Lines changed: 49 additions & 135 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,12 @@
3232
from pandas.core.dtypes.common import (
3333
is_bool_dtype,
3434
is_dict_like,
35-
is_extension_array_dtype,
3635
is_float_dtype,
3736
is_integer,
3837
is_integer_dtype,
3938
is_list_like,
4039
is_object_dtype,
4140
is_string_dtype,
42-
pandas_dtype,
4341
)
4442
from pandas.core.dtypes.dtypes import (
4543
CategoricalDtype,
@@ -127,7 +125,6 @@ def __init__(self, kwds) -> None:
127125
"for the 'parse_dates' parameter"
128126
)
129127
self.parse_dates: bool | list = parse_dates
130-
self._parse_date_cols: set = set()
131128
self.date_parser = kwds.pop("date_parser", lib.no_default)
132129
self.date_format = kwds.pop("date_format", None)
133130
self.dayfirst = kwds.pop("dayfirst", False)
@@ -187,52 +184,6 @@ def __init__(self, kwds) -> None:
187184
# Normally, this arg would get pre-processed earlier on
188185
self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)
189186

190-
def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> set:
191-
"""
192-
Check if parse_dates are in columns.
193-
194-
If user has provided names for parse_dates, check if those columns
195-
are available.
196-
197-
Parameters
198-
----------
199-
columns : list
200-
List of names of the dataframe.
201-
202-
Returns
203-
-------
204-
The names of the columns which will get parsed later if a list
205-
is given as specification.
206-
207-
Raises
208-
------
209-
ValueError
210-
If column to parse_date is not in dataframe.
211-
212-
"""
213-
if not isinstance(self.parse_dates, list):
214-
return set()
215-
216-
# get only columns that are references using names (str), not by index
217-
missing_cols = ", ".join(
218-
sorted(
219-
{
220-
col
221-
for col in self.parse_dates
222-
if isinstance(col, str) and col not in columns
223-
}
224-
)
225-
)
226-
if missing_cols:
227-
raise ValueError(
228-
f"Missing column provided to 'parse_dates': '{missing_cols}'"
229-
)
230-
# Convert positions to actual column names
231-
return {
232-
col if (isinstance(col, str) or col in columns) else columns[col]
233-
for col in self.parse_dates
234-
}
235-
236187
def close(self) -> None:
237188
pass
238189

@@ -420,7 +371,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
420371
assert self.index_names is not None
421372
col_name = self.index_names[i]
422373
if col_name is not None:
423-
col_na_values, col_na_fvalues = _get_na_values(
374+
col_na_values, col_na_fvalues = get_na_values(
424375
col_name, self.na_values, self.na_fvalues, self.keep_default_na
425376
)
426377
else:
@@ -451,90 +402,6 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
451402

452403
return index
453404

454-
@final
455-
def _convert_to_ndarrays(
456-
self,
457-
dct: Mapping,
458-
na_values,
459-
na_fvalues,
460-
converters=None,
461-
dtypes=None,
462-
) -> dict[Any, np.ndarray]:
463-
result = {}
464-
for c, values in dct.items():
465-
conv_f = None if converters is None else converters.get(c, None)
466-
if isinstance(dtypes, dict):
467-
cast_type = dtypes.get(c, None)
468-
else:
469-
# single dtype or None
470-
cast_type = dtypes
471-
472-
if self.na_filter:
473-
col_na_values, col_na_fvalues = _get_na_values(
474-
c, na_values, na_fvalues, self.keep_default_na
475-
)
476-
else:
477-
col_na_values, col_na_fvalues = set(), set()
478-
479-
if c in self._parse_date_cols:
480-
# GH#26203 Do not convert columns which get converted to dates
481-
# but replace nans to ensure to_datetime works
482-
mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
483-
np.putmask(values, mask, np.nan)
484-
result[c] = values
485-
continue
486-
487-
if conv_f is not None:
488-
# conv_f applied to data before inference
489-
if cast_type is not None:
490-
warnings.warn(
491-
(
492-
"Both a converter and dtype were specified "
493-
f"for column {c} - only the converter will be used."
494-
),
495-
ParserWarning,
496-
stacklevel=find_stack_level(),
497-
)
498-
499-
try:
500-
values = lib.map_infer(values, conv_f)
501-
except ValueError:
502-
mask = algorithms.isin(values, list(na_values)).view(np.uint8)
503-
values = lib.map_infer_mask(values, conv_f, mask)
504-
505-
cvals, na_count = self._infer_types(
506-
values,
507-
set(col_na_values) | col_na_fvalues,
508-
cast_type is None,
509-
try_num_bool=False,
510-
)
511-
else:
512-
is_ea = is_extension_array_dtype(cast_type)
513-
is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
514-
# skip inference if specified dtype is object
515-
# or casting to an EA
516-
try_num_bool = not (cast_type and is_str_or_ea_dtype)
517-
518-
# general type inference and conversion
519-
cvals, na_count = self._infer_types(
520-
values,
521-
set(col_na_values) | col_na_fvalues,
522-
cast_type is None,
523-
try_num_bool,
524-
)
525-
526-
# type specified in dtype param or cast_type is an EA
527-
if cast_type is not None:
528-
cast_type = pandas_dtype(cast_type)
529-
if cast_type and (cvals.dtype != cast_type or is_ea):
530-
if not is_ea and na_count > 0:
531-
if is_bool_dtype(cast_type):
532-
raise ValueError(f"Bool column has NA values in column {c}")
533-
cvals = self._cast_types(cvals, cast_type, c)
534-
535-
result[c] = cvals
536-
return result
537-
538405
@final
539406
def _set_noconvert_dtype_columns(
540407
self, col_indices: list[int], names: Sequence[Hashable]
@@ -580,6 +447,7 @@ def _set(x) -> int:
580447
return x
581448

582449
if isinstance(self.parse_dates, list):
450+
validate_parse_dates_presence(self.parse_dates, names)
583451
for val in self.parse_dates:
584452
noconvert_columns.add(_set(val))
585453

@@ -1154,7 +1022,7 @@ def _process_date_conversion(
11541022
return data_dict
11551023

11561024

1157-
def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
1025+
def get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
11581026
"""
11591027
Get the NaN values for a given column.
11601028
@@ -1191,3 +1059,49 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
11911059

11921060
def is_index_col(col) -> bool:
11931061
return col is not None and col is not False
1062+
1063+
1064+
def validate_parse_dates_presence(
    parse_dates: bool | list, columns: Sequence[Hashable]
) -> set:
    """
    Check if parse_dates are in columns.

    If user has provided names for parse_dates, check if those columns
    are available.

    Parameters
    ----------
    parse_dates : bool or list
        The user-supplied ``parse_dates`` argument; anything that is not a
        list (e.g. ``True``/``False``) means there is nothing to validate.
    columns : list
        List of names of the dataframe.

    Returns
    -------
    The names of the columns which will get parsed later if a list
    is given as specification.

    Raises
    ------
    ValueError
        If column to parse_date is not in dataframe.
    """
    if not isinstance(parse_dates, list):
        return set()

    missing = set()
    unique_cols = set()
    for col in parse_dates:
        if isinstance(col, str):
            if col not in columns:
                missing.add(col)
            else:
                unique_cols.add(col)
        elif col in columns:
            # non-string label (e.g. an integer column name) present as-is
            unique_cols.add(col)
        else:
            # positional reference: resolve the index to the actual column name
            unique_cols.add(columns[col])
    if missing:
        missing_cols = ", ".join(sorted(missing))
        raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'")
    return unique_cols

pandas/io/parsers/c_parser_wrapper.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
ParserBase,
3232
ParserError,
3333
is_index_col,
34+
validate_parse_dates_presence,
3435
)
3536

3637
if TYPE_CHECKING:
@@ -160,7 +161,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
160161
)
161162

162163
# error: Cannot determine type of 'names'
163-
self._validate_parse_dates_presence(self.names) # type: ignore[has-type]
164+
validate_parse_dates_presence(self.parse_dates, self.names) # type: ignore[has-type]
164165
self._set_noconvert_columns()
165166

166167
# error: Cannot determine type of 'names'

pandas/io/parsers/python_parser.py

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@
1010
from typing import (
1111
IO,
1212
TYPE_CHECKING,
13+
Any,
1314
DefaultDict,
1415
Literal,
1516
cast,
17+
final,
1618
)
1719
import warnings
1820

@@ -29,18 +31,25 @@
2931

3032
from pandas.core.dtypes.common import (
3133
is_bool_dtype,
34+
is_extension_array_dtype,
3235
is_integer,
3336
is_numeric_dtype,
37+
is_string_dtype,
38+
pandas_dtype,
3439
)
3540
from pandas.core.dtypes.inference import is_dict_like
3641

42+
from pandas.core import algorithms
43+
3744
from pandas.io.common import (
3845
dedup_names,
3946
is_potential_multi_index,
4047
)
4148
from pandas.io.parsers.base_parser import (
4249
ParserBase,
50+
get_na_values,
4351
parser_defaults,
52+
validate_parse_dates_presence,
4453
)
4554

4655
if TYPE_CHECKING:
@@ -157,7 +166,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
157166
if self._col_indices is None:
158167
self._col_indices = list(range(len(self.columns)))
159168

160-
self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
161169
self._no_thousands_columns = self._set_no_thousand_columns()
162170

163171
if len(self.decimal) != 1:
@@ -370,6 +378,91 @@ def _convert_data(
370378
clean_dtypes,
371379
)
372380

381+
@final
def _convert_to_ndarrays(
    self,
    dct: Mapping,
    na_values,
    na_fvalues,
    converters=None,
    dtypes=None,
) -> dict[Any, np.ndarray]:
    """
    Convert raw column data in ``dct`` to ndarrays, applying per-column
    converters, NA handling, type inference and dtype casting.
    """
    parse_date_cols = validate_parse_dates_presence(self.parse_dates, self.columns)
    result = {}
    for col, values in dct.items():
        converter = None if converters is None else converters.get(col, None)
        # Per-column dtype when a mapping was given, otherwise a single
        # dtype (or None) applies to every column.
        cast_type = dtypes.get(col, None) if isinstance(dtypes, dict) else dtypes

        if self.na_filter:
            col_na_values, col_na_fvalues = get_na_values(
                col, na_values, na_fvalues, self.keep_default_na
            )
        else:
            col_na_values, col_na_fvalues = set(), set()
        na_set = set(col_na_values) | col_na_fvalues

        if col in parse_date_cols:
            # GH#26203 Do not convert columns which get converted to dates
            # but replace nans to ensure to_datetime works
            mask = algorithms.isin(values, na_set)
            np.putmask(values, mask, np.nan)
            result[col] = values
            continue

        if converter is not None:
            # converter applied to data before inference
            if cast_type is not None:
                warnings.warn(
                    (
                        "Both a converter and dtype were specified "
                        f"for column {col} - only the converter will be used."
                    ),
                    ParserWarning,
                    stacklevel=find_stack_level(),
                )

            try:
                values = lib.map_infer(values, converter)
            except ValueError:
                mask = algorithms.isin(values, list(na_values)).view(np.uint8)
                values = lib.map_infer_mask(values, converter, mask)

            cvals, na_count = self._infer_types(
                values,
                na_set,
                cast_type is None,
                try_num_bool=False,
            )
        else:
            is_ea = is_extension_array_dtype(cast_type)
            is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
            # skip inference if specified dtype is object
            # or casting to an EA
            try_num_bool = not (cast_type and is_str_or_ea_dtype)

            # general type inference and conversion
            cvals, na_count = self._infer_types(
                values,
                na_set,
                cast_type is None,
                try_num_bool,
            )

            # type specified in dtype param or cast_type is an EA
            if cast_type is not None:
                cast_type = pandas_dtype(cast_type)
            if cast_type and (cvals.dtype != cast_type or is_ea):
                if not is_ea and na_count > 0:
                    if is_bool_dtype(cast_type):
                        raise ValueError(f"Bool column has NA values in column {col}")
                cvals = self._cast_types(cvals, cast_type, col)

        result[col] = cvals
    return result
465+
373466
@cache_readonly
374467
def _have_mi_columns(self) -> bool:
375468
if self.header is None:

0 commit comments

Comments
 (0)