|
32 | 32 | from pandas.core.dtypes.common import (
|
33 | 33 | is_bool_dtype,
|
34 | 34 | is_dict_like,
|
35 |
| - is_extension_array_dtype, |
36 | 35 | is_float_dtype,
|
37 | 36 | is_integer,
|
38 | 37 | is_integer_dtype,
|
39 | 38 | is_list_like,
|
40 | 39 | is_object_dtype,
|
41 | 40 | is_string_dtype,
|
42 |
| - pandas_dtype, |
43 | 41 | )
|
44 | 42 | from pandas.core.dtypes.dtypes import (
|
45 | 43 | CategoricalDtype,
|
@@ -127,7 +125,6 @@ def __init__(self, kwds) -> None:
|
127 | 125 | "for the 'parse_dates' parameter"
|
128 | 126 | )
|
129 | 127 | self.parse_dates: bool | list = parse_dates
|
130 |
| - self._parse_date_cols: set = set() |
131 | 128 | self.date_parser = kwds.pop("date_parser", lib.no_default)
|
132 | 129 | self.date_format = kwds.pop("date_format", None)
|
133 | 130 | self.dayfirst = kwds.pop("dayfirst", False)
|
@@ -187,52 +184,6 @@ def __init__(self, kwds) -> None:
|
187 | 184 | # Normally, this arg would get pre-processed earlier on
|
188 | 185 | self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)
|
189 | 186 |
|
190 |
| - def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> set: |
191 |
| - """ |
192 |
| - Check if parse_dates are in columns. |
193 |
| -
|
194 |
| - If user has provided names for parse_dates, check if those columns |
195 |
| - are available. |
196 |
| -
|
197 |
| - Parameters |
198 |
| - ---------- |
199 |
| - columns : list |
200 |
| - List of names of the dataframe. |
201 |
| -
|
202 |
| - Returns |
203 |
| - ------- |
204 |
| - The names of the columns which will get parsed later if a list |
205 |
| - is given as specification. |
206 |
| -
|
207 |
| - Raises |
208 |
| - ------ |
209 |
| - ValueError |
210 |
| - If column to parse_date is not in dataframe. |
211 |
| -
|
212 |
| - """ |
213 |
| - if not isinstance(self.parse_dates, list): |
214 |
| - return set() |
215 |
| - |
216 |
| - # get only columns that are references using names (str), not by index |
217 |
| - missing_cols = ", ".join( |
218 |
| - sorted( |
219 |
| - { |
220 |
| - col |
221 |
| - for col in self.parse_dates |
222 |
| - if isinstance(col, str) and col not in columns |
223 |
| - } |
224 |
| - ) |
225 |
| - ) |
226 |
| - if missing_cols: |
227 |
| - raise ValueError( |
228 |
| - f"Missing column provided to 'parse_dates': '{missing_cols}'" |
229 |
| - ) |
230 |
| - # Convert positions to actual column names |
231 |
| - return { |
232 |
| - col if (isinstance(col, str) or col in columns) else columns[col] |
233 |
| - for col in self.parse_dates |
234 |
| - } |
235 |
| - |
236 | 187 | def close(self) -> None:
|
237 | 188 | pass
|
238 | 189 |
|
@@ -420,7 +371,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
|
420 | 371 | assert self.index_names is not None
|
421 | 372 | col_name = self.index_names[i]
|
422 | 373 | if col_name is not None:
|
423 |
| - col_na_values, col_na_fvalues = _get_na_values( |
| 374 | + col_na_values, col_na_fvalues = get_na_values( |
424 | 375 | col_name, self.na_values, self.na_fvalues, self.keep_default_na
|
425 | 376 | )
|
426 | 377 | else:
|
@@ -451,90 +402,6 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
|
451 | 402 |
|
452 | 403 | return index
|
453 | 404 |
|
454 |
@final
def _convert_to_ndarrays(
    self,
    dct: Mapping,
    na_values,
    na_fvalues,
    converters=None,
    dtypes=None,
) -> dict[Any, np.ndarray]:
    """
    Convert each raw column in *dct* to an ndarray, applying per-column
    converters, NA masking, type inference, and dtype casting.

    Parameters
    ----------
    dct : Mapping
        Mapping of column identifier -> raw values for that column.
    na_values, na_fvalues
        Per-column (or global) NA string values and NA float values;
        resolved per column via ``_get_na_values``.
    converters : dict, optional
        Mapping of column identifier -> callable applied element-wise
        before type inference.
    dtypes : dict or dtype-like, optional
        Either a per-column mapping or a single dtype applied to all
        columns.

    Returns
    -------
    dict[Any, np.ndarray]
        Mapping of column identifier -> converted values.

    Raises
    ------
    ValueError
        If a column cast to a (non-EA) bool dtype contains NA values.
    """
    result = {}
    for c, values in dct.items():
        # Converter for this column, if any was supplied.
        conv_f = None if converters is None else converters.get(c, None)
        if isinstance(dtypes, dict):
            cast_type = dtypes.get(c, None)
        else:
            # single dtype or None
            cast_type = dtypes

        if self.na_filter:
            # Per-column NA values (honoring keep_default_na).
            col_na_values, col_na_fvalues = _get_na_values(
                c, na_values, na_fvalues, self.keep_default_na
            )
        else:
            # NA filtering disabled: nothing is treated as missing.
            col_na_values, col_na_fvalues = set(), set()

        if c in self._parse_date_cols:
            # GH#26203 Do not convert columns which get converted to dates
            # but replace nans to ensure to_datetime works
            mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
            np.putmask(values, mask, np.nan)
            result[c] = values
            continue

        if conv_f is not None:
            # conv_f applied to data before inference
            if cast_type is not None:
                # Both supplied: the converter wins; warn the user.
                warnings.warn(
                    (
                        "Both a converter and dtype were specified "
                        f"for column {c} - only the converter will be used."
                    ),
                    ParserWarning,
                    stacklevel=find_stack_level(),
                )

            try:
                values = lib.map_infer(values, conv_f)
            except ValueError:
                # Retry, skipping entries that are known NA values.
                mask = algorithms.isin(values, list(na_values)).view(np.uint8)
                values = lib.map_infer_mask(values, conv_f, mask)

            # Converter output is taken as-is: no numeric/bool inference.
            cvals, na_count = self._infer_types(
                values,
                set(col_na_values) | col_na_fvalues,
                cast_type is None,
                try_num_bool=False,
            )
        else:
            is_ea = is_extension_array_dtype(cast_type)
            is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
            # skip inference if specified dtype is object
            # or casting to an EA
            try_num_bool = not (cast_type and is_str_or_ea_dtype)

            # general type inference and conversion
            cvals, na_count = self._infer_types(
                values,
                set(col_na_values) | col_na_fvalues,
                cast_type is None,
                try_num_bool,
            )

        # type specified in dtype param or cast_type is an EA
        if cast_type is not None:
            cast_type = pandas_dtype(cast_type)
        if cast_type and (cvals.dtype != cast_type or is_ea):
            if not is_ea and na_count > 0:
                # NOTE(review): non-EA bool cannot represent NA, hence the error.
                if is_bool_dtype(cast_type):
                    raise ValueError(f"Bool column has NA values in column {c}")
            cvals = self._cast_types(cvals, cast_type, c)

        result[c] = cvals
    return result
538 | 405 | @final
|
539 | 406 | def _set_noconvert_dtype_columns(
|
540 | 407 | self, col_indices: list[int], names: Sequence[Hashable]
|
@@ -580,6 +447,7 @@ def _set(x) -> int:
|
580 | 447 | return x
|
581 | 448 |
|
582 | 449 | if isinstance(self.parse_dates, list):
|
| 450 | + validate_parse_dates_presence(self.parse_dates, names) |
583 | 451 | for val in self.parse_dates:
|
584 | 452 | noconvert_columns.add(_set(val))
|
585 | 453 |
|
@@ -1154,7 +1022,7 @@ def _process_date_conversion(
|
1154 | 1022 | return data_dict
|
1155 | 1023 |
|
1156 | 1024 |
|
1157 |
| -def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): |
| 1025 | +def get_na_values(col, na_values, na_fvalues, keep_default_na: bool): |
1158 | 1026 | """
|
1159 | 1027 | Get the NaN values for a given column.
|
1160 | 1028 |
|
@@ -1191,3 +1059,49 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
|
1191 | 1059 |
|
def is_index_col(col) -> bool:
    """Return True when *col* is a usable index_col spec (not None, not False)."""
    return not (col is None or col is False)
|
| 1062 | + |
| 1063 | + |
| 1064 | +def validate_parse_dates_presence( |
| 1065 | + parse_dates: bool | list, columns: Sequence[Hashable] |
| 1066 | +) -> set: |
| 1067 | + """ |
| 1068 | + Check if parse_dates are in columns. |
| 1069 | +
|
| 1070 | + If user has provided names for parse_dates, check if those columns |
| 1071 | + are available. |
| 1072 | +
|
| 1073 | + Parameters |
| 1074 | + ---------- |
| 1075 | + columns : list |
| 1076 | + List of names of the dataframe. |
| 1077 | +
|
| 1078 | + Returns |
| 1079 | + ------- |
| 1080 | + The names of the columns which will get parsed later if a list |
| 1081 | + is given as specification. |
| 1082 | +
|
| 1083 | + Raises |
| 1084 | + ------ |
| 1085 | + ValueError |
| 1086 | + If column to parse_date is not in dataframe. |
| 1087 | +
|
| 1088 | + """ |
| 1089 | + if not isinstance(parse_dates, list): |
| 1090 | + return set() |
| 1091 | + |
| 1092 | + missing = set() |
| 1093 | + unique_cols = set() |
| 1094 | + for col in parse_dates: |
| 1095 | + if isinstance(col, str): |
| 1096 | + if col not in columns: |
| 1097 | + missing.add(col) |
| 1098 | + else: |
| 1099 | + unique_cols.add(col) |
| 1100 | + elif col in columns: |
| 1101 | + unique_cols.add(col) |
| 1102 | + else: |
| 1103 | + unique_cols.add(columns[col]) |
| 1104 | + if missing: |
| 1105 | + missing_cols = ", ".join(sorted(missing)) |
| 1106 | + raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'") |
| 1107 | + return unique_cols |
0 commit comments