Skip to content

Commit 33a11fe

Browse files
committed
Move can cast to python parser
1 parent 8dbdf34 commit 33a11fe

File tree

2 files changed

+138
-135
lines changed

2 files changed

+138
-135
lines changed

pandas/io/parsers/base_parser.py

Lines changed: 51 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
)
2929
from pandas.util._exceptions import find_stack_level
3030

31-
from pandas.core.dtypes.astype import astype_array
3231
from pandas.core.dtypes.common import (
3332
is_bool_dtype,
3433
is_dict_like,
@@ -39,10 +38,6 @@
3938
is_object_dtype,
4039
is_string_dtype,
4140
)
42-
from pandas.core.dtypes.dtypes import (
43-
CategoricalDtype,
44-
ExtensionDtype,
45-
)
4641
from pandas.core.dtypes.missing import isna
4742

4843
from pandas import (
@@ -55,12 +50,9 @@
5550
ArrowExtensionArray,
5651
BaseMaskedArray,
5752
BooleanArray,
58-
Categorical,
59-
ExtensionArray,
6053
FloatingArray,
6154
IntegerArray,
6255
)
63-
from pandas.core.arrays.boolean import BooleanDtype
6456
from pandas.core.indexes.api import (
6557
Index,
6658
MultiIndex,
@@ -83,7 +75,6 @@
8375
from pandas._typing import (
8476
ArrayLike,
8577
DtypeArg,
86-
DtypeObj,
8778
Hashable,
8879
HashableT,
8980
Scalar,
@@ -171,7 +162,7 @@ def __init__(self, kwds) -> None:
171162

172163
self._first_chunk = True
173164

174-
self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
165+
self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
175166

176167
# Fallback to error to pass a sketchy test(test_override_set_noconvert_columns)
177168
# Normally, this arg would get pre-processed earlier on
@@ -569,80 +560,6 @@ def _infer_types(
569560

570561
return result, na_count
571562

572-
@final
573-
def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:
574-
"""
575-
Cast values to specified type
576-
577-
Parameters
578-
----------
579-
values : ndarray or ExtensionArray
580-
cast_type : np.dtype or ExtensionDtype
581-
dtype to cast values to
582-
column : string
583-
column name - used only for error reporting
584-
585-
Returns
586-
-------
587-
converted : ndarray or ExtensionArray
588-
"""
589-
if isinstance(cast_type, CategoricalDtype):
590-
known_cats = cast_type.categories is not None
591-
592-
if not is_object_dtype(values.dtype) and not known_cats:
593-
# TODO: this is for consistency with
594-
# c-parser which parses all categories
595-
# as strings
596-
values = lib.ensure_string_array(
597-
values, skipna=False, convert_na_value=False
598-
)
599-
600-
cats = Index(values).unique().dropna()
601-
values = Categorical._from_inferred_categories(
602-
cats, cats.get_indexer(values), cast_type, true_values=self.true_values
603-
)
604-
605-
# use the EA's implementation of casting
606-
elif isinstance(cast_type, ExtensionDtype):
607-
array_type = cast_type.construct_array_type()
608-
try:
609-
if isinstance(cast_type, BooleanDtype):
610-
# error: Unexpected keyword argument "true_values" for
611-
# "_from_sequence_of_strings" of "ExtensionArray"
612-
values_str = [str(val) for val in values]
613-
return array_type._from_sequence_of_strings( # type: ignore[call-arg]
614-
values_str,
615-
dtype=cast_type,
616-
true_values=self.true_values,
617-
false_values=self.false_values,
618-
none_values=self.na_values,
619-
)
620-
else:
621-
return array_type._from_sequence_of_strings(values, dtype=cast_type)
622-
except NotImplementedError as err:
623-
raise NotImplementedError(
624-
f"Extension Array: {array_type} must implement "
625-
"_from_sequence_of_strings in order to be used in parser methods"
626-
) from err
627-
628-
elif isinstance(values, ExtensionArray):
629-
values = values.astype(cast_type, copy=False)
630-
elif issubclass(cast_type.type, str):
631-
# TODO: why skipna=True here and False above? some tests depend
632-
# on it here, but nothing fails if we change it above
633-
# (as no tests get there as of 2022-12-06)
634-
values = lib.ensure_string_array(
635-
values, skipna=True, convert_na_value=False
636-
)
637-
else:
638-
try:
639-
values = astype_array(values, cast_type, copy=True)
640-
except ValueError as err:
641-
raise ValueError(
642-
f"Unable to convert column {column} to type {cast_type}"
643-
) from err
644-
return values
645-
646563
@overload
647564
def _do_date_conversions(
648565
self,
@@ -776,56 +693,6 @@ def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> Sequen
776693

777694
return usecols
778695

779-
@final
780-
def _validate_usecols_arg(self, usecols):
781-
"""
782-
Validate the 'usecols' parameter.
783-
784-
Checks whether or not the 'usecols' parameter contains all integers
785-
(column selection by index), strings (column by name) or is a callable.
786-
Raises a ValueError if that is not the case.
787-
788-
Parameters
789-
----------
790-
usecols : list-like, callable, or None
791-
List of columns to use when parsing or a callable that can be used
792-
to filter a list of table columns.
793-
794-
Returns
795-
-------
796-
usecols_tuple : tuple
797-
A tuple of (verified_usecols, usecols_dtype).
798-
799-
'verified_usecols' is either a set if an array-like is passed in or
800-
'usecols' if a callable or None is passed in.
801-
802-
'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
803-
is passed in or None if a callable or None is passed in.
804-
"""
805-
msg = (
806-
"'usecols' must either be list-like of all strings, all unicode, "
807-
"all integers or a callable."
808-
)
809-
if usecols is not None:
810-
if callable(usecols):
811-
return usecols, None
812-
813-
if not is_list_like(usecols):
814-
# see gh-20529
815-
#
816-
# Ensure it is iterable container but not string.
817-
raise ValueError(msg)
818-
819-
usecols_dtype = lib.infer_dtype(usecols, skipna=False)
820-
821-
if usecols_dtype not in ("empty", "integer", "string"):
822-
raise ValueError(msg)
823-
824-
usecols = set(usecols)
825-
826-
return usecols, usecols_dtype
827-
return usecols, None
828-
829696
@final
830697
def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:
831698
if not is_index_col(index_col):
@@ -1071,3 +938,53 @@ def validate_parse_dates_presence(
1071938
missing_cols = ", ".join(sorted(missing))
1072939
raise ValueError(f"Missing column provided to 'parse_dates': '{missing_cols}'")
1073940
return unique_cols
941+
942+
943+
def _validate_usecols_arg(usecols):
944+
"""
945+
Validate the 'usecols' parameter.
946+
947+
Checks whether or not the 'usecols' parameter contains all integers
948+
(column selection by index), strings (column by name) or is a callable.
949+
Raises a ValueError if that is not the case.
950+
951+
Parameters
952+
----------
953+
usecols : list-like, callable, or None
954+
List of columns to use when parsing or a callable that can be used
955+
to filter a list of table columns.
956+
957+
Returns
958+
-------
959+
usecols_tuple : tuple
960+
A tuple of (verified_usecols, usecols_dtype).
961+
962+
'verified_usecols' is either a set if an array-like is passed in or
963+
'usecols' if a callable or None is passed in.
964+
965+
'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
966+
is passed in or None if a callable or None is passed in.
967+
"""
968+
msg = (
969+
"'usecols' must either be list-like of all strings, all unicode, "
970+
"all integers or a callable."
971+
)
972+
if usecols is not None:
973+
if callable(usecols):
974+
return usecols, None
975+
976+
if not is_list_like(usecols):
977+
# see gh-20529
978+
#
979+
# Ensure it is iterable container but not string.
980+
raise ValueError(msg)
981+
982+
usecols_dtype = lib.infer_dtype(usecols, skipna=False)
983+
984+
if usecols_dtype not in ("empty", "integer", "string"):
985+
raise ValueError(msg)
986+
987+
usecols = set(usecols)
988+
989+
return usecols, usecols_dtype
990+
return usecols, None

pandas/io/parsers/python_parser.py

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,17 +29,29 @@
2929
from pandas.util._decorators import cache_readonly
3030
from pandas.util._exceptions import find_stack_level
3131

32+
from pandas.core.dtypes.astype import astype_array
3233
from pandas.core.dtypes.common import (
3334
is_bool_dtype,
3435
is_extension_array_dtype,
3536
is_integer,
3637
is_numeric_dtype,
38+
is_object_dtype,
3739
is_string_dtype,
3840
pandas_dtype,
3941
)
42+
from pandas.core.dtypes.dtypes import (
43+
CategoricalDtype,
44+
ExtensionDtype,
45+
)
4046
from pandas.core.dtypes.inference import is_dict_like
4147

4248
from pandas.core import algorithms
49+
from pandas.core.arrays import (
50+
Categorical,
51+
ExtensionArray,
52+
)
53+
from pandas.core.arrays.boolean import BooleanDtype
54+
from pandas.core.indexes.api import Index
4355

4456
from pandas.io.common import (
4557
dedup_names,
@@ -62,13 +74,13 @@
6274

6375
from pandas._typing import (
6476
ArrayLike,
77+
DtypeObj,
6578
ReadCsvBuffer,
6679
Scalar,
6780
T,
6881
)
6982

7083
from pandas import (
71-
Index,
7284
MultiIndex,
7385
Series,
7486
)
@@ -463,6 +475,80 @@ def _convert_to_ndarrays(
463475
result[c] = cvals
464476
return result
465477

478+
@final
479+
def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:
480+
"""
481+
Cast values to specified type
482+
483+
Parameters
484+
----------
485+
values : ndarray or ExtensionArray
486+
cast_type : np.dtype or ExtensionDtype
487+
dtype to cast values to
488+
column : string
489+
column name - used only for error reporting
490+
491+
Returns
492+
-------
493+
converted : ndarray or ExtensionArray
494+
"""
495+
if isinstance(cast_type, CategoricalDtype):
496+
known_cats = cast_type.categories is not None
497+
498+
if not is_object_dtype(values.dtype) and not known_cats:
499+
# TODO: this is for consistency with
500+
# c-parser which parses all categories
501+
# as strings
502+
values = lib.ensure_string_array(
503+
values, skipna=False, convert_na_value=False
504+
)
505+
506+
cats = Index(values).unique().dropna()
507+
values = Categorical._from_inferred_categories(
508+
cats, cats.get_indexer(values), cast_type, true_values=self.true_values
509+
)
510+
511+
# use the EA's implementation of casting
512+
elif isinstance(cast_type, ExtensionDtype):
513+
array_type = cast_type.construct_array_type()
514+
try:
515+
if isinstance(cast_type, BooleanDtype):
516+
# error: Unexpected keyword argument "true_values" for
517+
# "_from_sequence_of_strings" of "ExtensionArray"
518+
values_str = [str(val) for val in values]
519+
return array_type._from_sequence_of_strings( # type: ignore[call-arg]
520+
values_str,
521+
dtype=cast_type,
522+
true_values=self.true_values,
523+
false_values=self.false_values,
524+
none_values=self.na_values,
525+
)
526+
else:
527+
return array_type._from_sequence_of_strings(values, dtype=cast_type)
528+
except NotImplementedError as err:
529+
raise NotImplementedError(
530+
f"Extension Array: {array_type} must implement "
531+
"_from_sequence_of_strings in order to be used in parser methods"
532+
) from err
533+
534+
elif isinstance(values, ExtensionArray):
535+
values = values.astype(cast_type, copy=False)
536+
elif issubclass(cast_type.type, str):
537+
# TODO: why skipna=True here and False above? some tests depend
538+
# on it here, but nothing fails if we change it above
539+
# (as no tests get there as of 2022-12-06)
540+
values = lib.ensure_string_array(
541+
values, skipna=True, convert_na_value=False
542+
)
543+
else:
544+
try:
545+
values = astype_array(values, cast_type, copy=True)
546+
except ValueError as err:
547+
raise ValueError(
548+
f"Unable to convert column {column} to type {cast_type}"
549+
) from err
550+
return values
551+
466552
@cache_readonly
467553
def _have_mi_columns(self) -> bool:
468554
if self.header is None:

0 commit comments

Comments
 (0)