Skip to content

Commit cbf29f2

Browse files
committed
Revert "Move can cast to python parser"
This reverts commit 99ca747.
1 parent 99ca747 commit cbf29f2

File tree

2 files changed

+132
-129
lines changed

2 files changed

+132
-129
lines changed

pandas/io/parsers/base_parser.py

Lines changed: 131 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,25 @@
1515

1616
import numpy as np
1717

18-
from pandas._libs import lib
18+
from pandas._libs import (
19+
lib,
20+
parsers,
21+
)
22+
import pandas._libs.ops as libops
1923
from pandas._libs.parsers import STR_NA_VALUES
24+
from pandas.compat._optional import import_optional_dependency
2025
from pandas.errors import (
2126
ParserError,
2227
ParserWarning,
2328
)
2429
from pandas.util._exceptions import find_stack_level
2530

2631
from pandas.core.dtypes.common import (
32+
is_bool_dtype,
2733
is_dict_like,
34+
is_float_dtype,
2835
is_integer,
36+
is_integer_dtype,
2937
is_list_like,
3038
is_object_dtype,
3139
is_string_dtype,
@@ -35,6 +43,15 @@
3543
from pandas import (
3644
DataFrame,
3745
DatetimeIndex,
46+
StringDtype,
47+
)
48+
from pandas.core import algorithms
49+
from pandas.core.arrays import (
50+
ArrowExtensionArray,
51+
BaseMaskedArray,
52+
BooleanArray,
53+
FloatingArray,
54+
IntegerArray,
3855
)
3956
from pandas.core.indexes.api import (
4057
Index,
@@ -430,6 +447,119 @@ def _set(x) -> int:
430447

431448
return noconvert_columns
432449

450+
@final
451+
def _infer_types(
452+
self, values, na_values, no_dtype_specified, try_num_bool: bool = True
453+
) -> tuple[ArrayLike, int]:
454+
"""
455+
Infer types of values, possibly casting
456+
457+
Parameters
458+
----------
459+
values : ndarray
460+
na_values : set
461+
no_dtype_specified: Specifies if we want to cast explicitly
462+
try_num_bool : bool, default try
463+
try to cast values to numeric (first preference) or boolean
464+
465+
Returns
466+
-------
467+
converted : ndarray or ExtensionArray
468+
na_count : int
469+
"""
470+
na_count = 0
471+
if issubclass(values.dtype.type, (np.number, np.bool_)):
472+
# If our array has numeric dtype, we don't have to check for strings in isin
473+
na_values = np.array([val for val in na_values if not isinstance(val, str)])
474+
mask = algorithms.isin(values, na_values)
475+
na_count = mask.astype("uint8", copy=False).sum()
476+
if na_count > 0:
477+
if is_integer_dtype(values):
478+
values = values.astype(np.float64)
479+
np.putmask(values, mask, np.nan)
480+
return values, na_count
481+
482+
dtype_backend = self.dtype_backend
483+
non_default_dtype_backend = (
484+
no_dtype_specified and dtype_backend is not lib.no_default
485+
)
486+
result: ArrayLike
487+
488+
if try_num_bool and is_object_dtype(values.dtype):
489+
# exclude e.g DatetimeIndex here
490+
try:
491+
result, result_mask = lib.maybe_convert_numeric(
492+
values,
493+
na_values,
494+
False,
495+
convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type]
496+
)
497+
except (ValueError, TypeError):
498+
# e.g. encountering datetime string gets ValueError
499+
# TypeError can be raised in floatify
500+
na_count = parsers.sanitize_objects(values, na_values)
501+
result = values
502+
else:
503+
if non_default_dtype_backend:
504+
if result_mask is None:
505+
result_mask = np.zeros(result.shape, dtype=np.bool_)
506+
507+
if result_mask.all():
508+
result = IntegerArray(
509+
np.ones(result_mask.shape, dtype=np.int64), result_mask
510+
)
511+
elif is_integer_dtype(result):
512+
result = IntegerArray(result, result_mask)
513+
elif is_bool_dtype(result):
514+
result = BooleanArray(result, result_mask)
515+
elif is_float_dtype(result):
516+
result = FloatingArray(result, result_mask)
517+
518+
na_count = result_mask.sum()
519+
else:
520+
na_count = isna(result).sum()
521+
else:
522+
result = values
523+
if values.dtype == np.object_:
524+
na_count = parsers.sanitize_objects(values, na_values)
525+
526+
if result.dtype == np.object_ and try_num_bool:
527+
result, bool_mask = libops.maybe_convert_bool(
528+
np.asarray(values),
529+
true_values=self.true_values,
530+
false_values=self.false_values,
531+
convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type]
532+
)
533+
if result.dtype == np.bool_ and non_default_dtype_backend:
534+
if bool_mask is None:
535+
bool_mask = np.zeros(result.shape, dtype=np.bool_)
536+
result = BooleanArray(result, bool_mask)
537+
elif result.dtype == np.object_ and non_default_dtype_backend:
538+
# read_excel sends array of datetime objects
539+
if not lib.is_datetime_array(result, skipna=True):
540+
dtype = StringDtype()
541+
cls = dtype.construct_array_type()
542+
result = cls._from_sequence(values, dtype=dtype)
543+
544+
if dtype_backend == "pyarrow":
545+
pa = import_optional_dependency("pyarrow")
546+
if isinstance(result, np.ndarray):
547+
result = ArrowExtensionArray(pa.array(result, from_pandas=True))
548+
elif isinstance(result, BaseMaskedArray):
549+
if result._mask.all():
550+
# We want an arrow null array here
551+
result = ArrowExtensionArray(pa.array([None] * len(result)))
552+
else:
553+
result = ArrowExtensionArray(
554+
pa.array(result._data, mask=result._mask)
555+
)
556+
else:
557+
result = ArrowExtensionArray(
558+
pa.array(result.to_numpy(), from_pandas=True)
559+
)
560+
561+
return result, na_count
562+
433563
@overload
434564
def _do_date_conversions(
435565
self,

pandas/io/parsers/python_parser.py

Lines changed: 1 addition & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,7 @@
2020

2121
import numpy as np
2222

23-
from pandas._libs import (
24-
lib,
25-
parsers,
26-
)
27-
import pandas._libs.ops as libops
28-
from pandas.compat._optional import import_optional_dependency
23+
from pandas._libs import lib
2924
from pandas.errors import (
3025
EmptyDataError,
3126
ParserError,
@@ -38,9 +33,7 @@
3833
from pandas.core.dtypes.common import (
3934
is_bool_dtype,
4035
is_extension_array_dtype,
41-
is_float_dtype,
4236
is_integer,
43-
is_integer_dtype,
4437
is_numeric_dtype,
4538
is_object_dtype,
4639
is_string_dtype,
@@ -51,20 +44,13 @@
5144
ExtensionDtype,
5245
)
5346
from pandas.core.dtypes.inference import is_dict_like
54-
from pandas.core.dtypes.missing import isna
5547

5648
from pandas.core import algorithms
5749
from pandas.core.arrays import (
58-
ArrowExtensionArray,
59-
BaseMaskedArray,
60-
BooleanArray,
6150
Categorical,
6251
ExtensionArray,
63-
FloatingArray,
64-
IntegerArray,
6552
)
6653
from pandas.core.arrays.boolean import BooleanDtype
67-
from pandas.core.arrays.string_ import StringDtype
6854
from pandas.core.indexes.api import Index
6955

7056
from pandas.io.common import (
@@ -563,119 +549,6 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi
563549
) from err
564550
return values
565551

566-
@final
567-
def _infer_types(
568-
self, values, na_values, no_dtype_specified, try_num_bool: bool = True
569-
) -> tuple[ArrayLike, int]:
570-
"""
571-
Infer types of values, possibly casting
572-
573-
Parameters
574-
----------
575-
values : ndarray
576-
na_values : set
577-
no_dtype_specified: Specifies if we want to cast explicitly
578-
try_num_bool : bool, default try
579-
try to cast values to numeric (first preference) or boolean
580-
581-
Returns
582-
-------
583-
converted : ndarray or ExtensionArray
584-
na_count : int
585-
"""
586-
na_count = 0
587-
if issubclass(values.dtype.type, (np.number, np.bool_)):
588-
# If our array has numeric dtype, we don't have to check for strings in isin
589-
na_values = np.array([val for val in na_values if not isinstance(val, str)])
590-
mask = algorithms.isin(values, na_values)
591-
na_count = mask.astype("uint8", copy=False).sum()
592-
if na_count > 0:
593-
if is_integer_dtype(values):
594-
values = values.astype(np.float64)
595-
np.putmask(values, mask, np.nan)
596-
return values, na_count
597-
598-
dtype_backend = self.dtype_backend
599-
non_default_dtype_backend = (
600-
no_dtype_specified and dtype_backend is not lib.no_default
601-
)
602-
result: ArrayLike
603-
604-
if try_num_bool and is_object_dtype(values.dtype):
605-
# exclude e.g DatetimeIndex here
606-
try:
607-
result, result_mask = lib.maybe_convert_numeric(
608-
values,
609-
na_values,
610-
False,
611-
convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type]
612-
)
613-
except (ValueError, TypeError):
614-
# e.g. encountering datetime string gets ValueError
615-
# TypeError can be raised in floatify
616-
na_count = parsers.sanitize_objects(values, na_values)
617-
result = values
618-
else:
619-
if non_default_dtype_backend:
620-
if result_mask is None:
621-
result_mask = np.zeros(result.shape, dtype=np.bool_)
622-
623-
if result_mask.all():
624-
result = IntegerArray(
625-
np.ones(result_mask.shape, dtype=np.int64), result_mask
626-
)
627-
elif is_integer_dtype(result):
628-
result = IntegerArray(result, result_mask)
629-
elif is_bool_dtype(result):
630-
result = BooleanArray(result, result_mask)
631-
elif is_float_dtype(result):
632-
result = FloatingArray(result, result_mask)
633-
634-
na_count = result_mask.sum()
635-
else:
636-
na_count = isna(result).sum()
637-
else:
638-
result = values
639-
if values.dtype == np.object_:
640-
na_count = parsers.sanitize_objects(values, na_values)
641-
642-
if result.dtype == np.object_ and try_num_bool:
643-
result, bool_mask = libops.maybe_convert_bool(
644-
np.asarray(values),
645-
true_values=self.true_values,
646-
false_values=self.false_values,
647-
convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type]
648-
)
649-
if result.dtype == np.bool_ and non_default_dtype_backend:
650-
if bool_mask is None:
651-
bool_mask = np.zeros(result.shape, dtype=np.bool_)
652-
result = BooleanArray(result, bool_mask)
653-
elif result.dtype == np.object_ and non_default_dtype_backend:
654-
# read_excel sends array of datetime objects
655-
if not lib.is_datetime_array(result, skipna=True):
656-
dtype = StringDtype()
657-
cls = dtype.construct_array_type()
658-
result = cls._from_sequence(values, dtype=dtype)
659-
660-
if dtype_backend == "pyarrow":
661-
pa = import_optional_dependency("pyarrow")
662-
if isinstance(result, np.ndarray):
663-
result = ArrowExtensionArray(pa.array(result, from_pandas=True))
664-
elif isinstance(result, BaseMaskedArray):
665-
if result._mask.all():
666-
# We want an arrow null array here
667-
result = ArrowExtensionArray(pa.array([None] * len(result)))
668-
else:
669-
result = ArrowExtensionArray(
670-
pa.array(result._data, mask=result._mask)
671-
)
672-
else:
673-
result = ArrowExtensionArray(
674-
pa.array(result.to_numpy(), from_pandas=True)
675-
)
676-
677-
return result, na_count
678-
679552
@cache_readonly
680553
def _have_mi_columns(self) -> bool:
681554
if self.header is None:

0 commit comments

Comments
 (0)