From b00ce10ea4bd301f6e64810869ed4eceba55fc4b Mon Sep 17 00:00:00 2001 From: andrewgsavage S Date: Sun, 1 Sep 2024 17:58:16 +0000 Subject: [PATCH 1/5] infer --- pandas/_libs/lib.pyx | 7 +++++++ pandas/core/construction.py | 4 ++++ pandas/core/dtypes/base.py | 15 +++++++++++++++ pandas/core/dtypes/cast.py | 9 +++++++++ pandas/core/series.py | 2 ++ 5 files changed, 37 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e1a2a0142c52e..a5cc78d69e5f7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -107,6 +107,8 @@ from pandas._libs.tslibs.period cimport is_period_object from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 from pandas._libs.tslibs.timezones cimport tz_compare +from pandas.core.dtypes.base import _registry + # constants that will be compared to potentially arbitrarily large # python int cdef: @@ -1693,6 +1695,11 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if is_interval_array(values): return "interval" + print("infer_dtype") + reg_dtype = _registry.match_scalar(val) + if reg_dtype: + return str(reg_dtype) + cnp.PyArray_ITER_RESET(it) for i in range(n): val = PyArray_GETITEM(values, PyArray_ITER_DATA(it)) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 665eb75953078..06e81264cb2c3 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -370,6 +370,10 @@ def array( elif data.dtype.kind == "b": return BooleanArray._from_sequence(data, dtype="boolean", copy=copy) + # elif inferred_dtype != "mixed": + # dtype = pandas_dtype(inferred_dtype) + # cls = dtype.construct_array_type() + # return cls._from_sequence(data, dtype=dtype, copy=copy) else: # e.g. complex return NumpyExtensionArray._from_sequence(data, dtype=data.dtype, copy=copy) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index d8a42d83b6c54..85db5817190bb 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -444,6 +444,13 @@ def _can_fast_transpose(self) -> bool: """ return False + def is_unambiguous_scalar(self): + return False + + @classmethod + def construct_from_scalar(cls, scalar): + return cls() + class StorageExtensionDtype(ExtensionDtype): """ExtensionDtype that may be backed by more than one implementation.""" @@ -582,5 +589,13 @@ def find( return None + def match_scalar( + self, scalar: Any + ) -> type_t[ExtensionDtype] | ExtensionDtype | None: + for dtype in self.dtypes: + if dtype.is_unambiguous_scalar(scalar): + return dtype.construct_from_scalar(scalar) + return None + _registry = Registry() diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6ba07b1761557..2f926b64ed462 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -44,6 +44,7 @@ LossySetitemError, ) +from pandas.core.dtypes.base import _registry from pandas.core.dtypes.common import ( ensure_int8, ensure_int16, @@ -857,6 +858,10 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: subtype = infer_dtype_from_scalar(val.left)[0] dtype = IntervalDtype(subtype=subtype, closed=val.closed) + reg_dtype = _registry.match_scalar(val) + if reg_dtype: + dtype = reg_dtype + return dtype, val @@ -913,6 +918,10 @@ def infer_dtype_from_array(arr) -> tuple[DtypeObj, ArrayLike]: inferred = lib.infer_dtype(arr, skipna=False) if inferred in ["string", "bytes", "mixed", "mixed-integer"]: return (np.dtype(np.object_), arr) + else: + arr_dtype = pandas_dtype_func(inferred) + if isinstance(arr_dtype, ExtensionDtype): + return arr_dtype, arr arr = np.asarray(arr) return arr.dtype, arr diff --git a/pandas/core/series.py b/pandas/core/series.py index 4f79e30f48f3c..ca64e986f702d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -501,6 +501,8 @@ def __init__( elif copy: data = data.copy() else: + if dtype is None: + dtype = infer_dtype_from(data)[0] data = sanitize_array(data, index, dtype, copy) data = SingleBlockManager.from_array(data, index, refs=refs) From c7cc8003a4336c1f8866a9d67735daab174b6982 Mon Sep 17 00:00:00 2001 From: Andrew Savage Date: Sat, 7 Sep 2024 14:46:55 +0100 Subject: [PATCH 2/5] test --- pandas/_libs/lib.pyx | 1 - pandas/core/dtypes/base.py | 7 ++++++- pandas/core/dtypes/cast.py | 2 ++ pandas/core/series.py | 5 ++++- pandas/tests/dtypes/test_inference.py | 7 +++++++ 5 files changed, 19 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a5cc78d69e5f7..dd4ef98ada07b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1695,7 +1695,6 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if is_interval_array(values): return "interval" - print("infer_dtype") reg_dtype = _registry.match_scalar(val) if reg_dtype: return str(reg_dtype) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 85db5817190bb..ccc2143162968 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -444,12 +444,17 @@ def _can_fast_transpose(self) -> bool: """ return False - def is_unambiguous_scalar(self): + @classmethod + def is_unambiguous_scalar(cls, scalar): return False @classmethod def construct_from_scalar(cls, scalar): return cls() + + @property + def is_external_dtype(self) -> bool: + return not self.__module__.split(".")[0] == "pandas" class StorageExtensionDtype(ExtensionDtype): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2f926b64ed462..bcbd933a33c12 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -918,6 +918,8 @@ def infer_dtype_from_array(arr) -> tuple[DtypeObj, ArrayLike]: inferred = lib.infer_dtype(arr, skipna=False) if inferred in ["string", "bytes", "mixed", "mixed-integer"]: return (np.dtype(np.object_), arr) + elif inferred in ["empty", "integer", "floating", "integer-na", "mixed-integer-float", "datetime", "period", "timedelta", "time", "date"]: + pass else: arr_dtype = pandas_dtype_func(inferred) if isinstance(arr_dtype, ExtensionDtype): diff --git a/pandas/core/series.py b/pandas/core/series.py index ca64e986f702d..65cf8fbb3c5a4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -502,7 +502,10 @@ def __init__( data = data.copy() else: if dtype is None: - dtype = infer_dtype_from(data)[0] + inferred_dtype = infer_dtype_from(data)[0] + if isinstance(inferred_dtype, ExtensionDtype) and inferred_dtype.is_external_dtype: + dtype = inferred_dtype + # import pdb; pdb.set_trace() data = sanitize_array(data, index, dtype, copy) data = SingleBlockManager.from_array(data, index, refs=refs) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index da444b55490f0..54187e6c85d00 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -81,6 +81,13 @@ def coerce(request): return request.param +class MockScalar: + pass + +class MockDtype(pd.api.extensions.ExtensionDtype): + + + class MockNumpyLikeArray: """ From 5878fab4da75d91c66b949888d75111a53a01c40 Mon Sep 17 00:00:00 2001 From: Andrew Date: Sat, 7 Sep 2024 18:31:21 +0100 Subject: [PATCH 3/5] test --- pandas/tests/dtypes/test_inference.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 54187e6c85d00..0f0ada868e07e 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -75,19 +75,14 @@ FloatingArray, IntegerArray, ) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core @pytest.fixture(params=[True, False], ids=str) def coerce(request): return request.param -class MockScalar: - pass - -class MockDtype(pd.api.extensions.ExtensionDtype): - - - class MockNumpyLikeArray: """ @@ -2032,3 +2027,18 @@ def test_find_result_type_int_int(right, result): def test_find_result_type_floats(right, result): left_dtype = np.dtype("float16") assert find_result_type(left_dtype, right) == result + +def test_infer_dtype_extensiondtype(): + class MockScalar: + pass + + class MockDtype(ExtensionDtype): + @property + def name(self): + return "MockDtype" + def is_unambiguous_scalar(scalar): + if isinstance(scalar, MockScalar): + return True + return False + arr = [MockScalar()] + assert lib.infer_dtype(arr, skipna=True) == "MockDtype" \ No newline at end of file From 2f992fd198c52ba511f2202e62a8d30145e10bcf Mon Sep 17 00:00:00 2001 From: Andrew Savage Date: Sun, 8 Sep 2024 21:18:23 +0100 Subject: [PATCH 4/5] test --- pandas/core/dtypes/base.py | 2 +- pandas/tests/dtypes/cast/test_infer_dtype.py | 30 +++++++- pandas/tests/dtypes/test_inference.py | 4 +- pandas/tests/series/test_constructors.py | 75 +++++++++++++++++++- 4 files changed, 106 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index ccc2143162968..ba396295efa5e 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -454,7 +454,7 @@ def construct_from_scalar(cls, scalar): @property def is_external_dtype(self) -> bool: - return not self.__module__.split(".")[0] == "pandas" + return self.__module__[:8] == "pandas.c" class StorageExtensionDtype(ExtensionDtype): diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index 679031a625c2d..0ef5a4344ffd5 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -23,7 +23,33 @@ Timestamp, date_range, ) - +from pandas.core.dtypes.dtypes import ExtensionDtype, register_extension_dtype + + +class MockScalar: + pass + +@register_extension_dtype +class MockDtype(ExtensionDtype): + @property + def name(self): + return "MockDtype" + def is_unambiguous_scalar(scalar): + if isinstance(scalar, MockScalar): + return True + return False + + @classmethod + def construct_from_string(cls, string: str): + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + + if string == cls.__name__: + return cls() + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") def test_infer_dtype_from_int_scalar(any_int_numpy_dtype): # Test that infer_dtype_from_scalar is @@ -157,6 +183,7 @@ def test_infer_dtype_from_scalar_errors(): (np.datetime64("2016-01-01"), np.dtype("M8[s]")), (Timestamp("20160101"), np.dtype("M8[s]")), (Timestamp("20160101", tz="UTC"), "datetime64[s, UTC]"), + (MockScalar(), MockDtype()) ], ) def test_infer_dtype_from_scalar(value, expected, using_infer_string): @@ -189,6 +216,7 @@ def test_infer_dtype_from_scalar(value, expected, using_infer_string): Series(date_range("20160101", periods=3, tz="US/Eastern")), "datetime64[ns, US/Eastern]", ), + ([MockScalar()], MockDtype()) ], ) def test_infer_dtype_from_array(arr, expected, using_infer_string): diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 0f0ada868e07e..691db81c67010 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -75,9 +75,8 @@ FloatingArray, IntegerArray, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ExtensionDtype, register_extension_dtype -from pandas.core @pytest.fixture(params=[True, False], ids=str) def coerce(request): @@ -2032,6 +2031,7 @@ def test_infer_dtype_extensiondtype(): class MockScalar: pass + @register_extension_dtype class MockDtype(ExtensionDtype): @property def name(self): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1771a4dfdb71f..15fb91290f7e7 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -44,10 +44,62 @@ from pandas.core.arrays import ( IntegerArray, IntervalArray, - period_array, + period_array,ExtensionArray ) from pandas.core.internals.blocks import NumpyBlock +from pandas.core.dtypes.dtypes import ExtensionDtype, register_extension_dtype + +class MockScalar: + pass + +@register_extension_dtype +class MockDtype(ExtensionDtype): + type = MockScalar + @property + def name(self): + return "MockDtype" + def is_unambiguous_scalar(scalar): + if isinstance(scalar, MockScalar): + return True + return False + + @classmethod + def construct_from_string(cls, string: str): + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + + if string == cls.__name__: + return cls() + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + @classmethod + def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return MockArray + + @property + def is_external_dtype(self): + return True + + +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +class MockArray(NDArrayBackedExtensionArray): + dtype = MockDtype() + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy=False): + scalars = np.ndarray([0 for i in scalars]) + return cls(scalars, "O") + class TestSeriesConstructors: def test_from_ints_with_non_nano_dt64_dtype(self, index_or_series): @@ -152,6 +204,27 @@ def test_scalar_extension_dtype(self, ea_scalar_and_dtype): assert ser.dtype == ea_dtype tm.assert_series_equal(ser, expected) + + def test_scalar_extension_dtype2(self): + # GH 28401 + from pandas.core.dtypes.cast import ( + LossySetitemError, + construct_1d_arraylike_from_scalar, + find_common_type, + infer_dtype_from, + maybe_box_native, + maybe_cast_pointwise_result, + ) + ea_scalar, ea_dtype = MockScalar(), MockDtype() + + # import pdb; pdb.set_trace() + infer_dtype_from(MockScalar()) + ser = Series(ea_scalar, index=range(3)) + expected = Series([ea_scalar] * 3, dtype=ea_dtype) + + assert ser.dtype == ea_dtype + # tm.assert_series_equal(ser, expected) + def test_constructor(self, datetime_series, using_infer_string): empty_series = Series() assert datetime_series.index._is_all_dates From d4c83e832babcd0b0bfeebd9ccbfc88e3199ce88 Mon Sep 17 00:00:00 2001 From: Andrew Savage Date: Mon, 9 Sep 2024 20:42:38 +0100 Subject: [PATCH 5/5] test --- pandas/core/construction.py | 4 ---- pandas/tests/series/test_constructors.py | 16 +--------------- 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 06e81264cb2c3..665eb75953078 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -370,10 +370,6 @@ def array( elif data.dtype.kind == "b": return BooleanArray._from_sequence(data, dtype="boolean", copy=copy) - # elif inferred_dtype != "mixed": - # dtype = pandas_dtype(inferred_dtype) - # cls = dtype.construct_array_type() - # return cls._from_sequence(data, dtype=dtype, copy=copy) else: # e.g. complex return NumpyExtensionArray._from_sequence(data, dtype=data.dtype, copy=copy) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 15fb91290f7e7..aeec100d6931e 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -205,25 +205,11 @@ def test_scalar_extension_dtype(self, ea_scalar_and_dtype): tm.assert_series_equal(ser, expected) - def test_scalar_extension_dtype2(self): - # GH 28401 - from pandas.core.dtypes.cast import ( - LossySetitemError, - construct_1d_arraylike_from_scalar, - find_common_type, - infer_dtype_from, - maybe_box_native, - maybe_cast_pointwise_result, - ) + def test_unambiguous_scalar(self): ea_scalar, ea_dtype = MockScalar(), MockDtype() - # import pdb; pdb.set_trace() - infer_dtype_from(MockScalar()) ser = Series(ea_scalar, index=range(3)) - expected = Series([ea_scalar] * 3, dtype=ea_dtype) - assert ser.dtype == ea_dtype - # tm.assert_series_equal(ser, expected) def test_constructor(self, datetime_series, using_infer_string): empty_series = Series()