Add method _validate_setitem_value and fix tests

maushumee · maushumee · commit 493dd9d9bd13 · 2024-07-27T22:30:18.000-04:00
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
@@ -239,6 +239,84 @@ def _values_for_factorize(self) -> tuple[np.ndarray, float | None]:
             fv = np.nan
         return self._ndarray, fv
 
+    def _validate_setitem_value(self, value):
+        """
+        Check that ``value`` can be set without a lossy cast; return it as-is.
+
+        Raises
+        ------
+        TypeError
+            If no check for ``self.dtype.kind`` accepts ``value``.
+        """
+        kind = self.dtype.kind
+
+        if kind == "b":
+            if lib.is_bool(value) or np.can_cast(type(value), self.dtype.type):
+                return value
+            if isinstance(value, NumpyExtensionArray) and (
+                lib.is_bool_array(value.to_numpy())
+            ):
+                return value
+
+        elif kind == "i":
+            if lib.is_integer(value) or np.can_cast(type(value), self.dtype.type):
+                return value
+            if isinstance(value, NumpyExtensionArray) and lib.is_integer_array(
+                value.to_numpy()
+            ):
+                return value
+
+        elif kind == "u":  # unsigned: integer scalars must also be > -1
+            if (lib.is_integer(value) and value > -1) or np.can_cast(
+                type(value), self.dtype.type
+            ):
+                return value
+
+        elif kind == "c":
+            if lib.is_complex(value) or np.can_cast(type(value), self.dtype.type):
+                return value
+
+        elif kind == "S":
+            if isinstance(value, str) or np.can_cast(type(value), self.dtype.type):
+                return value
+            if isinstance(value, NumpyExtensionArray) and lib.is_string_array(
+                value.to_numpy()
+            ):
+                return value
+
+        elif kind == "M":
+            if isinstance(value, np.datetime64):
+                return value
+            if isinstance(value, NumpyExtensionArray) and (
+                lib.is_date_array(value.to_numpy())
+                or lib.is_datetime_array(value.to_numpy())
+                or lib.is_datetime64_array(value.to_numpy())
+                or lib.is_datetime_with_singletz_array(value.to_numpy())
+            ):
+                return value
+
+        elif kind == "m":
+            if isinstance(value, np.timedelta64):
+                return value
+            if isinstance(value, NumpyExtensionArray) and (
+                lib.is_timedelta_or_timedelta64_array(value.to_numpy())
+                or lib.is_time_array(value.to_numpy())
+            ):
+                return value
+
+        elif kind == "f":
+            if lib.is_float(value) or np.can_cast(type(value), self.dtype.type):
+                return value
+            if isinstance(value, NumpyExtensionArray) and lib.is_float_array(
+                value.to_numpy()
+            ):
+                return value
+
+        elif np.can_cast(type(value), self.dtype.type):  # remaining dtype kinds
+            return value
+
+        raise TypeError(f"Invalid value '{value!s}' for dtype {self.dtype}")
+
     # Base EA class (and all other EA classes) don't have limit_area keyword
     # This can be removed here as well when the interpolate ffill/bfill method
     # deprecation is enforced
diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py
@@ -8,8 +8,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -303,7 +301,6 @@ def test_iadd_string(self):
         index += "_x"
         assert "a_x" in index
 
-    @pytest.mark.xfail(using_string_dtype(), reason="add doesn't work")
     def test_add(self):
         index = pd.Index([str(i) for i in range(10)])
         expected = pd.Index(index.values * 2)
diff --git a/pandas/tests/arrays/numpy_/test_numpy.py b/pandas/tests/arrays/numpy_/test_numpy.py
@@ -197,10 +197,12 @@ def test_validate_reduction_keyword_args():
         arr.all(keepdims=True)
 
 
-@pytest.mark.xfail
 @pytest.mark.parametrize(
     "value, expectedError",
     [
+        (True, False),
+        (5, False),
+        (5.0, True),
         (5.5, True),
         (1 + 2j, True),
         ("t", True),
@@ -217,10 +219,13 @@ def test_int_arr_validate_setitem_value(value, expectedError):
         assert arr[0] == value
 
 
-@pytest.mark.xfail
 @pytest.mark.parametrize(
     "value, expectedError",
     [
+        (True, False),
+        (5, False),
+        (5.0, False),
+        (5.5, False),
         (1 + 2j, True),
         ("t", True),
         (datetime.now(), True),
@@ -236,10 +241,14 @@ def test_float_arr_validate_setitem_value(value, expectedError):
         assert arr[0] == value
 
 
-@pytest.mark.xfail
 @pytest.mark.parametrize(
     "value, expectedError",
     [
+        (True, False),
+        (5, False),
+        (5.0, False),
+        (5.5, False),
+        ("t", False),
         (datetime.now(), True),
     ],
 )
@@ -333,12 +342,15 @@ def test_setitem_object_typecode(dtype):
 def test_setitem_no_coercion():
     # https://github.com/pandas-dev/pandas/issues/28150
     arr = NumpyExtensionArray(np.array([1, 2, 3]))
-    with pytest.raises(ValueError, match="int"):
+    with pytest.raises(TypeError):
         arr[0] = "a"
 
     # With a value that we do coerce, check that we coerce the value
     #  and not the underlying array.
-    arr[0] = 2.5
+    with pytest.raises(TypeError):
+        arr[0] = 2.5
+
+    arr[0] = 9
     assert isinstance(arr[0], (int, np.integer)), type(arr[0])
 
 
@@ -354,7 +366,10 @@ def test_setitem_preserves_views():
     assert view2[0] == 9
     assert view3[0] == 9
 
-    arr[-1] = 2.5
+    with pytest.raises(TypeError):
+        arr[-1] = 2.5
+
+    arr[-1] = 4
     view1[-1] = 5
     assert arr[-1] == 5
 
diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 import pandas._testing as tm
 from pandas.tests.base.common import allow_na_ops
@@ -100,12 +98,11 @@ def test_nunique_null(null_obj, index_or_series_obj):
 
 
 @pytest.mark.single_cpu
-@pytest.mark.xfail(using_string_dtype(), reason="decoding fails")
 def test_unique_bad_unicode(index_or_series):
     # regression test for #34550
     uval = "\ud83d"  # smiley emoji
 
-    obj = index_or_series([uval] * 2)
+    obj = index_or_series([uval] * 2, dtype=object)
     result = obj.unique()
 
     if isinstance(obj, pd.Index):
diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py
@@ -44,7 +44,7 @@ def test_constructor_single_row(self):
         )
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.skipif(using_string_dtype(), reason="columns inferring logic broken")
+    @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken")
     def test_constructor_list_of_series(self):
         data = [
             OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
@@ -108,6 +108,7 @@ def test_constructor_list_of_series(self):
         expected = DataFrame.from_dict(sdict, orient="index")
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken")
     def test_constructor_orient(self, float_string_frame):
         data_dict = float_string_frame.T._series
         recons = DataFrame.from_dict(data_dict, orient="index")
diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py
@@ -57,9 +57,7 @@ def test_from_records_with_datetimes(self):
         expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]")
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.skipif(
-        using_string_dtype(), reason="dtype checking logic doesn't work"
-    )
+    @pytest.mark.xfail(using_string_dtype(), reason="dtype checking logic doesn't work")
     def test_from_records_sequencelike(self):
         df = DataFrame(
             {
diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py
@@ -65,6 +65,7 @@ def test_fillna_datetime(self, datetime_frame):
         with pytest.raises(TypeError, match=msg):
             datetime_frame.fillna()
 
+    # TODO(infer_string) test as actual error instead of xfail
     @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string")
     def test_fillna_mixed_type(self, float_string_frame):
         mf = float_string_frame
@@ -537,6 +538,7 @@ def test_fillna_col_reordering(self):
         filled = df.ffill()
         assert df.columns.tolist() == filled.columns.tolist()
 
+    # TODO(infer_string) test as actual error instead of xfail
     @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string")
     def test_fill_corner(self, float_frame, float_string_frame):
         mf = float_string_frame
diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py
@@ -15,6 +15,7 @@
 from pandas import (
     CategoricalIndex,
     DataFrame,
+    Index,
     MultiIndex,
     Series,
     date_range,
@@ -360,7 +361,7 @@ def test_info_memory_usage():
     df = DataFrame(data)
     df.columns = dtypes
 
-    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
+    df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object))
     df_with_object_index.info(buf=buf, memory_usage=True)
     res = buf.getvalue().splitlines()
     assert re.match(r"memory usage: [^+]+\+", res[-1])
@@ -398,25 +399,25 @@ def test_info_memory_usage():
 
 @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
 def test_info_memory_usage_deep_not_pypy():
-    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
+    df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object))
     assert (
         df_with_object_index.memory_usage(index=True, deep=True).sum()
         > df_with_object_index.memory_usage(index=True).sum()
     )
 
-    df_object = DataFrame({"a": ["a"]})
+    df_object = DataFrame({"a": Series(["a"], dtype=object)})
     assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()
 
 
 @pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result")
 def test_info_memory_usage_deep_pypy():
-    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
+    df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object))
     assert (
         df_with_object_index.memory_usage(index=True, deep=True).sum()
         == df_with_object_index.memory_usage(index=True).sum()
     )
 
-    df_object = DataFrame({"a": ["a"]})
+    df_object = DataFrame({"a": Series(["a"], dtype=object)})
     assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()
 
 
diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py
@@ -64,6 +64,7 @@ def test_interpolate_inplace(self, frame_or_series, request):
         assert np.shares_memory(orig, obj.values)
         assert orig.squeeze()[1] == 1.5
 
+    # TODO(infer_string) raise proper TypeError in case of string dtype
     @pytest.mark.xfail(
         using_string_dtype(), reason="interpolate doesn't work for string"
     )
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
@@ -11,8 +11,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -251,7 +249,6 @@ def test_timestamp_compare(self, left, right):
             with pytest.raises(TypeError, match=msg):
                 right_f(pd.Timestamp("nat"), df)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't compare string and int")
     def test_mixed_comparison(self):
         # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False,
         # not raise TypeError
diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     DatetimeIndex,
@@ -42,12 +40,11 @@ def test_repr_missing(self, constructor, expected, using_infer_string, request):
         result = repr(obj)
         assert result == expected
 
-    @pytest.mark.xfail(using_string_dtype(), reason="repr different")
     def test_repr_floats(self):
         # GH 32553
 
         markers = Series(
-            ["foo", "bar"],
+            [1, 2],
             index=IntervalIndex(
                 [
                     Interval(left, right)
@@ -59,7 +56,7 @@ def test_repr_floats(self):
             ),
         )
         result = str(markers)
-        expected = "(329.973, 345.137]    foo\n(345.137, 360.191]    bar\ndtype: object"
+        expected = "(329.973, 345.137]    1\n(345.137, 360.191]    2\ndtype: int64"
         assert result == expected
 
     @pytest.mark.parametrize(
diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py
@@ -9,8 +9,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import (
     IS64,
     is_platform_windows,
@@ -825,8 +823,6 @@ def replacer(self, how, from_key, to_key):
             raise ValueError
         return replacer
 
-    # Expected needs adjustment for the infer string option, seems to work as expecetd
-    @pytest.mark.skipif(using_string_dtype(), reason="TODO: test is to complex")
     def test_replace_series(self, how, to_key, from_key, replacer):
         index = pd.Index([3, 4], name="xxx")
         obj = pd.Series(self.rep[from_key], index=index, name="yyy")
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
@@ -8,8 +8,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import IndexingError
 
 from pandas.core.dtypes.common import (
@@ -426,7 +424,6 @@ def test_set_index_nan(self):
         )
         tm.assert_frame_equal(result, df)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't multiply arrow strings")
     def test_multi_assign(self):
         # GH 3626, an assignment of a sub-df to a df
         # set float64 to avoid upcast when setting nan
@@ -652,7 +649,6 @@ def test_loc_setitem_fullindex_views(self):
         df.loc[df.index] = df.loc[df.index]
         tm.assert_frame_equal(df, df2)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string")
     def test_rhs_alignment(self):
         # GH8258, tests that both rows & columns are aligned to what is
         # assigned to. covers both uniform data-type & multi-type cases
diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py
diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py

Original file line number	Diff line number	Diff line change
`@@ -64,6 +64,7 @@ def test_interpolate_inplace(self, frame_or_series, request):`
`64`	`64`	`assert np.shares_memory(orig, obj.values)`
`65`	`65`	`assert orig.squeeze()[1] == 1.5`
`66`	`66`
	`67`	`+ # TODO(infer_string) raise proper TypeError in case of string dtype`
`67`	`68`	`@pytest.mark.xfail(`
`68`	`69`	`using_string_dtype(), reason="interpolate doesn't work for string"`
`69`	`70`	`)`