
Commit e7a5d08

Merge remote-tracking branch 'upstream/main' into np_datetime-ps
2 parents: 4fbd28b + 6a7685f

14 files changed: +71 −61 lines changed

pandas/core/interchange/from_dataframe.py

Lines changed: 8 additions & 4 deletions

@@ -9,6 +9,8 @@
 
 import numpy as np
 
+from pandas._config import using_string_dtype
+
 from pandas.compat._optional import import_optional_dependency
 
 import pandas as pd
@@ -147,8 +149,6 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
     -------
     pd.DataFrame
     """
-    # We need a dict of columns here, with each column being a NumPy array (at
-    # least for now, deal with non-NumPy dtypes later).
     columns: dict[str, Any] = {}
     buffers = []  # hold on to buffers, keeps memory alive
     for name in df.column_names():
@@ -347,8 +347,12 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
         # Add to our list of strings
         str_list[i] = string
 
-    # Convert the string list to a NumPy array
-    return np.asarray(str_list, dtype="object"), buffers
+    if using_string_dtype():
+        res = pd.Series(str_list, dtype="str")
+    else:
+        res = np.asarray(str_list, dtype="object")  # type: ignore[assignment]
+
+    return res, buffers  # type: ignore[return-value]
 
 
 def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray:
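
With this change, interchange string columns follow the global string dtype option: when using_string_dtype() is true, string_column_to_ndarray hands back a "str"-dtyped Series instead of an object ndarray. A minimal usage sketch of the observable effect, assuming a pandas build where the future.infer_string option is available (the option name is an assumption, not part of this diff):

    import pandas as pd

    # Assumption: future.infer_string toggles the behaviour gated by
    # using_string_dtype() in the diff above.
    pd.set_option("future.infer_string", True)

    df = pd.DataFrame({"a": ["x", "y", None]})
    # Round-trip through the interchange protocol; the string column now
    # comes back with the "str" dtype rather than object.
    result = pd.api.interchange.from_dataframe(df.__dataframe__())
    print(result.dtypes)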

pandas/tests/base/test_conversion.py

Lines changed: 2 additions & 7 deletions

@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import HAS_PYARROW
 from pandas.compat.numpy import np_version_gt2
 
@@ -392,9 +390,6 @@ def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array):
     assert np.may_share_memory(result_nocopy1, result_nocopy2)
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
 @pytest.mark.parametrize("as_series", [True, False])
 @pytest.mark.parametrize(
     "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)]
@@ -406,13 +401,13 @@ def test_to_numpy_copy(arr, as_series, using_infer_string):
 
     # no copy by default
     result = obj.to_numpy()
-    if using_infer_string and arr.dtype == object:
+    if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow":
         assert np.shares_memory(arr, result) is False
     else:
         assert np.shares_memory(arr, result) is True
 
     result = obj.to_numpy(copy=False)
-    if using_infer_string and arr.dtype == object:
+    if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow":
         assert np.shares_memory(arr, result) is False
     else:
         assert np.shares_memory(arr, result) is True
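
The extra obj.dtype.storage == "pyarrow" condition reflects that only the pyarrow-backed string dtype has to copy data out of Arrow buffers when converting back to NumPy; the python-backed storage can keep reusing the original object ndarray. A hedged sketch of that check outside the test harness:

    import numpy as np
    import pandas as pd

    arr = np.array(["a", "b", "c"], dtype=object)
    ser = pd.Series(arr)  # resulting dtype depends on the infer_string setting

    if isinstance(ser.dtype, pd.StringDtype) and ser.dtype.storage == "pyarrow":
        # values were copied into Arrow buffers, so no memory is shared
        assert not np.shares_memory(arr, ser.to_numpy())
    else:
        # object dtype or python-backed string storage may reuse the array
        print(np.shares_memory(arr, ser.to_numpy()))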

pandas/tests/indexes/multi/test_setops.py

Lines changed: 1 addition & 4 deletions

@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     CategoricalIndex,
@@ -754,13 +752,12 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype):
     tm.assert_index_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_union_with_na_when_constructing_dataframe():
     # GH43222
     series1 = Series(
         (1,),
         index=MultiIndex.from_arrays(
-            [Series([None], dtype="string"), Series([None], dtype="string")]
+            [Series([None], dtype="str"), Series([None], dtype="str")]
         ),
     )
     series2 = Series((10, 20), index=MultiIndex.from_tuples(((None, None), ("a", "b"))))
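
The test now builds the index with dtype="str" rather than dtype="string"; in recent pandas these are distinct dtypes, which is why the change matters for the expected result. A small illustrative sketch, assuming a pandas version that accepts the "str" alias:

    import pandas as pd

    # "string" is the NA-backed nullable StringDtype; "str" requests the
    # NaN-backed string dtype that the infer_string option makes the default.
    nullable = pd.Series([None], dtype="string")
    nan_backed = pd.Series([None], dtype="str")
    print(nullable.dtype, nan_backed.dtype)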

pandas/tests/indexes/test_base.py

Lines changed: 1 addition & 11 deletions

@@ -8,12 +8,7 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
-from pandas.compat import (
-    HAS_PYARROW,
-    IS64,
-)
+from pandas.compat import IS64
 from pandas.errors import InvalidIndexError
 import pandas.util._test_decorators as td
 
@@ -823,11 +818,6 @@ def test_isin(self, values, index, expected):
         expected = np.array(expected, dtype=bool)
         tm.assert_numpy_array_equal(result, expected)
 
-    @pytest.mark.xfail(
-        using_string_dtype() and not HAS_PYARROW,
-        reason="TODO(infer_string)",
-        strict=False,
-    )
     def test_isin_nan_common_object(
         self, nulls_fixture, nulls_fixture2, using_infer_string
     ):

pandas/tests/interchange/test_impl.py

Lines changed: 2 additions & 7 deletions

@@ -6,8 +6,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import iNaT
 from pandas.compat import (
     is_ci_environment,
@@ -401,7 +399,6 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
     pd.api.interchange.from_dataframe(df)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_empty_string_column():
     # https://github.com/pandas-dev/pandas/issues/56703
     df = pd.DataFrame({"a": []}, dtype=str)
@@ -410,13 +407,12 @@ def test_empty_string_column():
     tm.assert_frame_equal(df, result)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_large_string():
     # GH#56702
     pytest.importorskip("pyarrow")
     df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
     result = pd.api.interchange.from_dataframe(df.__dataframe__())
-    expected = pd.DataFrame({"a": ["x"]}, dtype="object")
+    expected = pd.DataFrame({"a": ["x"]}, dtype="str")
     tm.assert_frame_equal(result, expected)
 
 
@@ -427,7 +423,6 @@ def test_non_str_names():
     assert names == ["0"]
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_non_str_names_w_duplicates():
     # https://github.com/pandas-dev/pandas/issues/56701
     df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]})
@@ -438,7 +433,7 @@ def test_non_str_names_w_duplicates():
             "Expected a Series, got a DataFrame. This likely happened because you "
             "called __dataframe__ on a DataFrame which, after converting column "
             r"names to string, resulted in duplicated names: Index\(\['0', '0'\], "
-            r"dtype='object'\). Please rename these columns before using the "
+            r"dtype='(str|object)'\). Please rename these columns before using the "
             "interchange protocol."
         ),
     ):
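
The match pattern is loosened so the error message is accepted whether the duplicated-name Index reprs with object or str dtype. A quick standalone check of the regex (the sample messages are illustrative, not taken from pandas output):

    import re

    msg = r"dtype='(str|object)'\). Please rename these columns"
    # Both dtype reprs satisfy the relaxed pattern.
    assert re.search(msg, "Index(['0', '0'], dtype='str'). Please rename these columns")
    assert re.search(msg, "Index(['0', '0'], dtype='object'). Please rename these columns")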

pandas/tests/io/excel/test_readers.py

Lines changed: 0 additions & 3 deletions

@@ -17,8 +17,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -625,7 +623,6 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
         expected = DataFrame(expected)
         tm.assert_frame_equal(actual, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_dtype_backend(self, read_ext, dtype_backend, engine, tmp_excel):
         # GH#36712
         if read_ext in (".xlsb", ".xls"):

pandas/tests/io/excel/test_writers.py

Lines changed: 1 addition & 4 deletions

@@ -13,8 +13,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat._optional import import_optional_dependency
 import pandas.util._test_decorators as td
 
@@ -1387,12 +1385,11 @@ def test_freeze_panes(self, tmp_excel):
         result = pd.read_excel(tmp_excel, index_col=0)
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_path_path_lib(self, engine, ext):
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
             columns=Index(list("ABCD")),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         writer = partial(df.to_excel, engine=engine)
 

pandas/tests/io/test_fsspec.py

Lines changed: 5 additions & 1 deletion

@@ -5,6 +5,8 @@
 
 from pandas._config import using_string_dtype
 
+from pandas.compat import HAS_PYARROW
+
 from pandas import (
     DataFrame,
     date_range,
@@ -176,7 +178,9 @@ def test_excel_options(fsspectest):
     assert fsspectest.test[0] == "read"
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
+@pytest.mark.xfail(
+    using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string) fastparquet"
+)
 def test_to_parquet_new_file(cleared_fs, df1):
     """Regression test for writing to a not-yet-existent GCS Parquet file."""
     pytest.importorskip("fastparquet")

pandas/tests/io/test_gcs.py

Lines changed: 0 additions & 3 deletions

@@ -7,8 +7,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat.pyarrow import pa_version_under17p0
 
 from pandas import (
@@ -207,7 +205,6 @@ def test_to_csv_compression_encoding_gcs(
     tm.assert_frame_equal(df, read_df)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
 def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
     """Regression test for writing to a not-yet-existent GCS Parquet file."""
     pytest.importorskip("fastparquet")

pandas/tests/io/test_parquet.py

Lines changed: 34 additions & 11 deletions

@@ -1174,9 +1174,17 @@ def test_non_nanosecond_timestamps(self, temp_file):
 
 
 class TestParquetFastParquet(Base):
-    @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values")
-    def test_basic(self, fp, df_full):
+    def test_basic(self, fp, df_full, request):
         pytz = pytest.importorskip("pytz")
+        import fastparquet
+
+        if Version(fastparquet.__version__) < Version("2024.11.0"):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=("datetime_with_nat gets incorrect values"),
+                )
+            )
+
         tz = pytz.timezone("US/Eastern")
         df = df_full
 
@@ -1213,11 +1221,17 @@ def test_duplicate_columns(self, fp):
         msg = "Cannot create parquet dataset with duplicate column names"
         self.check_error_on_write(df, fp, ValueError, msg)
 
-    @pytest.mark.xfail(
-        Version(np.__version__) >= Version("2.0.0"),
-        reason="fastparquet uses np.float_ in numpy2",
-    )
-    def test_bool_with_none(self, fp):
+    def test_bool_with_none(self, fp, request):
+        import fastparquet
+
+        if Version(fastparquet.__version__) < Version("2024.11.0") and Version(
+            np.__version__
+        ) >= Version("2.0.0"):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=("fastparquet uses np.float_ in numpy2"),
+                )
+            )
         df = pd.DataFrame({"a": [True, None, False]})
         expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
         # Fastparquet bug in 0.7.1 makes it so that this dtype becomes
@@ -1331,10 +1345,19 @@ def test_empty_dataframe(self, fp):
         expected = df.copy()
         check_round_trip(df, fp, expected=expected)
 
-    @pytest.mark.xfail(
-        reason="fastparquet bug, see https://github.com/dask/fastparquet/issues/929"
-    )
-    def test_timezone_aware_index(self, fp, timezone_aware_date_list):
+    def test_timezone_aware_index(self, fp, timezone_aware_date_list, request):
+        import fastparquet
+
+        if Version(fastparquet.__version__) < Version("2024.11.0"):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=(
+                        "fastparquet bug, see "
+                        "https://github.com/dask/fastparquet/issues/929"
+                    ),
+                )
+            )
+
         idx = 5 * [timezone_aware_date_list]
 
         df = pd.DataFrame(index=idx, data={"index_as_col": idx})
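
The static xfail decorators are replaced with runtime markers, so these tests only expect failure on fastparquet releases that still carry the bugs; on 2024.11.0 or newer they must pass. The pattern, reduced to a hedged standalone sketch (the test name and reason text are placeholders, not taken from pandas):

    import pytest
    from packaging.version import Version

    def test_fastparquet_roundtrip(request):
        # Apply the xfail marker only when the installed fastparquet is
        # older than the release assumed to fix the behaviour.
        fastparquet = pytest.importorskip("fastparquet")
        if Version(fastparquet.__version__) < Version("2024.11.0"):
            request.applymarker(
                pytest.mark.xfail(reason="bug fixed in fastparquet 2024.11.0")
            )
        ...  # the actual round-trip assertions go here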
