
Commit e7a5d08

Merge remote-tracking branch 'upstream/main' into np_datetime-ps
2 parents: 4fbd28b + 6a7685f

14 files changed: +71 −61 lines changed

pandas/core/interchange/from_dataframe.py

Lines changed: 8 additions & 4 deletions

@@ -9,6 +9,8 @@
 
 import numpy as np
 
+from pandas._config import using_string_dtype
+
 from pandas.compat._optional import import_optional_dependency
 
 import pandas as pd
@@ -147,8 +149,6 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
     -------
     pd.DataFrame
     """
-    # We need a dict of columns here, with each column being a NumPy array (at
-    # least for now, deal with non-NumPy dtypes later).
     columns: dict[str, Any] = {}
     buffers = []  # hold on to buffers, keeps memory alive
     for name in df.column_names():
@@ -347,8 +347,12 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
         # Add to our list of strings
         str_list[i] = string
 
-    # Convert the string list to a NumPy array
-    return np.asarray(str_list, dtype="object"), buffers
+    if using_string_dtype():
+        res = pd.Series(str_list, dtype="str")
+    else:
+        res = np.asarray(str_list, dtype="object")  # type: ignore[assignment]
+
+    return res, buffers  # type: ignore[return-value]
 
 
 def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray:
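
With this change, interchange string columns follow the global string dtype option: when using_string_dtype() is true, string_column_to_ndarray hands back a "str"-dtyped Series instead of an object ndarray. A minimal usage sketch of the observable effect, assuming a pandas build where the future.infer_string option is available (the option name is an assumption, not part of this diff):

    import pandas as pd

    # Assumption: future.infer_string toggles the behaviour gated by
    # using_string_dtype() in the diff above.
    pd.set_option("future.infer_string", True)

    df = pd.DataFrame({"a": ["x", "y", None]})
    # Round-trip through the interchange protocol; the string column now
    # comes back with the "str" dtype rather than object.
    result = pd.api.interchange.from_dataframe(df.__dataframe__())
    print(result.dtypes)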

pandas/tests/base/test_conversion.py

Lines changed: 2 additions & 7 deletions

@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import HAS_PYARROW
 from pandas.compat.numpy import np_version_gt2
 
@@ -392,9 +390,6 @@ def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array):
     assert np.may_share_memory(result_nocopy1, result_nocopy2)
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
 @pytest.mark.parametrize("as_series", [True, False])
 @pytest.mark.parametrize(
     "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)]
@@ -406,13 +401,13 @@ def test_to_numpy_copy(arr, as_series, using_infer_string):
 
     # no copy by default
     result = obj.to_numpy()
-    if using_infer_string and arr.dtype == object:
+    if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow":
         assert np.shares_memory(arr, result) is False
     else:
         assert np.shares_memory(arr, result) is True
 
     result = obj.to_numpy(copy=False)
-    if using_infer_string and arr.dtype == object:
+    if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow":
         assert np.shares_memory(arr, result) is False
     else:
         assert np.shares_memory(arr, result) is True
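
The extra obj.dtype.storage == "pyarrow" condition reflects that only the pyarrow-backed string dtype has to copy data out of Arrow buffers when converting back to NumPy; the python-backed storage can keep reusing the original object ndarray. A hedged sketch of that check outside the test harness:

    import numpy as np
    import pandas as pd

    arr = np.array(["a", "b", "c"], dtype=object)
    ser = pd.Series(arr)  # resulting dtype depends on the infer_string setting

    if isinstance(ser.dtype, pd.StringDtype) and ser.dtype.storage == "pyarrow":
        # values were copied into Arrow buffers, so no memory is shared
        assert not np.shares_memory(arr, ser.to_numpy())
    else:
        # object dtype or python-backed string storage may reuse the array
        print(np.shares_memory(arr, ser.to_numpy()))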

pandas/tests/indexes/multi/test_setops.py

Lines changed: 1 addition & 4 deletions

@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     CategoricalIndex,
@@ -754,13 +752,12 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype):
     tm.assert_index_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_union_with_na_when_constructing_dataframe():
     # GH43222
     series1 = Series(
         (1,),
         index=MultiIndex.from_arrays(
-            [Series([None], dtype="string"), Series([None], dtype="string")]
+            [Series([None], dtype="str"), Series([None], dtype="str")]
         ),
     )
     series2 = Series((10, 20), index=MultiIndex.from_tuples(((None, None), ("a", "b"))))
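
The test now builds the index with dtype="str" rather than dtype="string"; in recent pandas these are distinct dtypes, which is why the change matters for the expected result. A small illustrative sketch, assuming a pandas version that accepts the "str" alias:

    import pandas as pd

    # "string" is the NA-backed nullable StringDtype; "str" requests the
    # NaN-backed string dtype that the infer_string option makes the default.
    nullable = pd.Series([None], dtype="string")
    nan_backed = pd.Series([None], dtype="str")
    print(nullable.dtype, nan_backed.dtype)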

pandas/tests/indexes/test_base.py

Lines changed: 1 addition & 11 deletions

@@ -8,12 +8,7 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
-from pandas.compat import (
-    HAS_PYARROW,
-    IS64,
-)
+from pandas.compat import IS64
 from pandas.errors import InvalidIndexError
 import pandas.util._test_decorators as td
 
@@ -823,11 +818,6 @@ def test_isin(self, values, index, expected):
         expected = np.array(expected, dtype=bool)
         tm.assert_numpy_array_equal(result, expected)
 
-    @pytest.mark.xfail(
-        using_string_dtype() and not HAS_PYARROW,
-        reason="TODO(infer_string)",
-        strict=False,
-    )
     def test_isin_nan_common_object(
         self, nulls_fixture, nulls_fixture2, using_infer_string
     ):

pandas/tests/interchange/test_impl.py

Lines changed: 2 additions & 7 deletions

@@ -6,8 +6,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import iNaT
 from pandas.compat import (
     is_ci_environment,
@@ -401,7 +399,6 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
     pd.api.interchange.from_dataframe(df)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_empty_string_column():
     # https://github.com/pandas-dev/pandas/issues/56703
     df = pd.DataFrame({"a": []}, dtype=str)
@@ -410,13 +407,12 @@ def test_empty_string_column():
     tm.assert_frame_equal(df, result)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_large_string():
     # GH#56702
     pytest.importorskip("pyarrow")
     df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
     result = pd.api.interchange.from_dataframe(df.__dataframe__())
-    expected = pd.DataFrame({"a": ["x"]}, dtype="object")
+    expected = pd.DataFrame({"a": ["x"]}, dtype="str")
     tm.assert_frame_equal(result, expected)
 
 
@@ -427,7 +423,6 @@ def test_non_str_names():
     assert names == ["0"]
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_non_str_names_w_duplicates():
     # https://github.com/pandas-dev/pandas/issues/56701
     df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]})
@@ -438,7 +433,7 @@ def test_non_str_names_w_duplicates():
             "Expected a Series, got a DataFrame. This likely happened because you "
             "called __dataframe__ on a DataFrame which, after converting column "
             r"names to string, resulted in duplicated names: Index\(\['0', '0'\], "
-            r"dtype='object'\). Please rename these columns before using the "
+            r"dtype='(str|object)'\). Please rename these columns before using the "
             "interchange protocol."
         ),
     ):
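
The match pattern is loosened so the error message is accepted whether the duplicated-name Index reprs with object or str dtype. A quick standalone check of the regex (the sample messages are illustrative, not taken from pandas output):

    import re

    msg = r"dtype='(str|object)'\). Please rename these columns"
    # Both dtype reprs satisfy the relaxed pattern.
    assert re.search(msg, "Index(['0', '0'], dtype='str'). Please rename these columns")
    assert re.search(msg, "Index(['0', '0'], dtype='object'). Please rename these columns")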

pandas/tests/io/excel/test_readers.py

Lines changed: 0 additions & 3 deletions

@@ -17,8 +17,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -625,7 +623,6 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
         expected = DataFrame(expected)
         tm.assert_frame_equal(actual, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_dtype_backend(self, read_ext, dtype_backend, engine, tmp_excel):
         # GH#36712
         if read_ext in (".xlsb", ".xls"):

pandas/tests/io/excel/test_writers.py

Lines changed: 1 addition & 4 deletions

@@ -13,8 +13,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat._optional import import_optional_dependency
 import pandas.util._test_decorators as td
 
@@ -1387,12 +1385,11 @@ def test_freeze_panes(self, tmp_excel):
         result = pd.read_excel(tmp_excel, index_col=0)
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_path_path_lib(self, engine, ext):
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
             columns=Index(list("ABCD")),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         writer = partial(df.to_excel, engine=engine)
 

pandas/tests/io/test_fsspec.py

Lines changed: 5 additions & 1 deletion

@@ -5,6 +5,8 @@
 
 from pandas._config import using_string_dtype
 
+from pandas.compat import HAS_PYARROW
+
 from pandas import (
     DataFrame,
     date_range,
@@ -176,7 +178,9 @@ def test_excel_options(fsspectest):
     assert fsspectest.test[0] == "read"
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
+@pytest.mark.xfail(
+    using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string) fastparquet"
+)
 def test_to_parquet_new_file(cleared_fs, df1):
     """Regression test for writing to a not-yet-existent GCS Parquet file."""
     pytest.importorskip("fastparquet")

pandas/tests/io/test_gcs.py

Lines changed: 0 additions & 3 deletions

@@ -7,8 +7,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat.pyarrow import pa_version_under17p0
 
 from pandas import (
@@ -207,7 +205,6 @@ def test_to_csv_compression_encoding_gcs(
     tm.assert_frame_equal(df, read_df)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
 def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
     """Regression test for writing to a not-yet-existent GCS Parquet file."""
     pytest.importorskip("fastparquet")

pandas/tests/io/test_parquet.py

Lines changed: 34 additions & 11 deletions

@@ -1174,9 +1174,17 @@ def test_non_nanosecond_timestamps(self, temp_file):
 
 
 class TestParquetFastParquet(Base):
-    @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values")
-    def test_basic(self, fp, df_full):
+    def test_basic(self, fp, df_full, request):
         pytz = pytest.importorskip("pytz")
+        import fastparquet
+
+        if Version(fastparquet.__version__) < Version("2024.11.0"):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=("datetime_with_nat gets incorrect values"),
+                )
+            )
+
         tz = pytz.timezone("US/Eastern")
         df = df_full
 
@@ -1213,11 +1221,17 @@ def test_duplicate_columns(self, fp):
         msg = "Cannot create parquet dataset with duplicate column names"
         self.check_error_on_write(df, fp, ValueError, msg)
 
-    @pytest.mark.xfail(
-        Version(np.__version__) >= Version("2.0.0"),
-        reason="fastparquet uses np.float_ in numpy2",
-    )
-    def test_bool_with_none(self, fp):
+    def test_bool_with_none(self, fp, request):
+        import fastparquet
+
+        if Version(fastparquet.__version__) < Version("2024.11.0") and Version(
+            np.__version__
+        ) >= Version("2.0.0"):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=("fastparquet uses np.float_ in numpy2"),
+                )
+            )
         df = pd.DataFrame({"a": [True, None, False]})
         expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
         # Fastparquet bug in 0.7.1 makes it so that this dtype becomes
@@ -1331,10 +1345,19 @@ def test_empty_dataframe(self, fp):
         expected = df.copy()
         check_round_trip(df, fp, expected=expected)
 
-    @pytest.mark.xfail(
-        reason="fastparquet bug, see https://github.com/dask/fastparquet/issues/929"
-    )
-    def test_timezone_aware_index(self, fp, timezone_aware_date_list):
+    def test_timezone_aware_index(self, fp, timezone_aware_date_list, request):
+        import fastparquet
+
+        if Version(fastparquet.__version__) < Version("2024.11.0"):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=(
+                        "fastparquet bug, see "
+                        "https://github.com/dask/fastparquet/issues/929"
+                    ),
+                )
+            )
+
         idx = 5 * [timezone_aware_date_list]
 
         df = pd.DataFrame(index=idx, data={"index_as_col": idx})
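
The static xfail decorators are replaced with runtime markers, so these tests only expect failure on fastparquet releases that still carry the bugs; on 2024.11.0 or newer they must pass. The pattern, reduced to a hedged standalone sketch (the test name and reason text are placeholders, not taken from pandas):

    import pytest
    from packaging.version import Version

    def test_fastparquet_roundtrip(request):
        # Apply the xfail marker only when the installed fastparquet is
        # older than the release assumed to fix the behaviour.
        fastparquet = pytest.importorskip("fastparquet")
        if Version(fastparquet.__version__) < Version("2024.11.0"):
            request.applymarker(
                pytest.mark.xfail(reason="bug fixed in fastparquet 2024.11.0")
            )
        ...  # the actual round-trip assertions go here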
