Skip to content

Commit 30cbd6a

Browse files
committed
TST(string dtype): Resolve pytable xfails
1 parent 8fe2720 commit 30cbd6a

12 files changed

+339
-184
lines changed

pandas/io/pytables.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3382,6 +3382,8 @@ def read(
33823382
if (
33833383
using_string_dtype()
33843384
and isinstance(values, np.ndarray)
3385+
# TODO: Should is_string_array return True for an empty object ndarray?
3386+
and values.size != 0
33853387
and is_string_array(values, skipna=True)
33863388
):
33873389
df = df.astype(StringDtype(na_value=np.nan))
@@ -4148,6 +4150,9 @@ def _create_axes(
41484150
meta = "category"
41494151
metadata = np.asarray(data_converted.categories).ravel()
41504152

4153+
if isinstance(blk.dtype, StringDtype):
4154+
meta = str(blk.dtype)
4155+
41514156
data, dtype_name = _get_data_and_dtype_name(data_converted)
41524157

41534158
col = klass(
@@ -4407,7 +4412,8 @@ def read_column(
44074412
errors=self.errors,
44084413
)
44094414
cvs = col_values[1]
4410-
return Series(cvs, name=column, copy=False)
4415+
dtype = getattr(self.table.attrs, f"{column}_meta")
4416+
return Series(cvs, name=column, copy=False, dtype=dtype)
44114417

44124418
raise KeyError(f"column [{column}] not found in the table")
44134419

@@ -4523,7 +4529,6 @@ def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
45234529
mask = isna(a.data).all(axis=0)
45244530
if isinstance(mask, np.ndarray):
45254531
masks.append(mask.astype("u1", copy=False))
4526-
45274532
# consolidate masks
45284533
if len(masks):
45294534
mask = masks[0]
@@ -5112,6 +5117,8 @@ def _maybe_convert_for_string_atom(
51125117
errors,
51135118
columns: list[str],
51145119
):
5120+
if isinstance(bvalues.dtype, StringDtype):
5121+
bvalues = bvalues.to_numpy()
51155122
if bvalues.dtype != object:
51165123
return bvalues
51175124

pandas/tests/io/pytables/test_append.py

Lines changed: 58 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
import numpy as np
66
import pytest
77

8-
from pandas._config import using_string_dtype
9-
108
from pandas._libs.tslibs import Timestamp
119
from pandas.compat import PY312
1210

@@ -27,7 +25,6 @@
2725

2826
pytestmark = [
2927
pytest.mark.single_cpu,
30-
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
3128
]
3229

3330
tables = pytest.importorskip("tables")
@@ -40,7 +37,7 @@ def test_append(setup_path):
4037
# tables.NaturalNameWarning):
4138
df = DataFrame(
4239
np.random.default_rng(2).standard_normal((20, 4)),
43-
columns=Index(list("ABCD"), dtype=object),
40+
columns=Index(list("ABCD")),
4441
index=date_range("2000-01-01", periods=20, freq="B"),
4542
)
4643
_maybe_remove(store, "df1")
@@ -203,7 +200,7 @@ def test_append_some_nans(setup_path):
203200
tm.assert_frame_equal(store["df3"], df3, check_index_type=True)
204201

205202

206-
def test_append_all_nans(setup_path):
203+
def test_append_all_nans(setup_path, using_infer_string):
207204
with ensure_clean_store(setup_path) as store:
208205
df = DataFrame(
209206
{
@@ -255,7 +252,13 @@ def test_append_all_nans(setup_path):
255252
_maybe_remove(store, "df")
256253
store.append("df", df[:10], dropna=True)
257254
store.append("df", df[10:], dropna=True)
258-
tm.assert_frame_equal(store["df"], df, check_index_type=True)
255+
result = store["df"]
256+
expected = df
257+
if using_infer_string:
258+
# TODO: Test is incorrect when not using_infer_string.
259+
# Should take the last 4 rows unconditionally.
260+
expected = expected[16:]
261+
tm.assert_frame_equal(result, expected, check_index_type=True)
259262

260263
_maybe_remove(store, "df2")
261264
store.append("df2", df[:10], dropna=False)
@@ -294,7 +297,7 @@ def test_append_frame_column_oriented(setup_path, request):
294297
# column oriented
295298
df = DataFrame(
296299
np.random.default_rng(2).standard_normal((10, 4)),
297-
columns=Index(list("ABCD"), dtype=object),
300+
columns=Index(list("ABCD")),
298301
index=date_range("2000-01-01", periods=10, freq="B"),
299302
)
300303
df.index = df.index._with_freq(None) # freq doesn't round-trip
@@ -426,7 +429,7 @@ def check_col(key, name, size):
426429
{
427430
"A": [0.0, 1.0, 2.0, 3.0, 4.0],
428431
"B": [0.0, 1.0, 0.0, 1.0, 0.0],
429-
"C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
432+
"C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
430433
"D": date_range("20130101", periods=5),
431434
}
432435
).set_index("C")
@@ -453,7 +456,7 @@ def check_col(key, name, size):
453456
_maybe_remove(store, "df")
454457
df = DataFrame(
455458
np.random.default_rng(2).standard_normal((10, 4)),
456-
columns=Index(list("ABCD"), dtype=object),
459+
columns=Index(list("ABCD")),
457460
index=date_range("2000-01-01", periods=10, freq="B"),
458461
)
459462
df["string"] = "foo"
@@ -517,7 +520,7 @@ def test_append_with_data_columns(setup_path):
517520
with ensure_clean_store(setup_path) as store:
518521
df = DataFrame(
519522
np.random.default_rng(2).standard_normal((10, 4)),
520-
columns=Index(list("ABCD"), dtype=object),
523+
columns=Index(list("ABCD")),
521524
index=date_range("2000-01-01", periods=10, freq="B"),
522525
)
523526
df.iloc[0, df.columns.get_loc("B")] = 1.0
@@ -693,8 +696,12 @@ def test_append_misc(setup_path):
693696
with ensure_clean_store(setup_path) as store:
694697
df = DataFrame(
695698
1.1 * np.arange(120).reshape((30, 4)),
696-
columns=Index(list("ABCD"), dtype=object),
697-
index=Index([f"i-{i}" for i in range(30)], dtype=object),
699+
columns=Index(
700+
list("ABCD"),
701+
),
702+
index=Index(
703+
[f"i-{i}" for i in range(30)],
704+
),
698705
)
699706
store.append("df", df, chunksize=1)
700707
result = store.select("df")
@@ -710,8 +717,12 @@ def test_append_misc_chunksize(setup_path, chunksize):
710717
# more chunksize in append tests
711718
df = DataFrame(
712719
1.1 * np.arange(120).reshape((30, 4)),
713-
columns=Index(list("ABCD"), dtype=object),
714-
index=Index([f"i-{i}" for i in range(30)], dtype=object),
720+
columns=Index(
721+
list("ABCD"),
722+
),
723+
index=Index(
724+
[f"i-{i}" for i in range(30)],
725+
),
715726
)
716727
df["string"] = "foo"
717728
df["float322"] = 1.0
@@ -747,15 +758,19 @@ def test_append_misc_empty_frame(setup_path):
747758
tm.assert_frame_equal(store.select("df2"), df)
748759

749760

750-
def test_append_raise(setup_path):
761+
def test_append_raise(setup_path, using_infer_string):
751762
with ensure_clean_store(setup_path) as store:
752763
# test append with invalid input to get good error messages
753764

754765
# list in column
755766
df = DataFrame(
756767
1.1 * np.arange(120).reshape((30, 4)),
757-
columns=Index(list("ABCD"), dtype=object),
758-
index=Index([f"i-{i}" for i in range(30)], dtype=object),
768+
columns=Index(
769+
list("ABCD"),
770+
),
771+
index=Index(
772+
[f"i-{i}" for i in range(30)],
773+
),
759774
)
760775
df["invalid"] = [["a"]] * len(df)
761776
assert df.dtypes["invalid"] == np.object_
@@ -775,8 +790,12 @@ def test_append_raise(setup_path):
775790
# datetime with embedded nans as object
776791
df = DataFrame(
777792
1.1 * np.arange(120).reshape((30, 4)),
778-
columns=Index(list("ABCD"), dtype=object),
779-
index=Index([f"i-{i}" for i in range(30)], dtype=object),
793+
columns=Index(
794+
list("ABCD"),
795+
),
796+
index=Index(
797+
[f"i-{i}" for i in range(30)],
798+
),
780799
)
781800
s = Series(datetime.datetime(2001, 1, 2), index=df.index)
782801
s = s.astype(object)
@@ -803,8 +822,12 @@ def test_append_raise(setup_path):
803822
# appending an incompatible table
804823
df = DataFrame(
805824
1.1 * np.arange(120).reshape((30, 4)),
806-
columns=Index(list("ABCD"), dtype=object),
807-
index=Index([f"i-{i}" for i in range(30)], dtype=object),
825+
columns=Index(
826+
list("ABCD"),
827+
),
828+
index=Index(
829+
[f"i-{i}" for i in range(30)],
830+
),
808831
)
809832
store.append("df", df)
810833

@@ -822,10 +845,11 @@ def test_append_raise(setup_path):
822845
df["foo"] = Timestamp("20130101")
823846
store.append("df", df)
824847
df["foo"] = "bar"
848+
shape = "(30,)" if using_infer_string else "(1, 30)"
825849
msg = re.escape(
826850
"invalid combination of [values_axes] on appending data "
827851
"[name->values_block_1,cname->values_block_1,"
828-
"dtype->bytes24,kind->string,shape->(1, 30)] "
852+
f"dtype->bytes24,kind->string,shape->{shape}] "
829853
"vs current table "
830854
"[name->values_block_1,cname->values_block_1,"
831855
"dtype->datetime64[s],kind->datetime64[s],shape->None]"
@@ -884,7 +908,9 @@ def test_append_with_timedelta(setup_path):
884908
def test_append_to_multiple(setup_path):
885909
df1 = DataFrame(
886910
np.random.default_rng(2).standard_normal((10, 4)),
887-
columns=Index(list("ABCD"), dtype=object),
911+
columns=Index(
912+
list("ABCD"),
913+
),
888914
index=date_range("2000-01-01", periods=10, freq="B"),
889915
)
890916
df2 = df1.copy().rename(columns="{}_2".format)
@@ -921,12 +947,16 @@ def test_append_to_multiple(setup_path):
921947
def test_append_to_multiple_dropna(setup_path):
922948
df1 = DataFrame(
923949
np.random.default_rng(2).standard_normal((10, 4)),
924-
columns=Index(list("ABCD"), dtype=object),
950+
columns=Index(
951+
list("ABCD"),
952+
),
925953
index=date_range("2000-01-01", periods=10, freq="B"),
926954
)
927955
df2 = DataFrame(
928956
np.random.default_rng(2).standard_normal((10, 4)),
929-
columns=Index(list("ABCD"), dtype=object),
957+
columns=Index(
958+
list("ABCD"),
959+
),
930960
index=date_range("2000-01-01", periods=10, freq="B"),
931961
).rename(columns="{}_2".format)
932962
df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
@@ -946,7 +976,9 @@ def test_append_to_multiple_dropna(setup_path):
946976
def test_append_to_multiple_dropna_false(setup_path):
947977
df1 = DataFrame(
948978
np.random.default_rng(2).standard_normal((10, 4)),
949-
columns=Index(list("ABCD"), dtype=object),
979+
columns=Index(
980+
list("ABCD"),
981+
),
950982
index=date_range("2000-01-01", periods=10, freq="B"),
951983
)
952984
df2 = df1.copy().rename(columns="{}_2".format)

pandas/tests/io/pytables/test_categorical.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
from pandas import (
75
Categorical,
86
DataFrame,
@@ -18,7 +16,6 @@
1816

1917
pytestmark = [
2018
pytest.mark.single_cpu,
21-
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
2219
]
2320

2421

pandas/tests/io/pytables/test_complex.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
import pandas as pd
75
from pandas import (
86
DataFrame,
@@ -13,10 +11,6 @@
1311

1412
from pandas.io.pytables import read_hdf
1513

16-
pytestmark = pytest.mark.xfail(
17-
using_string_dtype(), reason="TODO(infer_string)", strict=False
18-
)
19-
2014

2115
def test_complex_fixed(tmp_path, setup_path):
2216
df = DataFrame(

pandas/tests/io/pytables/test_errors.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
import numpy as np
66
import pytest
77

8-
from pandas._config import using_string_dtype
9-
108
from pandas import (
119
CategoricalIndex,
1210
DataFrame,
@@ -26,7 +24,6 @@
2624

2725
pytestmark = [
2826
pytest.mark.single_cpu,
29-
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
3027
]
3128

3229

@@ -93,9 +90,14 @@ def test_unimplemented_dtypes_table_columns(setup_path):
9390

9491
with ensure_clean_store(setup_path) as store:
9592
# this fails because we have a date in the object block......
96-
msg = re.escape(
97-
"""Cannot serialize the column [datetime1]
98-
because its data contents are not [string] but [date] object dtype"""
93+
msg = "|".join(
94+
[
95+
re.escape(
96+
"Cannot serialize the column [datetime1] because its data contents "
97+
"are not [string] but [date] object dtype"
98+
),
99+
re.escape("[date] is not implemented as a table column"),
100+
]
99101
)
100102
with pytest.raises(TypeError, match=msg):
101103
store.append("df_unimplemented", df)

pandas/tests/io/pytables/test_file_handling.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
from pandas.compat import (
97
PY311,
108
is_ci_environment,
@@ -329,7 +327,6 @@ def test_complibs(tmp_path, lvl, lib, request):
329327
assert node.filters.complib == lib
330328

331329

332-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
333330
@pytest.mark.skipif(
334331
not is_platform_little_endian(), reason="reason platform is not little endian"
335332
)
@@ -347,7 +344,6 @@ def test_encoding(setup_path):
347344
tm.assert_frame_equal(result, expected)
348345

349346

350-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
351347
@pytest.mark.parametrize(
352348
"val",
353349
[
@@ -362,7 +358,7 @@ def test_encoding(setup_path):
362358
[b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
363359
],
364360
)
365-
@pytest.mark.parametrize("dtype", ["category", object])
361+
@pytest.mark.parametrize("dtype", ["category", None])
366362
def test_latin_encoding(tmp_path, setup_path, dtype, val):
367363
enc = "latin-1"
368364
nan_rep = ""

pandas/tests/io/pytables/test_keys.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
from pandas import (
75
DataFrame,
86
HDFStore,
@@ -17,7 +15,6 @@
1715

1816
pytestmark = [
1917
pytest.mark.single_cpu,
20-
pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
2118
]
2219

2320

0 commit comments

Comments
 (0)