Skip to content

Commit 22b483a

Browse files
chore: Use fixture for list/str accessor tests (#992)
* chore: Use fixture for list/str accessor tests
* fix format
* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
* add more type coverage in tests
* fix format
* remove unnecessary dep
* remove import

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 06c1b33 commit 22b483a

File tree

5 files changed: 134 additions and 30 deletions.

tests/data/repeated.jsonl

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
{"rowindex": 0, "int_list_col": [1], "bool_list_col": [true], "float_list_col": [1.2, 2.3], "date_list_col": ["2021-07-21"], "date_time_list_col": ["2021-07-21 11:39:45"], "numeric_list_col": [1.2, 2.3, 3.4], "string_list_col": ["abc", "de", "f"]}
2+
{"rowindex": 1, "int_list_col": [1,2], "bool_list_col": [true, false], "float_list_col": [1.1], "date_list_col": ["2021-07-21", "1987-03-28"], "date_time_list_col": ["1999-03-14 17:22:00"], "numeric_list_col": [5.5, 2.3], "string_list_col": ["a", "bc", "de"]}
3+
{"rowindex": 2, "int_list_col": [1,2,3], "bool_list_col": [true], "float_list_col": [0.5, -1.9, 2.3], "date_list_col": ["2017-08-01", "2004-11-22"], "date_time_list_col": ["1979-06-03 03:20:45"], "numeric_list_col": [1.7], "string_list_col": ["", "a"]}

tests/data/repeated_schema.json

Lines changed: 42 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,42 @@
1+
[
2+
{
3+
"name": "rowindex",
4+
"type": "INTEGER",
5+
"mode": "REQUIRED"
6+
},
7+
{
8+
"name": "int_list_col",
9+
"type": "INTEGER",
10+
"mode": "REPEATED"
11+
},
12+
{
13+
"name": "bool_list_col",
14+
"type": "BOOLEAN",
15+
"mode": "REPEATED"
16+
},
17+
{
18+
"name": "float_list_col",
19+
"type": "FLOAT",
20+
"mode": "REPEATED"
21+
},
22+
{
23+
"name": "date_list_col",
24+
"type": "DATE",
25+
"mode": "REPEATED"
26+
},
27+
{
28+
"name": "date_time_list_col",
29+
"type": "DATETIME",
30+
"mode": "REPEATED"
31+
},
32+
{
33+
"name": "numeric_list_col",
34+
"type": "NUMERIC",
35+
"mode": "REPEATED"
36+
},
37+
{
38+
"name": "string_list_col",
39+
"type": "STRING",
40+
"mode": "REPEATED"
41+
}
42+
]

tests/system/conftest.py

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@
3939
import bigframes
4040
import bigframes.dataframe
4141
import bigframes.pandas as bpd
42+
import bigframes.series
4243
import tests.system.utils
4344

4445
# Use this to control the number of cloud functions being deleted in a single
@@ -294,6 +295,7 @@ def load_test_data_tables(
294295
("scalars", "scalars_schema.json", "scalars.jsonl"),
295296
("scalars_too", "scalars_schema.json", "scalars.jsonl"),
296297
("nested", "nested_schema.json", "nested.jsonl"),
298+
("repeated", "repeated_schema.json", "repeated.jsonl"),
297299
("penguins", "penguins_schema.json", "penguins.jsonl"),
298300
("time_series", "time_series_schema.json", "time_series.jsonl"),
299301
("hockey_players", "hockey_players.json", "hockey_players.jsonl"),
@@ -370,6 +372,11 @@ def nested_table_id(test_data_tables) -> str:
370372
return test_data_tables["nested"]
371373

372374

375+
@pytest.fixture(scope="session")
376+
def repeated_table_id(test_data_tables) -> str:
377+
return test_data_tables["repeated"]
378+
379+
373380
@pytest.fixture(scope="session")
374381
def penguins_table_id(test_data_tables) -> str:
375382
return test_data_tables["penguins"]
@@ -410,6 +417,26 @@ def nested_pandas_df() -> pd.DataFrame:
410417
return df
411418

412419

420+
@pytest.fixture(scope="session")
421+
def repeated_df(
422+
repeated_table_id: str, session: bigframes.Session
423+
) -> bigframes.dataframe.DataFrame:
424+
"""Returns a DataFrame containing columns of list type."""
425+
return session.read_gbq(repeated_table_id, index_col="rowindex")
426+
427+
428+
@pytest.fixture(scope="session")
429+
def repeated_pandas_df() -> pd.DataFrame:
430+
"""Returns a DataFrame containing columns of list type."""
431+
432+
df = pd.read_json(
433+
DATA_DIR / "repeated.jsonl",
434+
lines=True,
435+
)
436+
df = df.set_index("rowindex")
437+
return df
438+
439+
413440
@pytest.fixture(scope="session")
414441
def scalars_df_default_index(
415442
scalars_df_index: bigframes.dataframe.DataFrame,

tests/system/small/operations/test_lists.py

Lines changed: 46 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,6 @@
1818
import pyarrow as pa
1919
import pytest
2020

21-
import bigframes.pandas as bpd
22-
2321
from ...utils import assert_series_equal
2422

2523

@@ -32,19 +30,34 @@
3230
pytest.param(slice(0, 2, None), id="default_step_slice"),
3331
],
3432
)
35-
def test_getitem(key):
33+
@pytest.mark.parametrize(
34+
("column_name", "dtype"),
35+
[
36+
pytest.param("int_list_col", pd.ArrowDtype(pa.list_(pa.int64()))),
37+
pytest.param("bool_list_col", pd.ArrowDtype(pa.list_(pa.bool_()))),
38+
pytest.param("float_list_col", pd.ArrowDtype(pa.list_(pa.float64()))),
39+
pytest.param("date_list_col", pd.ArrowDtype(pa.list_(pa.date32()))),
40+
pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))),
41+
pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))),
42+
pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))),
43+
],
44+
)
45+
def test_getitem(key, column_name, dtype, repeated_df, repeated_pandas_df):
3646
if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"):
3747
pytest.skip(
3848
"https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data"
3949
)
40-
data = [[1], [2, 3], [4, 5, 6]]
41-
s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
42-
pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
4350

44-
bf_result = s.list[key].to_pandas()
45-
pd_result = pd_s.list[key]
51+
bf_result = repeated_df[column_name].list[key].to_pandas()
52+
pd_result = repeated_pandas_df[column_name].astype(dtype).list[key]
4653

47-
assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)
54+
assert_series_equal(
55+
pd_result,
56+
bf_result,
57+
check_dtype=False,
58+
check_index_type=False,
59+
check_names=False,
60+
)
4861

4962

5063
@pytest.mark.parametrize(
@@ -60,24 +73,36 @@ def test_getitem(key):
6073
(slice(0, 2, 2), pytest.raises(NotImplementedError)),
6174
],
6275
)
63-
def test_getitem_notsupported(key, expectation):
64-
data = [[1], [2, 3], [4, 5, 6]]
65-
s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
66-
76+
def test_getitem_notsupported(key, expectation, repeated_df):
6777
with expectation as e:
68-
assert s.list[key] == e
78+
assert repeated_df["int_list_col"].list[key] == e
6979

7080

71-
def test_len():
81+
@pytest.mark.parametrize(
82+
("column_name", "dtype"),
83+
[
84+
pytest.param("int_list_col", pd.ArrowDtype(pa.list_(pa.int64()))),
85+
pytest.param("bool_list_col", pd.ArrowDtype(pa.list_(pa.bool_()))),
86+
pytest.param("float_list_col", pd.ArrowDtype(pa.list_(pa.float64()))),
87+
pytest.param("date_list_col", pd.ArrowDtype(pa.list_(pa.date32()))),
88+
pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))),
89+
pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))),
90+
pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))),
91+
],
92+
)
93+
def test_len(column_name, dtype, repeated_df, repeated_pandas_df):
7294
if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"):
7395
pytest.skip(
7496
"https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data"
7597
)
76-
data = [[], [1], [1, 2], [1, 2, 3]]
77-
s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
78-
pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
7998

80-
bf_result = s.list.len().to_pandas()
81-
pd_result = pd_s.list.len()
99+
bf_result = repeated_df[column_name].list.len().to_pandas()
100+
pd_result = repeated_pandas_df[column_name].astype(dtype).list.len()
82101

83-
assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)
102+
assert_series_equal(
103+
pd_result,
104+
bf_result,
105+
check_dtype=False,
106+
check_index_type=False,
107+
check_names=False,
108+
)

tests/system/small/operations/test_strings.py

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -615,21 +615,28 @@ def test_getitem_w_string(scalars_dfs, index):
615615
@pytest.mark.parametrize(
616616
("index"),
617617
[
618-
pytest.param(2, id="int"),
618+
pytest.param(0, id="int"),
619619
pytest.param(slice(None, None, None), id="default_start_slice"),
620620
pytest.param(slice(0, None, 1), id="default_stop_slice"),
621621
pytest.param(slice(0, 2, None), id="default_step_slice"),
622622
pytest.param(slice(0, 0, None), id="single_one_slice"),
623623
],
624624
)
625-
def test_getitem_w_array(index):
626-
data = [[1], [2, 3], [], [4, 5, 6]]
627-
s = bpd.Series(data)
628-
pd_s = pd.Series(data)
629-
630-
bf_result = s.str[index].to_pandas()
631-
pd_result = pd_s.str[index]
632-
# Skip dtype checks here because pandas returns `int64` while BF returns `Int64`.
625+
@pytest.mark.parametrize(
626+
"column_name",
627+
[
628+
pytest.param("int_list_col"),
629+
pytest.param("bool_list_col"),
630+
pytest.param("float_list_col"),
631+
pytest.param("string_list_col"),
632+
# date, date_time and numeric are excluded because their default types are different
633+
# in Pandas and BigFrames
634+
],
635+
)
636+
def test_getitem_w_array(index, column_name, repeated_df, repeated_pandas_df):
637+
bf_result = repeated_df[column_name].str[index].to_pandas()
638+
pd_result = repeated_pandas_df[column_name].str[index]
639+
633640
assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)
634641

635642

0 commit comments

Comments
 (0)