chore: Use fixture for list/str accessor tests (#992)

sycai · gcf-owl-bot[bot] · web-flow · commit 22b483a9d9bc · 2024-09-17T12:29:17.000-07:00
* chore: Use fixture for list/str accessor tests
* fix format
* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
* add more type coverage in tests
* fix format
* remove unnecessary dep
* remove import

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
diff --git a/tests/data/repeated.jsonl b/tests/data/repeated.jsonl
@@ -0,0 +1,3 @@
+{"rowindex": 0, "int_list_col": [1],        "bool_list_col": [true],        "float_list_col": [1.2, 2.3],        "date_list_col": ["2021-07-21"],                "date_time_list_col": ["2021-07-21 11:39:45"], "numeric_list_col": [1.2, 2.3, 3.4], "string_list_col": ["abc", "de", "f"]}
+{"rowindex": 1, "int_list_col": [1,2],      "bool_list_col": [true, false], "float_list_col": [1.1],             "date_list_col": ["2021-07-21", "1987-03-28"],  "date_time_list_col": ["1999-03-14 17:22:00"], "numeric_list_col": [5.5, 2.3],      "string_list_col": ["a", "bc", "de"]}
+{"rowindex": 2, "int_list_col": [1,2,3],    "bool_list_col": [true],        "float_list_col": [0.5, -1.9, 2.3],  "date_list_col": ["2017-08-01", "2004-11-22"],  "date_time_list_col": ["1979-06-03 03:20:45"], "numeric_list_col": [1.7],           "string_list_col": ["", "a"]}
diff --git a/tests/data/repeated_schema.json b/tests/data/repeated_schema.json
@@ -0,0 +1,42 @@
+[
+    {
+        "name": "rowindex",
+        "type": "INTEGER",
+        "mode": "REQUIRED"
+    },
+    {
+        "name": "int_list_col",
+        "type": "INTEGER",
+        "mode": "REPEATED"
+    },
+    {
+        "name": "bool_list_col",
+        "type": "BOOLEAN",
+        "mode": "REPEATED"
+    },
+    {
+        "name": "float_list_col",
+        "type": "FLOAT",
+        "mode": "REPEATED"
+    },
+    {
+        "name": "date_list_col",
+        "type": "DATE",
+        "mode": "REPEATED"
+    },
+    {
+        "name": "date_time_list_col",
+        "type": "DATETIME",
+        "mode": "REPEATED"
+    },
+    {
+        "name": "numeric_list_col",
+        "type": "NUMERIC",
+        "mode": "REPEATED"
+    },
+    {
+        "name": "string_list_col",
+        "type": "STRING",
+        "mode": "REPEATED"
+    }
+]
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
@@ -39,6 +39,7 @@
 import bigframes
 import bigframes.dataframe
 import bigframes.pandas as bpd
+import bigframes.series
 import tests.system.utils
 
 # Use this to control the number of cloud functions being deleted in a single
@@ -294,6 +295,7 @@ def load_test_data_tables(
         ("scalars", "scalars_schema.json", "scalars.jsonl"),
         ("scalars_too", "scalars_schema.json", "scalars.jsonl"),
         ("nested", "nested_schema.json", "nested.jsonl"),
+        ("repeated", "repeated_schema.json", "repeated.jsonl"),
         ("penguins", "penguins_schema.json", "penguins.jsonl"),
         ("time_series", "time_series_schema.json", "time_series.jsonl"),
         ("hockey_players", "hockey_players.json", "hockey_players.jsonl"),
@@ -370,6 +372,11 @@ def nested_table_id(test_data_tables) -> str:
     return test_data_tables["nested"]
 
 
+@pytest.fixture(scope="session")
+def repeated_table_id(test_data_tables) -> str:
+    return test_data_tables["repeated"]
+
+
 @pytest.fixture(scope="session")
 def penguins_table_id(test_data_tables) -> str:
     return test_data_tables["penguins"]
@@ -410,6 +417,26 @@ def nested_pandas_df() -> pd.DataFrame:
     return df
 
 
+@pytest.fixture(scope="session")
+def repeated_df(
+    repeated_table_id: str, session: bigframes.Session
+) -> bigframes.dataframe.DataFrame:
+    """Returns a DataFrame containing columns of list type."""
+    return session.read_gbq(repeated_table_id, index_col="rowindex")
+
+
+@pytest.fixture(scope="session")
+def repeated_pandas_df() -> pd.DataFrame:
+    """Returns a DataFrame containing columns of list type."""
+
+    df = pd.read_json(
+        DATA_DIR / "repeated.jsonl",
+        lines=True,
+    )
+    df = df.set_index("rowindex")
+    return df
+
+
 @pytest.fixture(scope="session")
 def scalars_df_default_index(
     scalars_df_index: bigframes.dataframe.DataFrame,
diff --git a/tests/system/small/operations/test_lists.py b/tests/system/small/operations/test_lists.py
@@ -18,8 +18,6 @@
 import pyarrow as pa
 import pytest
 
-import bigframes.pandas as bpd
-
 from ...utils import assert_series_equal
 
 
@@ -32,19 +30,34 @@
         pytest.param(slice(0, 2, None), id="default_step_slice"),
     ],
 )
-def test_getitem(key):
+@pytest.mark.parametrize(
+    ("column_name", "dtype"),
+    [
+        pytest.param("int_list_col", pd.ArrowDtype(pa.list_(pa.int64()))),
+        pytest.param("bool_list_col", pd.ArrowDtype(pa.list_(pa.bool_()))),
+        pytest.param("float_list_col", pd.ArrowDtype(pa.list_(pa.float64()))),
+        pytest.param("date_list_col", pd.ArrowDtype(pa.list_(pa.date32()))),
+        pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))),
+        pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))),
+        pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))),
+    ],
+)
+def test_getitem(key, column_name, dtype, repeated_df, repeated_pandas_df):
     if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"):
         pytest.skip(
             "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data"
         )
-    data = [[1], [2, 3], [4, 5, 6]]
-    s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
-    pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
 
-    bf_result = s.list[key].to_pandas()
-    pd_result = pd_s.list[key]
+    bf_result = repeated_df[column_name].list[key].to_pandas()
+    pd_result = repeated_pandas_df[column_name].astype(dtype).list[key]
 
-    assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)
+    assert_series_equal(
+        pd_result,
+        bf_result,
+        check_dtype=False,
+        check_index_type=False,
+        check_names=False,
+    )
 
 
 @pytest.mark.parametrize(
@@ -60,24 +73,36 @@ def test_getitem(key):
         (slice(0, 2, 2), pytest.raises(NotImplementedError)),
     ],
 )
-def test_getitem_notsupported(key, expectation):
-    data = [[1], [2, 3], [4, 5, 6]]
-    s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
-
+def test_getitem_notsupported(key, expectation, repeated_df):
     with expectation as e:
-        assert s.list[key] == e
+        assert repeated_df["int_list_col"].list[key] == e
 
 
-def test_len():
+@pytest.mark.parametrize(
+    ("column_name", "dtype"),
+    [
+        pytest.param("int_list_col", pd.ArrowDtype(pa.list_(pa.int64()))),
+        pytest.param("bool_list_col", pd.ArrowDtype(pa.list_(pa.bool_()))),
+        pytest.param("float_list_col", pd.ArrowDtype(pa.list_(pa.float64()))),
+        pytest.param("date_list_col", pd.ArrowDtype(pa.list_(pa.date32()))),
+        pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))),
+        pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))),
+        pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))),
+    ],
+)
+def test_len(column_name, dtype, repeated_df, repeated_pandas_df):
     if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"):
         pytest.skip(
             "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data"
         )
-    data = [[], [1], [1, 2], [1, 2, 3]]
-    s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
-    pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
 
-    bf_result = s.list.len().to_pandas()
-    pd_result = pd_s.list.len()
+    bf_result = repeated_df[column_name].list.len().to_pandas()
+    pd_result = repeated_pandas_df[column_name].astype(dtype).list.len()
 
-    assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)
+    assert_series_equal(
+        pd_result,
+        bf_result,
+        check_dtype=False,
+        check_index_type=False,
+        check_names=False,
+    )
diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py
@@ -615,21 +615,28 @@ def test_getitem_w_string(scalars_dfs, index):
 @pytest.mark.parametrize(
     ("index"),
     [
-        pytest.param(2, id="int"),
+        pytest.param(0, id="int"),
         pytest.param(slice(None, None, None), id="default_start_slice"),
         pytest.param(slice(0, None, 1), id="default_stop_slice"),
         pytest.param(slice(0, 2, None), id="default_step_slice"),
         pytest.param(slice(0, 0, None), id="single_one_slice"),
     ],
 )
-def test_getitem_w_array(index):
-    data = [[1], [2, 3], [], [4, 5, 6]]
-    s = bpd.Series(data)
-    pd_s = pd.Series(data)
-
-    bf_result = s.str[index].to_pandas()
-    pd_result = pd_s.str[index]
-    # Skip dtype checks here because pandas returns `int64` while BF returns `Int64`.
+@pytest.mark.parametrize(
+    "column_name",
+    [
+        pytest.param("int_list_col"),
+        pytest.param("bool_list_col"),
+        pytest.param("float_list_col"),
+        pytest.param("string_list_col"),
+        # date, date_time and numeric are excluded because their default types are different
+        # in Pandas and BigFrames
+    ],
+)
+def test_getitem_w_array(index, column_name, repeated_df, repeated_pandas_df):
+    bf_result = repeated_df[column_name].str[index].to_pandas()
+    pd_result = repeated_pandas_df[column_name].str[index]
+
     assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)
 
 

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+{"rowindex": 0, "int_list_col": [1], "bool_list_col": [true], "float_list_col": [1.2, 2.3], "date_list_col": ["2021-07-21"], "date_time_list_col": ["2021-07-21 11:39:45"], "numeric_list_col": [1.2, 2.3, 3.4], "string_list_col": ["abc", "de", "f"]}`
	`2`	`+{"rowindex": 1, "int_list_col": [1,2], "bool_list_col": [true, false], "float_list_col": [1.1], "date_list_col": ["2021-07-21", "1987-03-28"], "date_time_list_col": ["1999-03-14 17:22:00"], "numeric_list_col": [5.5, 2.3], "string_list_col": ["a", "bc", "de"]}`
	`3`	`+{"rowindex": 2, "int_list_col": [1,2,3], "bool_list_col": [true], "float_list_col": [0.5, -1.9, 2.3], "date_list_col": ["2017-08-01", "2004-11-22"], "date_time_list_col": ["1979-06-03 03:20:45"], "numeric_list_col": [1.7], "string_list_col": ["", "a"]}`