Commit 6478ad7

chore: add load tests session for reading large tables (#410)
* chore: add load tests session for reading large tables
* update junit prefix
* xfail for to_pandas_batches
* use smaller table but still beyond query results limit
1 parent d92ced2

6 files changed: 249 additions & 2 deletions

.kokoro/load/common.cfg

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Build logs will be here
+action {
+  define_artifacts {
+    regex: "**/*sponge_log.xml"
+  }
+}
+
+build_file: "python-bigquery-dataframes/.kokoro/build.sh"

.kokoro/load/load.cfg

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Only run this nox session.
+env_vars: {
+  key: "NOX_SESSION"
+  value: "load"
+}
+
+env_vars: {
+  key: "GOOGLE_CLOUD_PROJECT"
+  value: "bigframes-load-testing"
+}
+
+env_vars: {
+  key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT"
+  value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048"
+}

noxfile.py

Lines changed: 11 additions & 0 deletions

@@ -387,6 +387,17 @@ def e2e(session: nox.sessions.Session):
     )


+@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS[-1])
+def load(session: nox.sessions.Session):
+    """Run the very large tests in system test suite."""
+    run_system(
+        session=session,
+        prefix_name="load",
+        test_folder=os.path.join("tests", "system", "load"),
+        print_duration=True,
+    )
+
+
 @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS)
 def samples(session):
     """Run the samples test suite."""

scripts/create_load_test_tables.py

Lines changed: 109 additions & 0 deletions

@@ -0,0 +1,109 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import os
+import pathlib
+import sys
+
+import google.cloud.bigquery as bigquery
+
+REPO_ROOT = pathlib.Path(__file__).parent.parent
+
+PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
+
+if not PROJECT_ID:
+    print(
+        "Please set GOOGLE_CLOUD_PROJECT environment variable before running.",
+        file=sys.stderr,
+    )
+    sys.exit(1)
+
+DATASET_ID = f"{PROJECT_ID}.load_testing"
+TABLE_ID = f"{DATASET_ID}.scalars"
+TABLE_ID_FORMAT = f"{DATASET_ID}.scalars_{{size}}"
+
+KB_BYTES = 1000
+MB_BYTES = 1000 * KB_BYTES
+GB_BYTES = 1000 * MB_BYTES
+TB_BYTES = 1000 * GB_BYTES
+SIZES = (
+    ("1mb", MB_BYTES),
+    ("10mb", 10 * MB_BYTES),
+    ("100mb", 100 * MB_BYTES),
+    ("1gb", GB_BYTES),
+    ("10gb", 10 * GB_BYTES),
+    ("100gb", 100 * GB_BYTES),
+    ("1tb", TB_BYTES),
+)
+SCHEMA_PATH = REPO_ROOT / "tests" / "data" / "scalars_schema.json"
+DATA_PATH = REPO_ROOT / "tests" / "data" / "scalars.jsonl"
+BQCLIENT = bigquery.Client()
+
+
+def create_dataset():
+    dataset = bigquery.Dataset(DATASET_ID)
+    BQCLIENT.create_dataset(dataset, exists_ok=True)
+
+
+def load_scalars_table():
+    schema = BQCLIENT.schema_from_json(SCHEMA_PATH)
+    job_config = bigquery.LoadJobConfig()
+    job_config.schema = schema
+    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
+    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
+
+    print(f"Creating {TABLE_ID}")
+    with open(DATA_PATH, "rb") as data_file:
+        BQCLIENT.load_table_from_file(
+            data_file,
+            TABLE_ID,
+            job_config=job_config,
+        ).result()
+
+
+def multiply_table(previous_table_id, target_table_id, multiplier):
+    clauses = [f"SELECT * FROM `{previous_table_id}`"] * multiplier
+    query = " UNION ALL ".join(clauses)
+    job_config = bigquery.QueryJobConfig()
+    job_config.destination = target_table_id
+    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
+    print(f"Creating {target_table_id}, {multiplier} x {previous_table_id}")
+    BQCLIENT.query_and_wait(query, job_config=job_config)
+
+
+def create_tables():
+    base_table = BQCLIENT.get_table(TABLE_ID)
+    previous_bytes = base_table.num_bytes
+    previous_table_id = TABLE_ID
+
+    for table_suffix, target_bytes in SIZES:
+        # Make sure we exceed the desired bytes by adding to the multiplier.
+        multiplier = math.ceil(target_bytes / previous_bytes) + 1
+        target_table_id = TABLE_ID_FORMAT.format(size=table_suffix)
+        multiply_table(previous_table_id, target_table_id, multiplier)
+
+        table = BQCLIENT.get_table(target_table_id)
+        previous_bytes = table.num_bytes
+        previous_table_id = target_table_id


+def main():
+    create_dataset()
+    load_scalars_table()
+    create_tables()
+
+
+if __name__ == "__main__":
+    main()
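create_tables() builds each larger table by UNION ALL-ing the previous one enough times to overshoot the next size target, so the tables compound from the small scalars.jsonl seed up to roughly 1 TB. The sketch below simply replays that arithmetic with an assumed 5 KB seed (the real value is read from num_bytes at runtime) to show how the multipliers shrink as the tables grow.

# Illustration only: replays the multiplier math from create_tables() with an
# assumed seed size; the script itself uses the actual table's num_bytes.
import math

KB_BYTES = 1000
MB_BYTES = 1000 * KB_BYTES
GB_BYTES = 1000 * MB_BYTES
TB_BYTES = 1000 * GB_BYTES

previous_bytes = 5 * KB_BYTES  # assumed size of the seed scalars table
for suffix, target_bytes in (
    ("1mb", MB_BYTES),
    ("10mb", 10 * MB_BYTES),
    ("100mb", 100 * MB_BYTES),
    ("1gb", GB_BYTES),
    ("10gb", 10 * GB_BYTES),
    ("100gb", 100 * GB_BYTES),
    ("1tb", TB_BYTES),
):
    # Same formula as the script: overshoot the target on purpose.
    multiplier = math.ceil(target_bytes / previous_bytes) + 1
    previous_bytes *= multiplier
    print(f"scalars_{suffix}: x{multiplier} -> ~{previous_bytes:,} bytes")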

tests/system/conftest.py

Lines changed: 6 additions & 2 deletions

@@ -104,6 +104,11 @@ def cloudfunctions_client(
     return session.cloudfunctionsclient


+@pytest.fixture(scope="session")
+def project_id(bigquery_client: bigquery.Client) -> str:
+    return bigquery_client.project
+
+
 @pytest.fixture(scope="session")
 def resourcemanager_client(
     session: bigframes.Session,
@@ -159,9 +164,8 @@ def dataset_id_not_created(bigquery_client: bigquery.Client):


 @pytest.fixture(scope="session")
-def dataset_id_permanent(bigquery_client: bigquery.Client) -> str:
+def dataset_id_permanent(bigquery_client: bigquery.Client, project_id: str) -> str:
     """Create a dataset if it doesn't exist."""
-    project_id = bigquery_client.project
     dataset_id = f"{project_id}.{PERMANENT_DATASET}"
     dataset = bigquery.Dataset(dataset_id)
     bigquery_client.create_dataset(dataset, exists_ok=True)
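The new session-scoped project_id fixture lets other fixtures and tests request the BigQuery client's default project by name instead of reaching into bigquery_client directly, as dataset_id_permanent now does. A hypothetical example of a test consuming it (the test name and assertion are illustrative only, not part of this commit):

# Illustrative only: pytest injects session-scoped fixtures by parameter name.
def test_permanent_dataset_uses_default_project(
    project_id: str, dataset_id_permanent: str
):
    assert dataset_id_permanent.startswith(f"{project_id}.")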
tests/system/load/test_large_tables.py

Lines changed: 96 additions & 0 deletions

@@ -0,0 +1,96 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Load test for query (SQL) inputs with large results sizes."""
+
+import pytest
+
+import bigframes.pandas as bpd
+
+KB_BYTES = 1000
+MB_BYTES = 1000 * KB_BYTES
+GB_BYTES = 1000 * MB_BYTES
+TB_BYTES = 1000 * GB_BYTES
+
+
+@pytest.mark.parametrize(
+    ("sql", "expected_bytes"),
+    (
+        pytest.param(
+            "SELECT * FROM load_testing.scalars_1gb",
+            GB_BYTES,
+            id="1gb",
+        ),
+        pytest.param(
+            "SELECT * FROM load_testing.scalars_10gb",
+            10 * GB_BYTES,
+            id="10gb",
+        ),
+        pytest.param(
+            "SELECT * FROM load_testing.scalars_100gb",
+            100 * GB_BYTES,
+            id="100gb",
+        ),
+        pytest.param(
+            "SELECT * FROM load_testing.scalars_1tb",
+            TB_BYTES,
+            id="1tb",
+        ),
+    ),
+)
+def test_read_gbq_sql_large_results(sql, expected_bytes):
+    df = bpd.read_gbq(sql)
+    assert df.memory_usage().sum() >= expected_bytes
+
+
+def test_df_repr_large_table():
+    df = bpd.read_gbq("load_testing.scalars_100gb")
+    row_count, column_count = df.shape
+    expected = f"[{row_count} rows x {column_count} columns]"
+    actual = repr(df)
+    assert expected in actual
+
+
+def test_series_repr_large_table():
+    df = bpd.read_gbq("load_testing.scalars_1tb")
+    actual = repr(df["string_col"])
+    assert actual is not None
+
+
+def test_index_repr_large_table():
+    df = bpd.read_gbq("load_testing.scalars_1tb")
+    actual = repr(df.index)
+    assert actual is not None
+
+
+# FAILED
+# tests/system/load/test_large_tables.py::test_to_pandas_batches_large_table
+# google.api_core.exceptions.Forbidden: 403 Response too large to return.
+# Consider specifying a destination table in your job...
+@pytest.mark.xfail
+def test_to_pandas_batches_large_table():
+    df = bpd.read_gbq("load_testing.scalars_100gb")
+    expected_row_count, expected_column_count = df.shape
+
+    row_count = 0
+    for df in df.to_pandas_batches():
+        batch_row_count, batch_column_count = df.shape
+        assert batch_column_count == expected_column_count
+        row_count += batch_row_count
+
+        # Attempt to save on memory by manually removing the batch df
+        # from local memory after finishing with processing.
+        del df
+
+    assert row_count == expected_row_count
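test_to_pandas_batches_large_table is marked xfail because reading the 100 GB table through to_pandas_batches currently trips BigQuery's response-size limit (the 403 "Response too large to return" quoted in the comment above). If that stays the only failure mode, the marker could be narrowed as sketched below; the raises/reason values are assumptions taken from that comment, not part of this commit.

# Sketch only: narrows the xfail to the 403 documented above; not in this commit.
import google.api_core.exceptions
import pytest


@pytest.mark.xfail(
    raises=google.api_core.exceptions.Forbidden,
    reason="403 Response too large to return without a destination table.",
)
def test_to_pandas_batches_large_table_narrow_xfail():
    ...  # same body as test_to_pandas_batches_large_table above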
