chore: add dry_run parameter to _read_gbq_colab (#1721)

tswast · web-flow · commit 6629e6664079 · 2025-05-12T16:31:01.000-05:00
diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
@@ -477,14 +477,34 @@ def _register_object(
     ):
         self._objects.append(weakref.ref(object))
 
+    @overload
     def _read_gbq_colab(
         self,
         query: str,
-        # TODO: Add a callback parameter that takes some kind of Event object.
-        # TODO: Add dry_run parameter.
         *,
         pyformat_args: Optional[Dict[str, Any]] = None,
+        dry_run: Literal[False] = ...,
     ) -> dataframe.DataFrame:
+        ...
+
+    @overload
+    def _read_gbq_colab(
+        self,
+        query: str,
+        *,
+        pyformat_args: Optional[Dict[str, Any]] = None,
+        dry_run: Literal[True] = ...,
+    ) -> pandas.Series:
+        ...
+
+    def _read_gbq_colab(
+        self,
+        query: str,
+        # TODO: Add a callback parameter that takes some kind of Event object.
+        *,
+        pyformat_args: Optional[Dict[str, Any]] = None,
+        dry_run: bool = False,
+    ) -> Union[dataframe.DataFrame, pandas.Series]:
         """A version of read_gbq that has the necessary default values for use in colab integrations.
 
         This includes, no ordering, no index, no progress bar, always use string
@@ -501,23 +521,21 @@ def _read_gbq_colab(
                 None, this function always assumes {var} refers to a variable
                 that is supposed to be supplied in this dictionary.
         """
-        # TODO: Allow for a table ID to avoid queries like with read_gbq?
-
         if pyformat_args is None:
             pyformat_args = {}
 
-        # TODO: move this to read_gbq_query if/when we expose this feature
-        # beyond in _read_gbq_colab.
         query = bigframes.core.pyformat.pyformat(
             query,
             pyformat_args=pyformat_args,
+            # TODO: add dry_run parameter to avoid API calls for data in pyformat_args
         )
 
         return self._loader.read_gbq_query(
             query=query,
             index_col=bigframes.enums.DefaultIndexKind.NULL,
             api_name="read_gbq_colab",
             force_total_order=False,
+            dry_run=typing.cast(Union[Literal[False], Literal[True]], dry_run),
         )
 
     @overload
diff --git a/bigframes/session/dry_runs.py b/bigframes/session/dry_runs.py
@@ -101,34 +101,38 @@ def get_query_stats(
 
     job_api_repr = copy.deepcopy(query_job._properties)
 
-    job_ref = job_api_repr["jobReference"]
+    # jobReference might not be populated for "job optional" queries.
+    job_ref = job_api_repr.get("jobReference", {})
     for key, val in job_ref.items():
         index.append(key)
         values.append(val)
 
+    configuration = job_api_repr.get("configuration", {})
     index.append("jobType")
-    values.append(job_api_repr["configuration"]["jobType"])
+    values.append(configuration.get("jobType", None))
 
-    query_config = job_api_repr["configuration"]["query"]
+    query_config = configuration.get("query", {})
     for key in ("destinationTable", "useLegacySql"):
         index.append(key)
-        values.append(query_config.get(key))
+        values.append(query_config.get(key, None))
 
-    query_stats = job_api_repr["statistics"]["query"]
+    statistics = job_api_repr.get("statistics", {})
+    query_stats = statistics.get("query", {})
     for key in (
         "referencedTables",
         "totalBytesProcessed",
         "cacheHit",
         "statementType",
     ):
         index.append(key)
-        values.append(query_stats.get(key))
+        values.append(query_stats.get(key, None))
 
+    creation_time = statistics.get("creationTime", None)
     index.append("creationTime")
     values.append(
-        pandas.Timestamp(
-            job_api_repr["statistics"]["creationTime"], unit="ms", tz="UTC"
-        )
+        pandas.Timestamp(creation_time, unit="ms", tz="UTC")
+        if creation_time is not None
+        else None
     )
 
     return pandas.Series(values, index=index)
diff --git a/bigframes/testing/mocks.py b/bigframes/testing/mocks.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import datetime
 from typing import Optional, Sequence
 import unittest.mock as mock
@@ -78,11 +79,14 @@ def create_bigquery_session(
         type(table).num_rows = mock.PropertyMock(return_value=1000000000)
         bqclient.get_table.return_value = table
 
+    queries = []
     job_configs = []
 
     def query_mock(query, *args, job_config=None, **kwargs):
-        job_configs.append(job_config)
+        queries.append(query)
+        job_configs.append(copy.deepcopy(job_config))
         query_job = mock.create_autospec(google.cloud.bigquery.QueryJob)
+        query_job._properties = {}
         type(query_job).destination = mock.PropertyMock(
             return_value=anonymous_dataset.table("test_table"),
         )
@@ -100,7 +104,8 @@ def query_mock(query, *args, job_config=None, **kwargs):
     existing_query_and_wait = bqclient.query_and_wait
 
     def query_and_wait_mock(query, *args, job_config=None, **kwargs):
-        job_configs.append(job_config)
+        queries.append(query)
+        job_configs.append(copy.deepcopy(job_config))
         if query.startswith("SELECT CURRENT_TIMESTAMP()"):
             return iter([[datetime.datetime.now()]])
         else:
@@ -118,6 +123,7 @@ def query_and_wait_mock(query, *args, job_config=None, **kwargs):
     session._bq_connection_manager = mock.create_autospec(
         bigframes.clients.BqConnectionManager, instance=True
     )
+    session._queries = queries  # type: ignore
     session._job_configs = job_configs  # type: ignore
     return session
 
diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py
@@ -47,7 +47,7 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi
 def test_read_gbq_colab_includes_formatted_scalars(session):
     pyformat_args = {
         "some_integer": 123,
-        "some_string": "This could be dangerous, but we esape it",
+        "some_string": "This could be dangerous, but we escape it",
         # This is not a supported type, but ignored if not referenced.
         "some_object": object(),
     }
@@ -66,39 +66,7 @@ def test_read_gbq_colab_includes_formatted_scalars(session):
             {
                 "some_integer": pandas.Series([123], dtype=pandas.Int64Dtype()),
                 "some_string": pandas.Series(
-                    ["This could be dangerous, but we esape it"],
-                    dtype="string[pyarrow]",
-                ),
-                "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"),
-            }
-        ),
-    )
-
-
-def test_read_gbq_colab_includes_formatted_bigframes_dataframe(session):
-    pyformat_args = {
-        # TODO: put a bigframes DataFrame here.
-        "some_integer": 123,
-        "some_string": "This could be dangerous, but we esape it",
-        # This is not a supported type, but ignored if not referenced.
-        "some_object": object(),
-    }
-    df = session._read_gbq_colab(
-        """
-        SELECT {some_integer} as some_integer,
-        {some_string} as some_string,
-        '{{escaped}}' as escaped
-        """,
-        pyformat_args=pyformat_args,
-    )
-    result = df.to_pandas()
-    pandas.testing.assert_frame_equal(
-        result,
-        pandas.DataFrame(
-            {
-                "some_integer": pandas.Series([123], dtype=pandas.Int64Dtype()),
-                "some_string": pandas.Series(
-                    ["This could be dangerous, but we esape it"],
+                    ["This could be dangerous, but we escape it"],
                     dtype="string[pyarrow]",
                 ),
                 "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"),
diff --git a/tests/unit/session/test_read_gbq_colab.py b/tests/unit/session/test_read_gbq_colab.py
@@ -30,3 +30,39 @@ def test_read_gbq_colab_includes_label():
         label_values.extend(config.labels.values())
 
     assert "read_gbq_colab" in label_values
+
+
+def test_read_gbq_colab_includes_formatted_values_in_dry_run():
+    """The dry run query sent to BigQuery has pyformat_args substituted in."""
+    session = mocks.create_bigquery_session()
+
+    pyformat_args = {
+        "some_integer": 123,
+        "some_string": "This could be dangerous, but we escape it",
+        # This is not a supported type, but ignored if not referenced.
+        "some_object": object(),
+    }
+    _ = session._read_gbq_colab(
+        """
+        SELECT {some_integer} as some_integer,
+        {some_string} as some_string,
+        '{{escaped}}' as escaped
+        """,
+        pyformat_args=pyformat_args,
+        dry_run=True,
+    )
+    expected = """
+        SELECT 123 as some_integer,
+        'This could be dangerous, but we escape it' as some_string,
+        '{escaped}' as escaped
+        """
+    # Each mocked query call records the query text next to a copy of its config.
+    dry_run_queries = [
+        query
+        for query, config in zip(session._queries, session._job_configs)  # type: ignore
+        if config is not None and config.dry_run
+    ]
+
+    # Fail with a clear message rather than a NameError/AttributeError.
+    assert dry_run_queries, "expected at least one dry run query"
+    assert dry_run_queries[0].strip() == expected.strip()