
Commit 2b0f0fa

fix: Fix issue with iterating on >10gb dataframes (#949)
Co-authored-by: Tim Sweña (Swast) <[email protected]>
Parent: 1291110

3 files changed: +12 -1 lines


bigframes/core/blocks.py

Lines changed: 3 additions & 1 deletion
@@ -577,7 +577,9 @@ def to_pandas_batches(
         see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result"""
         dtypes = dict(zip(self.index_columns, self.index.dtypes))
         dtypes.update(zip(self.value_columns, self.dtypes))
-        _, query_job = self.session._execute(self.expr, ordered=True)
+        _, query_job = self.session._executor.execute(
+            self.expr, ordered=True, use_explicit_destination=True
+        )
         results_iterator = query_job.result(
             page_size=page_size, max_results=max_results
         )
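Why this matters for callers: BigQuery caches query results in an anonymous temporary table capped at roughly 10 GB, so the old self.session._execute(...) path failed once a result set crossed that limit. Routing through the executor with use_explicit_destination=True writes results to a session-owned table instead. A minimal usage sketch of the user-facing effect (the project, dataset, and table names are hypothetical):

import bigframes.pandas as bpd

# Hypothetical source table; assume it holds well over 10 GB of data.
df = bpd.read_gbq("my-project.my_dataset.very_large_table")

# With this fix, iterating in batches works even when the materialized
# result exceeds the ~10 GB cap on anonymous cached result tables.
for batch in df.to_pandas_batches():
    print(batch.shape)  # each batch is a regular pandas DataFrame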

bigframes/session/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1324,6 +1324,7 @@ def _execute(
         *,
         ordered: bool = True,
         col_id_overrides: Mapping[str, str] = {},
+        use_explicit_destination: bool = False,
     ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
         return self._executor.execute(
             array_value,
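This hunk only threads the new keyword through Session._execute; the visible context cuts off after array_value, so the remaining arguments are not shown. A sketch of the assumed continuation, presuming the existing keywords are forwarded unchanged:

# Assumed continuation of Session._execute (not shown in the hunk above):
# forward the new flag to the executor alongside the existing options.
return self._executor.execute(
    array_value,
    ordered=ordered,
    col_id_overrides=col_id_overrides,
    use_explicit_destination=use_explicit_destination,
)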

bigframes/session/executor.py

Lines changed: 8 additions & 0 deletions
@@ -102,6 +102,7 @@ def execute(
         *,
         ordered: bool = True,
         col_id_overrides: Mapping[str, str] = {},
+        use_explicit_destination: bool = False,
     ):
         """
         Execute the ArrayValue, storing the result to a temporary session-owned table.
@@ -113,6 +114,13 @@ def execute(
             array_value, ordered=ordered, col_id_overrides=col_id_overrides
         )
         job_config = bigquery.QueryJobConfig()
+        # Use explicit destination to avoid 10GB limit of temporary table
+        if use_explicit_destination:
+            schema = array_value.schema.to_bigquery()
+            destination_table = self.storage_manager.create_temp_table(
+                schema, cluster_cols=[]
+            )
+            job_config.destination = destination_table
         # TODO(swast): plumb through the api_name of the user-facing api that
         # caused this query.
         return self._run_execute_query(
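The executor change is the heart of the fix: when use_explicit_destination is set, the query writes to an explicitly created session temp table via job_config.destination, sidestepping the ~10 GB cap that BigQuery applies to anonymous cached query results. The same pattern expressed in plain google-cloud-bigquery terms, as a minimal sketch (project, dataset, and table names are hypothetical):

from google.cloud import bigquery

client = bigquery.Client()

# Writing results to an explicit destination table lifts the ~10 GB cap
# that applies to anonymous (cached) query result tables.
job_config = bigquery.QueryJobConfig(
    destination="my-project.my_dataset.temp_results",  # hypothetical table
)
job = client.query(
    "SELECT * FROM `my-project.my_dataset.big_table`",  # hypothetical source
    job_config=job_config,
)

# The RowIterator pages through the destination table, so callers can
# stream batches without materializing the full result in memory.
for page in job.result(page_size=10_000).pages:
    for row in page:
        pass  # process each row

Creating the table up front with the result schema (as the diff does via storage_manager.create_temp_table) lets the session own and clean up the destination, rather than relying on BigQuery's anonymous result cache.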
