Commit a7963fe
feat!: add allow_large_results option to read_gbq_query, aligning with bpd.options.compute.allow_large_results option (#1935)
Release-As: 2.18.0
Parent: 8689199
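As a quick illustration of the new option, here is a minimal usage sketch. The project, dataset, and table names are placeholders, and the comments restate the behavior described in the docstring added by this commit.

import bigframes.pandas as bpd

# Session-wide default; per-call arguments override it.
bpd.options.compute.allow_large_results = False

# Small result: skip the large-results path so the rows can come straight
# back in the query response.
small = bpd.read_gbq_query("SELECT 1 AS x", allow_large_results=False)

# Potentially large result: allow results bigger than the maximum response
# size, as described in the new docstring.
large = bpd.read_gbq_query(
    "SELECT * FROM `my-project.my_dataset.big_table`",  # placeholder table
    allow_large_results=True,
)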

File tree: 18 files changed, +529 -178 lines changed


bigframes/bigquery/_operations/search.py

Lines changed: 12 additions & 5 deletions
@@ -99,6 +99,7 @@ def vector_search(
     distance_type: Optional[Literal["euclidean", "cosine", "dot_product"]] = None,
     fraction_lists_to_search: Optional[float] = None,
     use_brute_force: Optional[bool] = None,
+    allow_large_results: Optional[bool] = None,
 ) -> dataframe.DataFrame:
     """
     Conduct vector search which searches embeddings to find semantically similar entities.
@@ -163,12 +164,12 @@ def vector_search(
         ...     query=search_query,
         ...     distance_type="cosine",
         ...     query_column_to_search="another_embedding",
-        ...     top_k=2)
+        ...     top_k=2).sort_values("id")
           query_id embedding another_embedding  id my_embedding  distance
-        1      cat  [3. 5.2]         [3.3 5.2]   2      [2. 4.]  0.005181
-        0      dog   [1. 2.]         [0.7 2.2]   4     [1. 3.2]  0.000013
         1      cat  [3. 5.2]         [3.3 5.2]   1      [1. 2.]  0.005181
+        1      cat  [3. 5.2]         [3.3 5.2]   2      [2. 4.]  0.005181
         0      dog   [1. 2.]         [0.7 2.2]   3    [1.5 7. ]  0.004697
+        0      dog   [1. 2.]         [0.7 2.2]   4     [1. 3.2]  0.000013
         <BLANKLINE>
         [4 rows x 6 columns]

@@ -199,6 +200,10 @@ def vector_search(
         use_brute_force (bool):
             Determines whether to use brute force search by skipping the vector index if one is available.
             Default to False.
+        allow_large_results (bool, optional):
+            Whether to allow large query results. If ``True``, the query
+            results can be larger than the maximum response size.
+            Defaults to ``bpd.options.compute.allow_large_results``.

     Returns:
         bigframes.dataframe.DataFrame: A DataFrame containing vector search result.
@@ -236,9 +241,11 @@ def vector_search(
         options=options,
     )
     if index_col_ids is not None:
-        df = query._session.read_gbq(sql, index_col=index_col_ids)
+        df = query._session.read_gbq_query(
+            sql, index_col=index_col_ids, allow_large_results=allow_large_results
+        )
         df.index.names = index_labels
     else:
-        df = query._session.read_gbq(sql)
+        df = query._session.read_gbq_query(sql, allow_large_results=allow_large_results)

     return df
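The doctest change above only adds a sort_values("id") call so the example output is deterministic. Below is a sketch of how the new parameter itself would be passed; the base_table and column_to_search values are placeholders, since the doctest's table setup is not shown in this hunk.

import bigframes.bigquery as bbq
import bigframes.pandas as bpd

search_query = bpd.DataFrame(
    {
        "query_id": ["dog", "cat"],
        "embedding": [[1.0, 2.0], [3.0, 5.2]],
        "another_embedding": [[0.7, 2.2], [3.3, 5.2]],
    }
)

result = bbq.vector_search(
    base_table="my-project.my_dataset.embeddings",  # placeholder table
    column_to_search="my_embedding",                # placeholder column
    query=search_query,
    distance_type="cosine",
    query_column_to_search="another_embedding",
    top_k=2,
    # New in this commit: when only a handful of rows are expected, skipping
    # the large-results path can reduce latency. Omit the argument to fall
    # back to bpd.options.compute.allow_large_results.
    allow_large_results=False,
).sort_values("id")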

bigframes/dataframe.py

Lines changed: 1 addition & 1 deletion
@@ -4496,7 +4496,7 @@ def to_dict(
         allow_large_results: Optional[bool] = None,
         **kwargs,
     ) -> dict | list[dict]:
-        return self.to_pandas(allow_large_results=allow_large_results).to_dict(orient, into, **kwargs)  # type: ignore
+        return self.to_pandas(allow_large_results=allow_large_results).to_dict(orient=orient, into=into, **kwargs)  # type: ignore

     def to_excel(
         self,
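The one-line dataframe.py change forwards orient and into to pandas by keyword instead of by position. The commit does not state the motivation, but keyword arguments keep the wrapper robust if pandas reorders these parameters or makes them keyword-only. A plain-pandas sketch of the forwarded call:

import pandas as pd

pdf = pd.DataFrame({"a": [1, 2]})

# Equivalent to what DataFrame.to_dict now forwards: orient and into are
# passed by keyword rather than positionally.
print(pdf.to_dict(orient="records", into=dict))  # [{'a': 1}, {'a': 2}]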

bigframes/ml/core.py

Lines changed: 76 additions & 16 deletions
@@ -45,7 +45,11 @@ def ai_forecast(
         result_sql = self._sql_generator.ai_forecast(
             source_sql=input_data.sql, options=options
         )
-        return self._session.read_gbq(result_sql)
+
+        # TODO(b/395912450): Once the limitations with local data are
+        # resolved, consider setting allow_large_results only when expected
+        # data size is large.
+        return self._session.read_gbq_query(result_sql, allow_large_results=True)


 class BqmlModel(BaseBqml):
@@ -95,7 +99,17 @@ def _apply_ml_tvf(
         )

         result_sql = apply_sql_tvf(input_sql)
-        df = self._session.read_gbq(result_sql, index_col=index_col_ids)
+        df = self._session.read_gbq_query(
+            result_sql,
+            index_col=index_col_ids,
+            # Many ML methods use nested JSON, which isn't yet compatible with
+            # joining local results. Also, there is a chance that the results
+            # are greater than 10 GB.
+            # TODO(b/395912450): Once the limitations with local data are
+            # resolved, consider setting allow_large_results only when expected
+            # data size is large.
+            allow_large_results=True,
+        )
         if df._has_index:
             df.index.names = index_labels
         # Restore column labels
@@ -159,7 +173,10 @@ def explain_predict(
     def global_explain(self, options: Mapping[str, bool]) -> bpd.DataFrame:
         sql = self._sql_generator.ml_global_explain(struct_options=options)
         return (
-            self._session.read_gbq(sql)
+            # TODO(b/395912450): Once the limitations with local data are
+            # resolved, consider setting allow_large_results only when expected
+            # data size is large.
+            self._session.read_gbq_query(sql, allow_large_results=True)
             .sort_values(by="attribution", ascending=False)
             .set_index("feature")
         )
@@ -234,26 +251,49 @@ def forecast(self, options: Mapping[str, int | float]) -> bpd.DataFrame:
         sql = self._sql_generator.ml_forecast(struct_options=options)
         timestamp_col_name = "forecast_timestamp"
         index_cols = [timestamp_col_name]
-        first_col_name = self._session.read_gbq(sql).columns.values[0]
+        # TODO(b/395912450): Once the limitations with local data are
+        # resolved, consider setting allow_large_results only when expected
+        # data size is large.
+        first_col_name = self._session.read_gbq_query(
+            sql, allow_large_results=True
+        ).columns.values[0]
         if timestamp_col_name != first_col_name:
             index_cols.append(first_col_name)
-        return self._session.read_gbq(sql, index_col=index_cols).reset_index()
+        # TODO(b/395912450): Once the limitations with local data are
+        # resolved, consider setting allow_large_results only when expected
+        # data size is large.
+        return self._session.read_gbq_query(
+            sql, index_col=index_cols, allow_large_results=True
+        ).reset_index()

     def explain_forecast(self, options: Mapping[str, int | float]) -> bpd.DataFrame:
         sql = self._sql_generator.ml_explain_forecast(struct_options=options)
         timestamp_col_name = "time_series_timestamp"
         index_cols = [timestamp_col_name]
-        first_col_name = self._session.read_gbq(sql).columns.values[0]
+        # TODO(b/395912450): Once the limitations with local data are
+        # resolved, consider setting allow_large_results only when expected
+        # data size is large.
+        first_col_name = self._session.read_gbq_query(
+            sql, allow_large_results=True
+        ).columns.values[0]
         if timestamp_col_name != first_col_name:
             index_cols.append(first_col_name)
-        return self._session.read_gbq(sql, index_col=index_cols).reset_index()
+        # TODO(b/395912450): Once the limitations with local data are
+        # resolved, consider setting allow_large_results only when expected
+        # data size is large.
+        return self._session.read_gbq_query(
+            sql, index_col=index_cols, allow_large_results=True
+        ).reset_index()

     def evaluate(self, input_data: Optional[bpd.DataFrame] = None):
         sql = self._sql_generator.ml_evaluate(
             input_data.sql if (input_data is not None) else None
         )

-        return self._session.read_gbq(sql)
+        # TODO(b/395912450): Once the limitations with local data are
+        # resolved, consider setting allow_large_results only when expected
+        # data size is large.
+        return self._session.read_gbq_query(sql, allow_large_results=True)

     def llm_evaluate(
         self,
@@ -262,42 +302,62 @@ def llm_evaluate(
     ):
         sql = self._sql_generator.ml_llm_evaluate(input_data.sql, task_type)

-        return self._session.read_gbq(sql)
+        # TODO(b/395912450): Once the limitations with local data are
+        # resolved, consider setting allow_large_results only when expected
+        # data size is large.
+        return self._session.read_gbq_query(sql, allow_large_results=True)

     def arima_evaluate(self, show_all_candidate_models: bool = False):
         sql = self._sql_generator.ml_arima_evaluate(show_all_candidate_models)

-        return self._session.read_gbq(sql)
+        # TODO(b/395912450): Once the limitations with local data are
+        # resolved, consider setting allow_large_results only when expected
+        # data size is large.
+        return self._session.read_gbq_query(sql, allow_large_results=True)

     def arima_coefficients(self) -> bpd.DataFrame:
         sql = self._sql_generator.ml_arima_coefficients()

-        return self._session.read_gbq(sql)
+        # TODO(b/395912450): Once the limitations with local data are
+        # resolved, consider setting allow_large_results only when expected
+        # data size is large.
+        return self._session.read_gbq_query(sql, allow_large_results=True)

     def centroids(self) -> bpd.DataFrame:
         assert self._model.model_type == "KMEANS"

         sql = self._sql_generator.ml_centroids()

-        return self._session.read_gbq(
-            sql, index_col=["centroid_id", "feature"]
+        # TODO(b/395912450): Once the limitations with local data are
+        # resolved, consider setting allow_large_results only when expected
+        # data size is large.
+        return self._session.read_gbq_query(
+            sql, index_col=["centroid_id", "feature"], allow_large_results=True
         ).reset_index()

     def principal_components(self) -> bpd.DataFrame:
         assert self._model.model_type == "PCA"

         sql = self._sql_generator.ml_principal_components()

-        return self._session.read_gbq(
-            sql, index_col=["principal_component_id", "feature"]
+        # TODO(b/395912450): Once the limitations with local data are
+        # resolved, consider setting allow_large_results only when expected
+        # data size is large.
+        return self._session.read_gbq_query(
+            sql,
+            index_col=["principal_component_id", "feature"],
+            allow_large_results=True,
         ).reset_index()

     def principal_component_info(self) -> bpd.DataFrame:
         assert self._model.model_type == "PCA"

         sql = self._sql_generator.ml_principal_component_info()

-        return self._session.read_gbq(sql)
+        # TODO(b/395912450): Once the limitations with local data are
+        # resolved, consider setting allow_large_results only when expected
+        # data size is large.
+        return self._session.read_gbq_query(sql, allow_large_results=True)

     def copy(self, new_model_name: str, replace: bool = False) -> BqmlModel:
         job_config = self._session._prepare_copy_job_config()
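Every BQML read above gains the same TODO comment and a hard-coded allow_large_results=True, because ML results often contain nested JSON and can exceed 10 GB. A hypothetical helper, not part of this commit, that would express the shared pattern once:

def _read_ml_result(session, sql, index_col=None):
    """Hypothetical helper: run a BQML SQL statement and return a DataFrame."""
    kwargs = {} if index_col is None else {"index_col": index_col}
    # ML query results often contain nested JSON and may exceed 10 GB, so
    # they always take the large-results path for now.
    # TODO(b/395912450): once local-data limitations are resolved, enable
    # allow_large_results only when the expected result size is large.
    return session.read_gbq_query(sql, allow_large_results=True, **kwargs)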

bigframes/operations/ai.py

Lines changed: 4 additions & 0 deletions
@@ -566,6 +566,10 @@ def search(
                 column_to_search=embedding_result_column,
                 query=query_df,
                 top_k=top_k,
+                # TODO(tswast): set allow_large_results based on Series size.
+                # If we expect small results, it could be faster to set
+                # allow_large_results to False.
+                allow_large_results=True,
             )
             .rename(columns={"content": search_column})
             .set_index("index")
bigframes/pandas/io/api.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@ def read_gbq( # type: ignore[overload-overlap]
187187
use_cache: Optional[bool] = ...,
188188
col_order: Iterable[str] = ...,
189189
dry_run: Literal[False] = ...,
190+
allow_large_results: Optional[bool] = ...,
190191
) -> bigframes.dataframe.DataFrame:
191192
...
192193

@@ -203,6 +204,7 @@ def read_gbq(
203204
use_cache: Optional[bool] = ...,
204205
col_order: Iterable[str] = ...,
205206
dry_run: Literal[True] = ...,
207+
allow_large_results: Optional[bool] = ...,
206208
) -> pandas.Series:
207209
...
208210

@@ -218,6 +220,7 @@ def read_gbq(
218220
use_cache: Optional[bool] = None,
219221
col_order: Iterable[str] = (),
220222
dry_run: bool = False,
223+
allow_large_results: Optional[bool] = None,
221224
) -> bigframes.dataframe.DataFrame | pandas.Series:
222225
_set_default_session_location_if_possible(query_or_table)
223226
return global_session.with_default_session(
@@ -231,6 +234,7 @@ def read_gbq(
231234
use_cache=use_cache,
232235
col_order=col_order,
233236
dry_run=dry_run,
237+
allow_large_results=allow_large_results,
234238
)
235239

236240

@@ -400,6 +404,7 @@ def read_gbq_query( # type: ignore[overload-overlap]
400404
col_order: Iterable[str] = ...,
401405
filters: vendored_pandas_gbq.FiltersType = ...,
402406
dry_run: Literal[False] = ...,
407+
allow_large_results: Optional[bool] = ...,
403408
) -> bigframes.dataframe.DataFrame:
404409
...
405410

@@ -416,6 +421,7 @@ def read_gbq_query(
416421
col_order: Iterable[str] = ...,
417422
filters: vendored_pandas_gbq.FiltersType = ...,
418423
dry_run: Literal[True] = ...,
424+
allow_large_results: Optional[bool] = ...,
419425
) -> pandas.Series:
420426
...
421427

@@ -431,6 +437,7 @@ def read_gbq_query(
431437
col_order: Iterable[str] = (),
432438
filters: vendored_pandas_gbq.FiltersType = (),
433439
dry_run: bool = False,
440+
allow_large_results: Optional[bool] = None,
434441
) -> bigframes.dataframe.DataFrame | pandas.Series:
435442
_set_default_session_location_if_possible(query)
436443
return global_session.with_default_session(
@@ -444,6 +451,7 @@ def read_gbq_query(
444451
col_order=col_order,
445452
filters=filters,
446453
dry_run=dry_run,
454+
allow_large_results=allow_large_results,
447455
)
448456

449457

@@ -617,7 +625,11 @@ def from_glob_path(
617625

618626

619627
def _get_bqclient() -> bigquery.Client:
620-
clients_provider = bigframes.session.clients.ClientsProvider(
628+
# Address circular imports in doctest due to bigframes/session/__init__.py
629+
# containing a lot of logic and samples.
630+
from bigframes.session import clients
631+
632+
clients_provider = clients.ClientsProvider(
621633
project=config.options.bigquery.project,
622634
location=config.options.bigquery.location,
623635
use_regional_endpoints=config.options.bigquery.use_regional_endpoints,
@@ -631,11 +643,15 @@ def _get_bqclient() -> bigquery.Client:
631643

632644

633645
def _dry_run(query, bqclient) -> bigquery.QueryJob:
646+
# Address circular imports in doctest due to bigframes/session/__init__.py
647+
# containing a lot of logic and samples.
648+
from bigframes.session import metrics as bf_metrics
649+
634650
job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True))
635651

636652
# Fix for b/435183833. Log metrics even if a Session isn't available.
637-
if bigframes.session.metrics.LOGGING_NAME_ENV_VAR in os.environ:
638-
metrics = bigframes.session.metrics.ExecutionMetrics()
653+
if bf_metrics.LOGGING_NAME_ENV_VAR in os.environ:
654+
metrics = bf_metrics.ExecutionMetrics()
639655
metrics.count_job_stats(job)
640656
return job
641657

@@ -645,6 +661,10 @@ def _set_default_session_location_if_possible(query):
645661

646662

647663
def _set_default_session_location_if_possible_deferred_query(create_query):
664+
# Address circular imports in doctest due to bigframes/session/__init__.py
665+
# containing a lot of logic and samples.
666+
from bigframes.session._io import bigquery
667+
648668
# Set the location as per the query if this is the first query the user is
649669
# running and:
650670
# (1) Default session has not started yet, and
@@ -666,7 +686,7 @@ def _set_default_session_location_if_possible_deferred_query(create_query):
666686
query = create_query()
667687
bqclient = _get_bqclient()
668688

669-
if bigframes.session._io.bigquery.is_query(query):
689+
if bigquery.is_query(query):
670690
# Intentionally run outside of the session so that we can detect the
671691
# location before creating the session. Since it's a dry_run, labels
672692
# aren't necessary.
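The repeated one-line additions above thread allow_large_results through each typing overload as well as the implementation, since the return type already depends on dry_run. A self-contained sketch of that overload pattern with generic names (not the actual BigQuery DataFrames signatures):

from typing import Literal, Optional, Union, overload


@overload
def run_query(
    sql: str,
    *,
    dry_run: Literal[False] = ...,
    allow_large_results: Optional[bool] = ...,
) -> list:
    ...


@overload
def run_query(
    sql: str,
    *,
    dry_run: Literal[True],
    allow_large_results: Optional[bool] = ...,
) -> dict:
    ...


def run_query(
    sql: str,
    *,
    dry_run: bool = False,
    allow_large_results: Optional[bool] = None,
) -> Union[list, dict]:
    if dry_run:
        # Dry runs return metadata only, mirroring the pandas.Series overload.
        return {"sql": sql, "allow_large_results": allow_large_results}
    # A real run would execute the query; a list of rows stands in for a
    # DataFrame in this sketch.
    return [{"sql": sql, "allow_large_results": allow_large_results}]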
