Commit 74beca6

Merge branch 'main' into pangea-v1alpha
2 parents: 8162a2c + d461297

File tree

3 files changed: +114 -1 lines changed

google/cloud/bigquery/table.py

Lines changed: 44 additions & 0 deletions
@@ -1837,6 +1837,7 @@ def to_arrow_iterable(
         self,
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT,  # type: ignore
+        max_stream_count: Optional[int] = None,
     ) -> Iterator["pyarrow.RecordBatch"]:
         """[Beta] Create an iterable of :class:`pyarrow.RecordBatch`, to process the table as a stream.

@@ -1861,6 +1862,22 @@ def to_arrow_iterable(
                 created by the server. If ``max_queue_size`` is :data:`None`, the queue
                 size is infinite.

+            max_stream_count (Optional[int]):
+                The maximum number of parallel download streams when
+                using the BigQuery Storage API. Ignored if the
+                BigQuery Storage API is not used.
+
+                This setting also has no effect if the query result
+                is deterministically ordered with ORDER BY, in which
+                case the number of download streams is always 1.
+
+                If set to 0 or None (the default), the number of download
+                streams is determined by the BigQuery server. However, this
+                behaviour can require a lot of memory to store temporary
+                download results, especially with very large queries. In
+                that case, setting this parameter to a value > 0 can help
+                reduce system resource consumption.
+
         Returns:
             pyarrow.RecordBatch:
                 A generator of :class:`~pyarrow.RecordBatch`.

@@ -1877,6 +1894,7 @@ def to_arrow_iterable(
             preserve_order=self._preserve_order,
             selected_fields=self._selected_fields,
             max_queue_size=max_queue_size,
+            max_stream_count=max_stream_count,
         )
         tabledata_list_download = functools.partial(
             _pandas_helpers.download_arrow_row_iterator, iter(self.pages), self.schema
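
Not part of the commit: a minimal usage sketch of the new parameter, assuming application default credentials and an existing table; the table ID below is a placeholder.

```python
# Hedged usage sketch, assuming credentials and an existing table are set up;
# "my-project.my_dataset.my_table" is a placeholder, not a real resource.
from google.cloud import bigquery
from google.cloud import bigquery_storage

client = bigquery.Client()
bqstorage_client = bigquery_storage.BigQueryReadClient()

rows = client.list_rows("my-project.my_dataset.my_table")

# Cap the Storage API at 4 parallel download streams to bound client-side
# memory; 0 or None (the default) lets the BigQuery server decide.
for record_batch in rows.to_arrow_iterable(
    bqstorage_client=bqstorage_client, max_stream_count=4
):
    print(record_batch.num_rows)
```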
@@ -2003,6 +2021,7 @@ def to_dataframe_iterable(
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         dtypes: Optional[Dict[str, Any]] = None,
         max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT,  # type: ignore
+        max_stream_count: Optional[int] = None,
     ) -> "pandas.DataFrame":
         """Create an iterable of pandas DataFrames, to process the table as a stream.

@@ -2033,6 +2052,22 @@ def to_dataframe_iterable(

                 .. versionadded:: 2.14.0

+            max_stream_count (Optional[int]):
+                The maximum number of parallel download streams when
+                using the BigQuery Storage API. Ignored if the
+                BigQuery Storage API is not used.
+
+                This setting also has no effect if the query result
+                is deterministically ordered with ORDER BY, in which
+                case the number of download streams is always 1.
+
+                If set to 0 or None (the default), the number of download
+                streams is determined by the BigQuery server. However, this
+                behaviour can require a lot of memory to store temporary
+                download results, especially with very large queries. In
+                that case, setting this parameter to a value > 0 can help
+                reduce system resource consumption.
+
         Returns:
             pandas.DataFrame:
                 A generator of :class:`~pandas.DataFrame`.

@@ -2059,6 +2094,7 @@ def to_dataframe_iterable(
             preserve_order=self._preserve_order,
             selected_fields=self._selected_fields,
             max_queue_size=max_queue_size,
+            max_stream_count=max_stream_count,
         )
         tabledata_list_download = functools.partial(
             _pandas_helpers.download_dataframe_row_iterator,
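
The DataFrame counterpart, under the same assumptions as the sketch above (placeholder table ID, configured credentials):

```python
# Hedged usage sketch for the DataFrame path; same placeholder table as above.
from google.cloud import bigquery
from google.cloud import bigquery_storage

client = bigquery.Client()
bqstorage_client = bigquery_storage.BigQueryReadClient()
rows = client.list_rows("my-project.my_dataset.my_table")

# Each yielded DataFrame is one chunk of the streamed result; the stream cap
# again bounds the memory used for buffered downloads.
for frame in rows.to_dataframe_iterable(
    bqstorage_client=bqstorage_client, max_stream_count=4
):
    print(len(frame))
```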
@@ -2715,6 +2751,7 @@ def to_dataframe_iterable(
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         dtypes: Optional[Dict[str, Any]] = None,
         max_queue_size: Optional[int] = None,
+        max_stream_count: Optional[int] = None,
     ) -> Iterator["pandas.DataFrame"]:
         """Create an iterable of pandas DataFrames, to process the table as a stream.

@@ -2730,6 +2767,9 @@ def to_dataframe_iterable(
             max_queue_size:
                 Ignored. Added for compatibility with RowIterator.

+            max_stream_count:
+                Ignored. Added for compatibility with RowIterator.
+
         Returns:
             An iterator yielding a single empty :class:`~pandas.DataFrame`.

@@ -2744,6 +2784,7 @@ def to_arrow_iterable(
         self,
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         max_queue_size: Optional[int] = None,
+        max_stream_count: Optional[int] = None,
     ) -> Iterator["pyarrow.RecordBatch"]:
         """Create an iterable of :class:`pyarrow.RecordBatch`, to process the table as a stream.

@@ -2756,6 +2797,9 @@ def to_arrow_iterable(
             max_queue_size:
                 Ignored. Added for compatibility with RowIterator.

+            max_stream_count:
+                Ignored. Added for compatibility with RowIterator.
+
         Returns:
             An iterator yielding a single empty :class:`~pyarrow.RecordBatch`.
         """

samples/geography/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ google-crc32c==1.6.0; python_version >= '3.9'
 google-resumable-media==2.7.2
 googleapis-common-protos==1.66.0
 grpcio===1.62.2; python_version == '3.7'
-grpcio==1.67.1; python_version >= '3.8'
+grpcio==1.68.0; python_version >= '3.8'
 idna==3.10
 munch==4.0.0
 mypy-extensions==1.0.0

tests/unit/test_table.py

Lines changed: 69 additions & 0 deletions
@@ -5927,3 +5927,72 @@ def test_external_catalog_table_options_setter(
     # Confirm that the api_repr of the ecto_output matches the inputs
     result = table.to_api_repr()
     assert result == expected
+
+@pytest.mark.parametrize("preserve_order", [True, False])
+def test_to_arrow_iterable_w_bqstorage_max_stream_count(preserve_order):
+    pytest.importorskip("pandas")
+    pytest.importorskip("google.cloud.bigquery_storage")
+    from google.cloud.bigquery import schema
+    from google.cloud.bigquery import table as mut
+    from google.cloud import bigquery_storage
+
+    bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient)
+    session = bigquery_storage.types.ReadSession()
+    bqstorage_client.create_read_session.return_value = session
+
+    row_iterator = mut.RowIterator(
+        _mock_client(),
+        api_request=None,
+        path=None,
+        schema=[
+            schema.SchemaField("colA", "INTEGER"),
+        ],
+        table=mut.TableReference.from_string("proj.dset.tbl"),
+    )
+    row_iterator._preserve_order = preserve_order
+
+    max_stream_count = 132
+    result_iterable = row_iterator.to_arrow_iterable(
+        bqstorage_client=bqstorage_client, max_stream_count=max_stream_count
+    )
+    list(result_iterable)
+    bqstorage_client.create_read_session.assert_called_once_with(
+        parent=mock.ANY,
+        read_session=mock.ANY,
+        max_stream_count=max_stream_count if not preserve_order else 1,
+    )
+
+
+@pytest.mark.parametrize("preserve_order", [True, False])
+def test_to_dataframe_iterable_w_bqstorage_max_stream_count(preserve_order):
+    pytest.importorskip("pandas")
+    pytest.importorskip("google.cloud.bigquery_storage")
+    from google.cloud.bigquery import schema
+    from google.cloud.bigquery import table as mut
+    from google.cloud import bigquery_storage
+
+    bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient)
+    session = bigquery_storage.types.ReadSession()
+    bqstorage_client.create_read_session.return_value = session
+
+    row_iterator = mut.RowIterator(
+        _mock_client(),
+        api_request=None,
+        path=None,
+        schema=[
+            schema.SchemaField("colA", "INTEGER"),
+        ],
+        table=mut.TableReference.from_string("proj.dset.tbl"),
+    )
+    row_iterator._preserve_order = preserve_order
+
+    max_stream_count = 132
+    result_iterable = row_iterator.to_dataframe_iterable(
+        bqstorage_client=bqstorage_client, max_stream_count=max_stream_count
+    )
+    list(result_iterable)
+    bqstorage_client.create_read_session.assert_called_once_with(
+        parent=mock.ANY,
+        read_session=mock.ANY,
+        max_stream_count=max_stream_count if not preserve_order else 1,
+    )
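
The assertion `max_stream_count=max_stream_count if not preserve_order else 1` pins down the contract: an ordered (`preserve_order`) result always requests a single stream from `create_read_session`, so row ordering stays deterministic; otherwise the caller's cap passes through. A standalone sketch of that selection logic follows (the helper name `effective_max_stream_count` is hypothetical, not the library's internal code):

```python
# Hypothetical helper mirroring the behaviour the tests assert above;
# an illustration only, not the library's implementation.
from typing import Optional


def effective_max_stream_count(
    preserve_order: bool, max_stream_count: Optional[int]
) -> Optional[int]:
    # Ordered results must be read from a single stream to stay deterministic;
    # otherwise the caller's cap (or None for server-chosen) passes through.
    return 1 if preserve_order else max_stream_count


assert effective_max_stream_count(True, 132) == 1
assert effective_max_stream_count(False, 132) == 132
```

These tests can be run selectively with `pytest tests/unit/test_table.py -k max_stream_count`.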
