
Commit 83c2ef5

Merge branch 'pangea-v1alpha' into feat-b358215039-add-attrs-to-schemafield
2 parents: 808a925 + 74beca6

File tree: 5 files changed (+215 lines, −64 lines)

google/cloud/bigquery/schema.py

Lines changed: 16 additions & 58 deletions
@@ -24,7 +24,6 @@
 from google.cloud.bigquery import standard_sql
 from google.cloud.bigquery._helpers import (
     _isinstance_or_raise,
-    _from_api_repr,
     _get_sub_prop,
 )
 from google.cloud.bigquery.enums import StandardSqlTypeNames, RoundingMode
@@ -548,6 +547,7 @@ def _to_schema_fields(schema):
            sequence is not a :class:`~google.cloud.bigquery.schema.SchemaField`
            instance or a compatible mapping representation of the field.
     """
+
     for field in schema:
         if not isinstance(field, (SchemaField, collections.abc.Mapping)):
             raise ValueError(
@@ -645,61 +645,6 @@ def to_api_repr(self) -> dict:
         return answer


-class TableSchema:
-    """Schema of a table
-
-    Args:
-        fields (Optional[list]): Describes the fields in a table.
-        foreignTypeInfo (Optional[str]): Specifies metadata of the foreign data type
-            definition in field schema.
-    """
-
-    def __init__(
-        self, fields: Optional[list] = None, foreign_type_info: Optional[str] = None
-    ):
-        self._properties: Dict[str, Any] = {}
-        self.fields = fields
-        self.foreign_type_info = foreign_type_info
-
-    @property
-    def fields(self) -> Any:
-        """Describes the fields in a table."""
-
-        return self._properties.get("fields")
-
-    @fields.setter
-    def fields(self, value: list, dtype: str) -> None:
-        value = _isinstance_or_raise(value, list, none_allowed=True)
-        self._properties["fields"] = value
-
-    @property
-    def foreign_type_info(self) -> Any:
-        """Optional. Specifies metadata of the foreign data type definition in
-        field schema (TableFieldSchema.foreign_type_definition)."""
-
-        return self._properties.get("foreignTypeInfo")
-
-    @foreign_type_info.setter
-    def foreign_type_info(self, value: str, dtype: str) -> None:
-        if not isinstance(value, str):
-            raise ValueError(
-                f"Pass {value} as a '{repr(dtype)}'." f"Got {type(value)}."
-            )
-        self._properties["foreignTypeInfo"] = value
-
-    def to_api_repr(self) -> dict:
-        """Build an API representation of this object.
-
-        Returns:
-            Dict[str, Any]:
-                A dictionary in the format used by the BigQuery API.
-        """
-        return copy.deepcopy(self._properties)
-
-    def from_api_repr(self, resource):
-        return _from_api_repr(self, resource)
-
-
 class ForeignTypeInfo:
     """Metadata about the foreign data type definition such as the system in which the
     type is defined.
@@ -734,8 +679,21 @@ def to_api_repr(self) -> dict:
         """
         return copy.deepcopy(self._properties)

-    def from_api_repr(self, resource):
-        return _from_api_repr(self, resource)
+    @classmethod
+    def from_api_repr(cls, resource):
+        """Factory: constructs an instance of the class (cls)
+        given its API representation.
+
+        Args:
+            resource (Dict[str, Any]):
+                API representation of the object to be instantiated.
+
+        Returns:
+            An instance of the class initialized with data from 'resource'.
+        """
+        config = cls()
+        config._properties = copy.deepcopy(resource)
+        return config


 class StorageDescriptor:
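Net effect in schema.py: the placeholder TableSchema class and the shared `_from_api_repr` helper are removed, and ForeignTypeInfo gains a conventional `from_api_repr` classmethod. A minimal round-trip sketch of the new factory, modeled directly on the unit test added in this commit (the resource dict and its `typeSystem` key come from that test):

    from google.cloud.bigquery.schema import ForeignTypeInfo

    # API resource dict -> ForeignTypeInfo -> API resource dict round trip.
    resource = {"typeSystem": "TYPE_SYSTEM_UNSPECIFIED"}
    info = ForeignTypeInfo.from_api_repr(resource)

    # from_api_repr() deep-copies the resource into the new instance's
    # _properties, so to_api_repr() reproduces the original dict.
    assert info.to_api_repr() == resource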

google/cloud/bigquery/table.py

Lines changed: 44 additions & 0 deletions
@@ -1837,6 +1837,7 @@ def to_arrow_iterable(
         self,
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT,  # type: ignore
+        max_stream_count: Optional[int] = None,
     ) -> Iterator["pyarrow.RecordBatch"]:
         """[Beta] Create an iterable of class:`pyarrow.RecordBatch`, to process the table as a stream.

@@ -1861,6 +1862,22 @@
                created by the server. If ``max_queue_size`` is :data:`None`, the queue
                size is infinite.

+            max_stream_count (Optional[int]):
+                The maximum number of parallel download streams when using
+                the BigQuery Storage API. Ignored if the BigQuery Storage API
+                is not used.
+
+                This setting also has no effect if the query result
+                is deterministically ordered with ORDER BY,
+                in which case the number of download streams is always 1.
+
+                If set to 0 or None (the default), the number of download
+                streams is determined by the BigQuery server. However, this
+                behaviour can require a lot of memory to store temporary
+                download results, especially with very large queries. In that
+                case, setting this parameter to a value > 0 can help reduce
+                system resource consumption.
+
         Returns:
             pyarrow.RecordBatch:
                 A generator of :class:`~pyarrow.RecordBatch`.
@@ -1877,6 +1894,7 @@
             preserve_order=self._preserve_order,
             selected_fields=self._selected_fields,
             max_queue_size=max_queue_size,
+            max_stream_count=max_stream_count,
         )
         tabledata_list_download = functools.partial(
             _pandas_helpers.download_arrow_row_iterator, iter(self.pages), self.schema
@@ -2003,6 +2021,7 @@ def to_dataframe_iterable(
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         dtypes: Optional[Dict[str, Any]] = None,
         max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT,  # type: ignore
+        max_stream_count: Optional[int] = None,
     ) -> "pandas.DataFrame":
         """Create an iterable of pandas DataFrames, to process the table as a stream.

@@ -2033,6 +2052,22 @@ def to_dataframe_iterable(

                .. versionadded:: 2.14.0

+            max_stream_count (Optional[int]):
+                The maximum number of parallel download streams when using
+                the BigQuery Storage API. Ignored if the BigQuery Storage API
+                is not used.
+
+                This setting also has no effect if the query result
+                is deterministically ordered with ORDER BY,
+                in which case the number of download streams is always 1.
+
+                If set to 0 or None (the default), the number of download
+                streams is determined by the BigQuery server. However, this
+                behaviour can require a lot of memory to store temporary
+                download results, especially with very large queries. In that
+                case, setting this parameter to a value > 0 can help reduce
+                system resource consumption.
+
         Returns:
             pandas.DataFrame:
                 A generator of :class:`~pandas.DataFrame`.
@@ -2059,6 +2094,7 @@ def to_dataframe_iterable(
             preserve_order=self._preserve_order,
             selected_fields=self._selected_fields,
             max_queue_size=max_queue_size,
+            max_stream_count=max_stream_count,
         )
         tabledata_list_download = functools.partial(
             _pandas_helpers.download_dataframe_row_iterator,
@@ -2715,6 +2751,7 @@ def to_dataframe_iterable(
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         dtypes: Optional[Dict[str, Any]] = None,
         max_queue_size: Optional[int] = None,
+        max_stream_count: Optional[int] = None,
     ) -> Iterator["pandas.DataFrame"]:
         """Create an iterable of pandas DataFrames, to process the table as a stream.

@@ -2730,6 +2767,9 @@ def to_dataframe_iterable(
            max_queue_size:
                Ignored. Added for compatibility with RowIterator.

+            max_stream_count:
+                Ignored. Added for compatibility with RowIterator.
+
         Returns:
             An iterator yielding a single empty :class:`~pandas.DataFrame`.

@@ -2744,6 +2784,7 @@ def to_arrow_iterable(
         self,
         bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
         max_queue_size: Optional[int] = None,
+        max_stream_count: Optional[int] = None,
     ) -> Iterator["pyarrow.RecordBatch"]:
         """Create an iterable of pandas DataFrames, to process the table as a stream.

@@ -2756,6 +2797,9 @@ def to_arrow_iterable(
            max_queue_size:
                Ignored. Added for compatibility with RowIterator.

+            max_stream_count:
+                Ignored. Added for compatibility with RowIterator.
+
         Returns:
             An iterator yielding a single empty :class:`~pyarrow.RecordBatch`.
         """

samples/geography/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ google-crc32c==1.6.0; python_version >= '3.9'
 google-resumable-media==2.7.2
 googleapis-common-protos==1.66.0
 grpcio===1.62.2; python_version == '3.7'
-grpcio==1.67.1; python_version >= '3.8'
+grpcio==1.68.0; python_version >= '3.8'
 idna==3.10
 munch==4.0.0
 mypy-extensions==1.0.0

tests/unit/test_schema.py

Lines changed: 85 additions & 5 deletions
@@ -17,7 +17,7 @@
 from google.cloud.bigquery.standard_sql import StandardSqlStructType
 from google.cloud.bigquery.schema import (
     PolicyTagList,
-    # ForeignTypeInfo,
+    ForeignTypeInfo,
     StorageDescriptor,
     SerDeInfo,
 )
@@ -1158,8 +1158,6 @@ class TestForeignTypeInfo:

     @staticmethod
     def _get_target_class():
-        from google.cloud.bigquery.schema import ForeignTypeInfo
-
         return ForeignTypeInfo

     def _make_one(self, *args, **kw):
@@ -1197,6 +1195,39 @@ def test_to_api_repr(self, type_system, expected):
         result = self._make_one(type_system=type_system)
         assert result.to_api_repr() == expected

+    def test_from_api_repr(self):
+        """GIVEN an API representation of a ForeignTypeInfo object (i.e. resource)
+        WHEN converted into a ForeignTypeInfo object using from_api_repr() and
+        displayed as a dict
+        THEN it will have the same representation as a ForeignTypeInfo object created
+        directly (via _make_one()) and displayed as a dict.
+        """
+        resource = {"typeSystem": "TYPE_SYSTEM_UNSPECIFIED"}
+
+        expected = self._make_one(type_system="TYPE_SYSTEM_UNSPECIFIED")
+
+        klass = self._get_target_class()
+        result = klass.from_api_repr(resource)
+
+        assert result.to_api_repr() == expected.to_api_repr()
+
+
+@pytest.fixture
+def _make_storage_descriptor():
+    serdeinfo = SerDeInfo(
+        serialization_library="testpath.to.LazySimpleSerDe",
+        name="serde_lib_name",
+        parameters={"key": "value"},
+    )
+
+    obj = StorageDescriptor(
+        input_format="testpath.to.OrcInputFormat",
+        location_uri="gs://test/path/",
+        output_format="testpath.to.OrcOutputFormat",
+        serde_info=serdeinfo,
+    )
+    return obj
+

 class TestStorageDescriptor:
     """Tests for the StorageDescriptor class."""
@@ -1288,7 +1319,34 @@ def test_to_api_repr(self):

         assert storage_descriptor.to_api_repr() == expected_repr

-    # TODO: needs a from_api_repr() test.
+    SERDEINFO = SerDeInfo(
+        serialization_library="testpath.to.LazySimpleSerDe",
+        name="serde_lib_name",
+        parameters={"key": "value"},
+    )
+
+    API_REPR = {
+        "inputFormat": "testpath.to.OrcInputFormat",
+        "locationUri": "gs://test/path/",
+        "outputFormat": "testpath.to.OrcOutputFormat",
+        "serDeInfo": SERDEINFO.to_api_repr(),
+    }
+
+    def test_from_api_repr(self, _make_storage_descriptor):
+        """GIVEN an API representation of a StorageDescriptor (i.e. API_REPR)
+        WHEN converted into a StorageDescriptor using from_api_repr() and
+        displayed as a dict
+        THEN it will have the same representation as a StorageDescriptor created
+        directly (via the fixture) and displayed as a dict.
+        """
+        # generate via fixture
+        expected = _make_storage_descriptor
+        resource = self.API_REPR
+        klass = self._get_target_class()
+        # generate via API_REPR
+        result = klass.from_api_repr(resource)
+
+        assert result.to_api_repr() == expected.to_api_repr()


 class TestSerDeInfo:
@@ -1352,4 +1410,26 @@ def test_to_api_repr(self):
         }
         assert serde_info.to_api_repr() == expected_repr

-    # TODO: needs a from_api_repr() test.
+    def test_from_api_repr(self, _make_storage_descriptor):
+        """GIVEN an API representation of a SerDeInfo object (i.e. resource)
+        WHEN converted into a SerDeInfo using from_api_repr() and
+        displayed as a dict
+        THEN it will have the same representation as a SerDeInfo object created
+        directly (via _make_one()) and displayed as a dict.
+        """
+        resource = {
+            "serializationLibrary": "testpath.to.LazySimpleSerDe",
+            "name": "serde_name",
+            "parameters": {"key": "value"},
+        }
+
+        expected = self._make_one(
+            serialization_library="testpath.to.LazySimpleSerDe",
+            name="serde_name",
+            parameters={"key": "value"},
+        )
+
+        klass = self._get_target_class()
+        result = klass.from_api_repr(resource)
+
+        assert result.to_api_repr() == expected.to_api_repr()
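All three new tests follow the same round-trip pattern: construct the object directly, construct it again from an API resource dict via `from_api_repr()`, and compare the `to_api_repr()` output of both. Outside pytest, the SerDeInfo case from the test above reduces to roughly this sketch (resource keys and constructor arguments copied from the test; it assumes SerDeInfo exposes the same classmethod the test exercises):

    from google.cloud.bigquery.schema import SerDeInfo

    resource = {
        "serializationLibrary": "testpath.to.LazySimpleSerDe",
        "name": "serde_name",
        "parameters": {"key": "value"},
    }

    # Building from the API representation should be equivalent to building
    # with keyword arguments and serializing back out.
    from_resource = SerDeInfo.from_api_repr(resource)
    direct = SerDeInfo(
        serialization_library="testpath.to.LazySimpleSerDe",
        name="serde_name",
        parameters={"key": "value"},
    )
    assert from_resource.to_api_repr() == direct.to_api_repr()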
