Skip to content

Commit 5cd1b07

Browse files
committed
Merge branch 'main' into jco-ga
2 parents 698fdbb + cb646ce commit 5cd1b07

File tree

6 files changed

+187
-187
lines changed

6 files changed

+187
-187
lines changed

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 66 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -508,31 +508,37 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
508508
bq_schema_unused = set()
509509

510510
bq_schema_out = []
511-
unknown_type_fields = []
512-
511+
unknown_type_columns = []
512+
dataframe_reset_index = dataframe.reset_index()
513513
for column, dtype in list_columns_and_indexes(dataframe):
514-
# Use provided type from schema, if present.
514+
# Step 1: use provided type from schema, if present.
515515
bq_field = bq_schema_index.get(column)
516516
if bq_field:
517517
bq_schema_out.append(bq_field)
518518
bq_schema_unused.discard(bq_field.name)
519519
continue
520520

521-
# Otherwise, try to automatically determine the type based on the
521+
# Step 2: try to automatically determine the type based on the
522522
# pandas dtype.
523523
bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
524524
if bq_type is None:
525-
sample_data = _first_valid(dataframe.reset_index()[column])
525+
sample_data = _first_valid(dataframe_reset_index[column])
526526
if (
527527
isinstance(sample_data, _BaseGeometry)
528528
and sample_data is not None # Paranoia
529529
):
530530
bq_type = "GEOGRAPHY"
531-
bq_field = schema.SchemaField(column, bq_type)
532-
bq_schema_out.append(bq_field)
531+
if bq_type is not None:
532+
bq_schema_out.append(schema.SchemaField(column, bq_type))
533+
continue
534+
535+
# Step 3: try with pyarrow if available
536+
bq_field = _get_schema_by_pyarrow(column, dataframe_reset_index[column])
537+
if bq_field is not None:
538+
bq_schema_out.append(bq_field)
539+
continue
533540

534-
if bq_field.field_type is None:
535-
unknown_type_fields.append(bq_field)
541+
unknown_type_columns.append(column)
536542

537543
# Catch any schema mismatch. The developer explicitly asked to serialize a
538544
# column, but it was not found.
@@ -543,98 +549,70 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
543549
)
544550
)
545551

546-
# If schema detection was not successful for all columns, also try with
547-
# pyarrow, if available.
548-
if unknown_type_fields:
549-
if not pyarrow:
550-
msg = "Could not determine the type of columns: {}".format(
551-
", ".join(field.name for field in unknown_type_fields)
552-
)
553-
warnings.warn(msg)
554-
return None # We cannot detect the schema in full.
555-
556-
# The augment_schema() helper itself will also issue unknown type
557-
# warnings if detection still fails for any of the fields.
558-
bq_schema_out = augment_schema(dataframe, bq_schema_out)
552+
if unknown_type_columns != []:
553+
msg = "Could not determine the type of columns: {}".format(
554+
", ".join(unknown_type_columns)
555+
)
556+
warnings.warn(msg)
557+
return None # We cannot detect the schema in full.
559558

560-
return tuple(bq_schema_out) if bq_schema_out else None
559+
return tuple(bq_schema_out)
561560

562561

563-
def augment_schema(dataframe, current_bq_schema):
564-
"""Try to deduce the unknown field types and return an improved schema.
562+
def _get_schema_by_pyarrow(name, series):
563+
"""Attempt to detect the type of the given series by leveraging PyArrow's
564+
type detection capabilities.
565565
566-
This function requires ``pyarrow`` to run. If all the missing types still
567-
cannot be detected, ``None`` is returned. If all types are already known,
568-
a shallow copy of the given schema is returned.
566+
This function requires the ``pyarrow`` library to be installed and
567+
available. If the series type cannot be determined or ``pyarrow`` is not
568+
available, ``None`` is returned.
569569
570570
Args:
571-
dataframe (pandas.DataFrame):
572-
DataFrame for which some of the field types are still unknown.
573-
current_bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
574-
A BigQuery schema for ``dataframe``. The types of some or all of
575-
the fields may be ``None``.
571+
name (str):
572+
the column name of the SchemaField.
573+
series (pandas.Series):
574+
The Series data for which to detect the data type.
576575
Returns:
577-
Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]
576+
Optional[google.cloud.bigquery.schema.SchemaField]:
577+
A ``SchemaField`` whose type is a BigQuery-compatible type string (e.g.,
578+
"STRING", "INTEGER", "TIMESTAMP", "DATETIME", "NUMERIC", "BIGNUMERIC")
579+
and whose mode is "NULLABLE" or "REPEATED".
580+
Returns ``None`` if the type cannot be determined or ``pyarrow``
581+
is not imported.
578582
"""
579-
# pytype: disable=attribute-error
580-
augmented_schema = []
581-
unknown_type_fields = []
582-
for field in current_bq_schema:
583-
if field.field_type is not None:
584-
augmented_schema.append(field)
585-
continue
586-
587-
arrow_table = pyarrow.array(dataframe.reset_index()[field.name])
588-
589-
if pyarrow.types.is_list(arrow_table.type):
590-
# `pyarrow.ListType`
591-
detected_mode = "REPEATED"
592-
detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(
593-
arrow_table.values.type.id
594-
)
595-
596-
# For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds
597-
# it to such datetimes, causing them to be recognized as TIMESTAMP type.
598-
# We thus additionally check the actual data to see if we need to overrule
599-
# that and choose DATETIME instead.
600-
# Note that this should only be needed for datetime values inside a list,
601-
# since scalar datetime values have a proper Pandas dtype that allows
602-
# distinguishing between timezone-naive and timezone-aware values before
603-
# even requiring the additional schema augment logic in this method.
604-
if detected_type == "TIMESTAMP":
605-
valid_item = _first_array_valid(dataframe[field.name])
606-
if isinstance(valid_item, datetime) and valid_item.tzinfo is None:
607-
detected_type = "DATETIME"
608-
else:
609-
detected_mode = field.mode
610-
detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id)
611-
if detected_type == "NUMERIC" and arrow_table.type.scale > 9:
612-
detected_type = "BIGNUMERIC"
613583

614-
if detected_type is None:
615-
unknown_type_fields.append(field)
616-
continue
584+
if not pyarrow:
585+
return None
617586

618-
new_field = schema.SchemaField(
619-
name=field.name,
620-
field_type=detected_type,
621-
mode=detected_mode,
622-
description=field.description,
623-
fields=field.fields,
624-
)
625-
augmented_schema.append(new_field)
587+
arrow_table = pyarrow.array(series)
588+
if pyarrow.types.is_list(arrow_table.type):
589+
# `pyarrow.ListType`
590+
mode = "REPEATED"
591+
type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.values.type.id)
592+
593+
# For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds
594+
# it to such datetimes, causing them to be recognized as TIMESTAMP type.
595+
# We thus additionally check the actual data to see if we need to overrule
596+
# that and choose DATETIME instead.
597+
# Note that this should only be needed for datetime values inside a list,
598+
# since scalar datetime values have a proper Pandas dtype that allows
599+
# distinguishing between timezone-naive and timezone-aware values before
600+
# even requiring the additional schema augment logic in this method.
601+
if type == "TIMESTAMP":
602+
valid_item = _first_array_valid(series)
603+
if isinstance(valid_item, datetime) and valid_item.tzinfo is None:
604+
type = "DATETIME"
605+
else:
606+
mode = "NULLABLE" # default mode
607+
type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id)
608+
if type == "NUMERIC" and arrow_table.type.scale > 9:
609+
type = "BIGNUMERIC"
626610

627-
if unknown_type_fields:
628-
warnings.warn(
629-
"Pyarrow could not determine the type of columns: {}.".format(
630-
", ".join(field.name for field in unknown_type_fields)
631-
)
632-
)
611+
if type is not None:
612+
return schema.SchemaField(name, type, mode)
613+
else:
633614
return None
634615

635-
return augmented_schema
636-
# pytype: enable=attribute-error
637-
638616

639617
def dataframe_to_arrow(dataframe, bq_schema):
640618
"""Convert pandas dataframe to Arrow table, using BigQuery schema.

google/cloud/bigquery/job/base.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,26 @@ def job_timeout_ms(self, value):
224224
else:
225225
self._properties.pop("jobTimeoutMs", None)
226226

227+
@property
228+
def reservation(self):
229+
"""str: Optional. The reservation that the job would use.
230+
231+
User can specify a reservation to execute the job. If reservation is
232+
not set, reservation is determined based on the rules defined by the
233+
reservation assignments. The expected format is
234+
projects/{project}/locations/{location}/reservations/{reservation}.
235+
236+
Raises:
237+
ValueError: If ``value`` is neither ``None`` nor a string.
238+
"""
239+
return self._properties.setdefault("reservation", None)
240+
241+
@reservation.setter
242+
def reservation(self, value):
243+
if value and not isinstance(value, str):
244+
raise ValueError("Reservation must be None or a string.")
245+
self._properties["reservation"] = value
246+
227247
@property
228248
def labels(self):
229249
"""Dict[str, str]: Labels for the job.
@@ -488,6 +508,18 @@ def location(self):
488508
"""str: Location where the job runs."""
489509
return _helpers._get_sub_prop(self._properties, ["jobReference", "location"])
490510

511+
@property
512+
def reservation_id(self):
513+
"""str: Name of the primary reservation assigned to this job.
514+
515+
Note that this could be different from the reservations reported in
516+
the reservation field if parent reservations were used to execute
517+
this job.
518+
"""
519+
return _helpers._get_sub_prop(
520+
self._properties, ["statistics", "reservation_id"]
521+
)
522+
491523
def _require_client(self, client):
492524
"""Check client or verify over-ride.
493525

google/cloud/bigquery/schema.py

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -284,15 +284,13 @@ def name(self):
284284
return self._properties.get("name", "")
285285

286286
@property
287-
def field_type(self):
287+
def field_type(self) -> str:
288288
"""str: The type of the field.
289289
290290
See:
291291
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema.FIELDS.type
292292
"""
293293
type_ = self._properties.get("type")
294-
if type_ is None: # Shouldn't happen, but some unit tests do this.
295-
return None
296294
return cast(str, type_).upper()
297295

298296
@property
@@ -397,20 +395,16 @@ def _key(self):
397395
Returns:
398396
Tuple: The contents of this :class:`~google.cloud.bigquery.schema.SchemaField`.
399397
"""
400-
field_type = self.field_type.upper() if self.field_type is not None else None
401-
402-
# Type can temporarily be set to None if the code needs a SchemaField instance,
403-
# but has not determined the exact type of the field yet.
404-
if field_type is not None:
405-
if field_type == "STRING" or field_type == "BYTES":
406-
if self.max_length is not None:
407-
field_type = f"{field_type}({self.max_length})"
408-
elif field_type.endswith("NUMERIC"):
409-
if self.precision is not None:
410-
if self.scale is not None:
411-
field_type = f"{field_type}({self.precision}, {self.scale})"
412-
else:
413-
field_type = f"{field_type}({self.precision})"
398+
field_type = self.field_type
399+
if field_type == "STRING" or field_type == "BYTES":
400+
if self.max_length is not None:
401+
field_type = f"{field_type}({self.max_length})"
402+
elif field_type.endswith("NUMERIC"):
403+
if self.precision is not None:
404+
if self.scale is not None:
405+
field_type = f"{field_type}({self.precision}, {self.scale})"
406+
else:
407+
field_type = f"{field_type}({self.precision})"
414408

415409
policy_tags = (
416410
None if self.policy_tags is None else tuple(sorted(self.policy_tags.names))

tests/unit/job/test_base.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,16 @@ def test_state(self):
443443
status["state"] = state
444444
self.assertEqual(job.state, state)
445445

446+
def test_reservation_id(self):
447+
reservation_id = "RESERVATION-ID"
448+
client = _make_client(project=self.PROJECT)
449+
job = self._make_one(self.JOB_ID, client)
450+
self.assertIsNone(job.reservation_id)
451+
stats = job._properties["statistics"] = {}
452+
self.assertIsNone(job.reservation_id)
453+
stats["reservation_id"] = reservation_id
454+
self.assertEqual(job.reservation_id, reservation_id)
455+
446456
def _set_properties_job(self):
447457
client = _make_client(project=self.PROJECT)
448458
job = self._make_one(self.JOB_ID, client)
@@ -1188,31 +1198,37 @@ def test_fill_query_job_config_from_default(self):
11881198
job_config = QueryJobConfig()
11891199
job_config.dry_run = True
11901200
job_config.maximum_bytes_billed = 1000
1201+
job_config.reservation = "reservation_1"
11911202

11921203
default_job_config = QueryJobConfig()
11931204
default_job_config.use_query_cache = True
11941205
default_job_config.maximum_bytes_billed = 2000
1206+
default_job_config.reservation = "reservation_2"
11951207

11961208
final_job_config = job_config._fill_from_default(default_job_config)
11971209
self.assertTrue(final_job_config.dry_run)
11981210
self.assertTrue(final_job_config.use_query_cache)
11991211
self.assertEqual(final_job_config.maximum_bytes_billed, 1000)
1212+
self.assertEqual(final_job_config.reservation, "reservation_1")
12001213

12011214
def test_fill_load_job_from_default(self):
12021215
from google.cloud.bigquery import LoadJobConfig
12031216

12041217
job_config = LoadJobConfig()
12051218
job_config.create_session = True
12061219
job_config.encoding = "UTF-8"
1220+
job_config.reservation = "reservation_1"
12071221

12081222
default_job_config = LoadJobConfig()
12091223
default_job_config.ignore_unknown_values = True
12101224
default_job_config.encoding = "ISO-8859-1"
1225+
default_job_config.reservation = "reservation_2"
12111226

12121227
final_job_config = job_config._fill_from_default(default_job_config)
12131228
self.assertTrue(final_job_config.create_session)
12141229
self.assertTrue(final_job_config.ignore_unknown_values)
12151230
self.assertEqual(final_job_config.encoding, "UTF-8")
1231+
self.assertEqual(final_job_config.reservation, "reservation_1")
12161232

12171233
def test_fill_from_default_conflict(self):
12181234
from google.cloud.bigquery import QueryJobConfig
@@ -1232,10 +1248,12 @@ def test_fill_from_empty_default_conflict(self):
12321248
job_config = QueryJobConfig()
12331249
job_config.dry_run = True
12341250
job_config.maximum_bytes_billed = 1000
1251+
job_config.reservation = "reservation_1"
12351252

12361253
final_job_config = job_config._fill_from_default(default_job_config=None)
12371254
self.assertTrue(final_job_config.dry_run)
12381255
self.assertEqual(final_job_config.maximum_bytes_billed, 1000)
1256+
self.assertEqual(final_job_config.reservation, "reservation_1")
12391257

12401258
@mock.patch("google.cloud.bigquery._helpers._get_sub_prop")
12411259
def test__get_sub_prop_wo_default(self, _get_sub_prop):
@@ -1338,3 +1356,27 @@ def test_job_timeout_properties(self):
13381356
job_config.job_timeout_ms = None
13391357
assert job_config.job_timeout_ms is None
13401358
assert "jobTimeoutMs" not in job_config._properties
1359+
1360+
def test_reservation_miss(self):
1361+
job_config = self._make_one()
1362+
self.assertEqual(job_config.reservation, None)
1363+
1364+
def test_reservation_hit(self):
1365+
job_config = self._make_one()
1366+
job_config._properties["reservation"] = "foo"
1367+
self.assertEqual(job_config.reservation, "foo")
1368+
1369+
def test_reservation_update_in_place(self):
1370+
job_config = self._make_one()
1371+
job_config.reservation = "bar" # update in place
1372+
self.assertEqual(job_config.reservation, "bar")
1373+
1374+
def test_reservation_setter_invalid(self):
1375+
job_config = self._make_one()
1376+
with self.assertRaises(ValueError):
1377+
job_config.reservation = object()
1378+
1379+
def test_reservation_setter(self):
1380+
job_config = self._make_one()
1381+
job_config.reservation = "foo"
1382+
self.assertEqual(job_config._properties["reservation"], "foo")

0 commit comments

Comments
 (0)