From d12b1bd171a9df2812bd26107425718b93806571 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 20 Sep 2024 14:15:45 -0500
Subject: [PATCH 1/3] fix!: `to_gbq` loads `uint8` columns to BigQuery INT64
 instead of STRING

fix!: `to_gbq` loads naive (no timezone) columns to BigQuery DATETIME
instead of TIMESTAMP
fix!: `to_gbq` loads object column containing bool values to BOOLEAN
instead of STRING
fix!: `to_gbq` loads object column containing dictionary values to STRUCT
instead of STRING
---
 noxfile.py                                    |   1 +
 owlbot.py                                     |   2 +-
 pandas_gbq/core/__init__.py                   |   3 +
 pandas_gbq/core/pandas.py                     |  70 +++++
 pandas_gbq/gbq.py                             |  12 +-
 pandas_gbq/load.py                            |  10 +-
 pandas_gbq/{schema.py => schema/__init__.py}  |  31 ---
 pandas_gbq/schema/bigquery.py                 |  44 +++
 pandas_gbq/schema/pandas_to_bigquery.py       | 218 +++++++++++++++
 pandas_gbq/schema/pyarrow_to_bigquery.py      |  67 +++++
 setup.py                                      |   3 +-
 testing/constraints-3.8.txt                   |   4 +-
 tests/system/test_to_gbq.py                   | 256 +++++++++++++++++-
 tests/unit/schema/__init__.py                 |   3 +
 tests/unit/schema/test_pandas_to_bigquery.py  | 156 +++++++++++
 tests/unit/schema/test_pyarrow_to_bigquery.py |  25 ++
 tests/unit/test_load.py                       |   5 +-
 tests/unit/test_schema.py                     | 141 ++++++++--
 18 files changed, 977 insertions(+), 74 deletions(-)
 create mode 100644 pandas_gbq/core/__init__.py
 create mode 100644 pandas_gbq/core/pandas.py
 rename pandas_gbq/{schema.py => schema/__init__.py} (85%)
 create mode 100644 pandas_gbq/schema/bigquery.py
 create mode 100644 pandas_gbq/schema/pandas_to_bigquery.py
 create mode 100644 pandas_gbq/schema/pyarrow_to_bigquery.py
 create mode 100644 tests/unit/schema/__init__.py
 create mode 100644 tests/unit/schema/test_pandas_to_bigquery.py
 create mode 100644 tests/unit/schema/test_pyarrow_to_bigquery.py

diff --git a/noxfile.py b/noxfile.py
index d316dac8..02cd052d 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -51,6 +51,7 @@
 UNIT_TEST_EXTRAS = [
     "bqstorage",
     "tqdm",
+    "geopandas",
 ]
 UNIT_TEST_EXTRAS_BY_PYTHON = {
     "3.9": [],
diff --git a/owlbot.py b/owlbot.py
index 916a7074..190298a6 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -32,7 +32,7 @@
     # Use a middle version of Python to test when no extras are installed.
     "3.9": []
 }
-extras = ["tqdm"]
+extras = ["tqdm", "geopandas"]
 templated_files = common.py_library(
     unit_test_python_versions=["3.8", "3.9", "3.10", "3.11", "3.12"],
     system_test_python_versions=["3.8", "3.9", "3.10", "3.11", "3.12"],
diff --git a/pandas_gbq/core/__init__.py b/pandas_gbq/core/__init__.py
new file mode 100644
index 00000000..02d26e8e
--- /dev/null
+++ b/pandas_gbq/core/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) 2024 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
diff --git a/pandas_gbq/core/pandas.py b/pandas_gbq/core/pandas.py
new file mode 100644
index 00000000..37557adf
--- /dev/null
+++ b/pandas_gbq/core/pandas.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2019 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+import itertools
+
+import pandas
+
+
+def list_columns_and_indexes(dataframe, index=True):
+    """Return all index and column names with dtypes.
+
+    Returns:
+        Sequence[Tuple[str, dtype]]:
+            Returns a sorted list of indexes and column names with
+            corresponding dtypes. If an index is missing a name or has the
+            same name as a column, the index is omitted.
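+
+    Example (an illustrative sketch, not a doctest; names are arbitrary)::
+
+        df = pandas.DataFrame({"a": [1]}, index=pandas.Index(["x"], name="idx"))
+        list_columns_and_indexes(df, index=True)
+        # -> [('idx', dtype('O')), ('a', dtype('int64'))]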
+ """ + column_names = frozenset(dataframe.columns) + columns_and_indexes = [] + if index: + if isinstance(dataframe.index, pandas.MultiIndex): + for name in dataframe.index.names: + if name and name not in column_names: + values = dataframe.index.get_level_values(name) + columns_and_indexes.append((name, values.dtype)) + else: + if dataframe.index.name and dataframe.index.name not in column_names: + columns_and_indexes.append( + (dataframe.index.name, dataframe.index.dtype) + ) + + columns_and_indexes += zip(dataframe.columns, dataframe.dtypes) + return columns_and_indexes + + +def first_valid(series): + first_valid_index = series.first_valid_index() + if first_valid_index is not None: + return series.at[first_valid_index] + + +def first_array_valid(series): + """Return the first "meaningful" element from the array series. + + Here, "meaningful" means the first non-None element in one of the arrays that can + be used for type detextion. + """ + first_valid_index = series.first_valid_index() + if first_valid_index is None: + return None + + valid_array = series.at[first_valid_index] + valid_item = next((item for item in valid_array if not pandas.isna(item)), None) + + if valid_item is not None: + return valid_item + + # Valid item is None because all items in the "valid" array are invalid. Try + # to find a true valid array manually. + for array in itertools.islice(series, first_valid_index + 1, None): + try: + array_iter = iter(array) + except TypeError: + continue # Not an array, apparently, e.g. None, thus skip. + valid_item = next((item for item in array_iter if not pandas.isna(item)), None) + if valid_item is not None: + break + + return valid_item diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 19c42a6b..06b6bbf2 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -25,6 +25,7 @@ from pandas_gbq.features import FEATURES import pandas_gbq.query import pandas_gbq.schema +import pandas_gbq.schema.pandas_to_bigquery import pandas_gbq.timestamp try: @@ -1219,9 +1220,16 @@ def _generate_bq_schema(df, default_type="STRING"): be overridden: https://github.com/pydata/pandas-gbq/issues/218, this method can be removed after there is time to migrate away from this method.""" - from pandas_gbq import schema + fields = pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields( + df, + default_type=default_type, + ) + fields_json = [] + + for field in fields: + fields_json.append(field.to_api_repr()) - return schema.generate_bq_schema(df, default_type=default_type) + return {"fields": fields_json} class _Table(GbqConnector): diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py index 45e474b2..567899df 100644 --- a/pandas_gbq/load.py +++ b/pandas_gbq/load.py @@ -15,6 +15,8 @@ from pandas_gbq import exceptions import pandas_gbq.schema +import pandas_gbq.schema.bigquery +import pandas_gbq.schema.pandas_to_bigquery def encode_chunk(dataframe): @@ -214,11 +216,9 @@ def load_csv_from_file( This method is needed for writing with google-cloud-bigquery versions that don't implment load_table_from_dataframe with the CSV serialization format. 
""" - if schema is None: - schema = pandas_gbq.schema.generate_bq_schema(dataframe) - - schema = pandas_gbq.schema.remove_policy_tags(schema) - bq_schema = pandas_gbq.schema.to_google_cloud_bigquery(schema) + bq_schema = pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields( + dataframe, schema + ) def load_chunk(chunk, job_config): try: diff --git a/pandas_gbq/schema.py b/pandas_gbq/schema/__init__.py similarity index 85% rename from pandas_gbq/schema.py rename to pandas_gbq/schema/__init__.py index b60fdeda..350a1d2e 100644 --- a/pandas_gbq/schema.py +++ b/pandas_gbq/schema/__init__.py @@ -92,37 +92,6 @@ def schema_is_subset(schema_remote, schema_local): return all(field in fields_remote for field in fields_local) -def generate_bq_schema(dataframe, default_type="STRING"): - """Given a passed dataframe, generate the associated Google BigQuery schema. - - Arguments: - dataframe (pandas.DataFrame): D - default_type : string - The default big query type in case the type of the column - does not exist in the schema. - """ - - # If you update this mapping, also update the table at - # `docs/source/writing.rst`. - type_mapping = { - "i": "INTEGER", - "b": "BOOLEAN", - "f": "FLOAT", - "O": "STRING", - "S": "STRING", - "U": "STRING", - "M": "TIMESTAMP", - } - - fields = [] - for column_name, dtype in dataframe.dtypes.items(): - fields.append( - {"name": column_name, "type": type_mapping.get(dtype.kind, default_type)} - ) - - return {"fields": fields} - - def update_schema(schema_old, schema_new): """ Given an old BigQuery schema, update it with a new one. diff --git a/pandas_gbq/schema/bigquery.py b/pandas_gbq/schema/bigquery.py new file mode 100644 index 00000000..0de21978 --- /dev/null +++ b/pandas_gbq/schema/bigquery.py @@ -0,0 +1,44 @@ +# Copyright (c) 2019 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import collections + +import google.cloud.bigquery + + +def to_schema_fields(schema): + """Coerce `schema` to a list of schema field instances. + + Args: + schema(Sequence[Union[ \ + :class:`~google.cloud.bigquery.schema.SchemaField`, \ + Mapping[str, Any] \ + ]]): + Table schema to convert. If some items are passed as mappings, + their content must be compatible with + :meth:`~google.cloud.bigquery.schema.SchemaField.from_api_repr`. + + Returns: + Sequence[:class:`~google.cloud.bigquery.schema.SchemaField`] + + Raises: + Exception: If ``schema`` is not a sequence, or if any item in the + sequence is not a :class:`~google.cloud.bigquery.schema.SchemaField` + instance or a compatible mapping representation of the field. + """ + for field in schema: + if not isinstance( + field, (google.cloud.bigquery.SchemaField, collections.abc.Mapping) + ): + raise ValueError( + "Schema items must either be fields or compatible " + "mapping representations." + ) + + return [ + field + if isinstance(field, google.cloud.bigquery.SchemaField) + else google.cloud.bigquery.SchemaField.from_api_repr(field) + for field in schema + ] diff --git a/pandas_gbq/schema/pandas_to_bigquery.py b/pandas_gbq/schema/pandas_to_bigquery.py new file mode 100644 index 00000000..5a979a12 --- /dev/null +++ b/pandas_gbq/schema/pandas_to_bigquery.py @@ -0,0 +1,218 @@ +# Copyright (c) 2019 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. 
+
+import collections.abc
+import datetime
+from typing import Optional, Tuple
+import warnings
+
+import db_dtypes
+from google.cloud.bigquery import schema
+import pandas
+import pyarrow
+
+import pandas_gbq.core.pandas
+import pandas_gbq.schema.bigquery
+import pandas_gbq.schema.pyarrow_to_bigquery
+
+try:
+    # _BaseGeometry is used to detect shapely objects in `bq_to_arrow_array`
+    from shapely.geometry.base import BaseGeometry as _BaseGeometry  # type: ignore
+except ImportError:
+    # No shapely, use NoneType for _BaseGeometry as a placeholder.
+    _BaseGeometry = type(None)
+
+
+# If you update this mapping, also update the table at
+# `docs/source/writing.rst`.
+_PANDAS_DTYPE_TO_BQ = {
+    "bool": "BOOLEAN",
+    "datetime64[ns, UTC]": "TIMESTAMP",
+    "datetime64[ns]": "DATETIME",
+    "float32": "FLOAT",
+    "float64": "FLOAT",
+    "int8": "INTEGER",
+    "int16": "INTEGER",
+    "int32": "INTEGER",
+    "int64": "INTEGER",
+    "uint8": "INTEGER",
+    "uint16": "INTEGER",
+    "uint32": "INTEGER",
+    "geometry": "GEOGRAPHY",
+    db_dtypes.DateDtype.name: "DATE",
+    db_dtypes.TimeDtype.name: "TIME",
+    # TODO(tswast): Add support for JSON.
+}
+
+
+def dataframe_to_bigquery_fields(
+    dataframe,
+    override_bigquery_fields=None,
+    default_type="STRING",
+    index=False,
+) -> Tuple[schema.SchemaField]:
+    """Convert a pandas DataFrame schema to a BigQuery schema.
+
+    Args:
+        dataframe (pandas.DataFrame):
+            DataFrame for which the client determines the BigQuery schema.
+        override_bigquery_fields (Sequence[Union[ \
+            :class:`~google.cloud.bigquery.schema.SchemaField`, \
+            Mapping[str, Any] \
+        ]]):
+            A BigQuery schema. Use this argument to override the autodetected
+            type for some or all of the DataFrame columns.
+        default_type (str):
+            Type used for any column whose type cannot be determined.
+        index (bool):
+            If True, include the DataFrame's index (or MultiIndex levels) in
+            the returned fields.
+
+    Returns:
+        Tuple[google.cloud.bigquery.schema.SchemaField]:
+            The automatically determined schema. Columns whose type cannot
+            be determined fall back to ``default_type``, and a warning
+            listing them is emitted.
+    """
+    if override_bigquery_fields:
+        override_bigquery_fields = pandas_gbq.schema.bigquery.to_schema_fields(
+            override_bigquery_fields
+        )
+        override_fields_by_name = {
+            field.name: field for field in override_bigquery_fields
+        }
+        override_fields_unused = set(override_fields_by_name.keys())
+    else:
+        override_fields_by_name = {}
+        override_fields_unused = set()
+
+    bq_schema_out = []
+    unknown_type_fields = []
+
+    # TODO(tswast): Support index=True in to_gbq.
+    for column, dtype in pandas_gbq.core.pandas.list_columns_and_indexes(
+        dataframe, index=index
+    ):
+        # Use provided type from schema, if present.
+        bq_field = override_fields_by_name.get(column)
+        if bq_field:
+            bq_schema_out.append(bq_field)
+            override_fields_unused.discard(bq_field.name)
+            continue
+
+        # Try to automatically determine the type based on the pandas dtype.
+        bq_field = dtype_to_bigquery_field(column, dtype)
+        if bq_field:
+            bq_schema_out.append(bq_field)
+            continue
+
+        # Try to automatically determine the type based on a few rows of the data.
+        values = dataframe.reset_index()[column]
+        bq_field = values_to_bigquery_field(column, values)
+
+        if bq_field:
+            bq_schema_out.append(bq_field)
+            continue
+
+        # Try to automatically determine the type based on the arrow conversion.
+        try:
+            arrow_value = pyarrow.array(values)
+            bq_field = (
+                pandas_gbq.schema.pyarrow_to_bigquery.arrow_type_to_bigquery_field(
+                    column, arrow_value.type
+                )
+            )
+
+            if bq_field:
+                bq_schema_out.append(bq_field)
+                continue
+        except pyarrow.lib.ArrowInvalid:
+            # TODO(tswast): Better error message if conversion to arrow fails.
+            pass
+
+        # Unknown field type.
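+        # None of the detection stages succeeded, so fall back to the
+        # caller-provided ``default_type`` and remember the column so the
+        # warning below can list it.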
+        bq_field = schema.SchemaField(column, default_type)
+        bq_schema_out.append(bq_field)
+        unknown_type_fields.append(bq_field)
+
+    # Catch any schema mismatch. The developer explicitly asked to serialize a
+    # column, but it was not found.
+    if override_fields_unused:
+        raise ValueError(
+            "Provided BigQuery fields contain field(s) not present in DataFrame: {}".format(
+                override_fields_unused
+            )
+        )
+
+    # If schema detection was not successful for all columns, warn so that
+    # the caller knows which columns fell back to the default type.
+    if unknown_type_fields:
+        msg = "Could not determine the type of columns: {}".format(
+            ", ".join(field.name for field in unknown_type_fields)
+        )
+        warnings.warn(msg)
+
+    return tuple(bq_schema_out)
+
+
+def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
+    bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
+
+    if bq_type is not None:
+        return schema.SchemaField(name, bq_type)
+
+    if hasattr(pandas, "ArrowDtype") and isinstance(dtype, pandas.ArrowDtype):
+        return pandas_gbq.schema.pyarrow_to_bigquery.arrow_type_to_bigquery_field(
+            name, dtype.pyarrow_dtype
+        )
+
+    return None
+
+
+def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
+    if isinstance(value, str):
+        return schema.SchemaField(name, "STRING")
+
+    # For timezone-naive datetimes, the pyarrow conversion that we use later
+    # to learn the type adds a timezone to such datetimes, causing them to be
+    # recognized as TIMESTAMP type. We thus additionally check the actual data
+    # to see if we need to overrule that and choose DATETIME instead.
+    #
+    # See: https://github.com/googleapis/python-bigquery/issues/985
+    # and https://github.com/googleapis/python-bigquery/pull/1061
+    # and https://github.com/googleapis/python-bigquery-pandas/issues/450
+    if isinstance(value, datetime.datetime):
+        if value.tzinfo is not None:
+            return schema.SchemaField(name, "TIMESTAMP")
+        else:
+            return schema.SchemaField(name, "DATETIME")
+
+    if _BaseGeometry is not None and isinstance(value, _BaseGeometry):
+        return schema.SchemaField(name, "GEOGRAPHY")
+
+    return None
+
+
+def values_to_bigquery_field(name, values) -> Optional[schema.SchemaField]:
+    value = pandas_gbq.core.pandas.first_valid(values)
+
+    # All NULL, type not determinable.
+    if value is None:
+        return None
+
+    field = value_to_bigquery_field(name, value)
+    if field is not None:
+        return field
+
+    if isinstance(value, str):
+        return schema.SchemaField(name, "STRING")
+
+    # Check plain ARRAY values here. Let STRUCT get determined by pyarrow,
+    # which can examine more values to determine all keys.
+    if isinstance(value, collections.abc.Iterable) and not isinstance(
+        value, collections.abc.Mapping
+    ):
+        # It could be that this value contains all None or is empty, so get the
+        # first non-None value we can find.
+        valid_item = pandas_gbq.core.pandas.first_array_valid(values)
+        field = value_to_bigquery_field(name, valid_item)
+
+        if field is not None:
+            return schema.SchemaField(name, field.field_type, mode="REPEATED")
+
+    return None
diff --git a/pandas_gbq/schema/pyarrow_to_bigquery.py b/pandas_gbq/schema/pyarrow_to_bigquery.py
new file mode 100644
index 00000000..c63559eb
--- /dev/null
+++ b/pandas_gbq/schema/pyarrow_to_bigquery.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2023 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
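+#
+# Note: a pyarrow type ID identifies the type class, not its parameters:
+# every ``timestamp`` shares one ID regardless of unit or time zone, and
+# likewise for each of ``time32``, ``time64``, ``decimal128``, and
+# ``decimal256``. A single representative instance per class is therefore
+# enough to build the lookup table below.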
+ +from typing import Optional, cast + +from google.cloud.bigquery import schema +import pyarrow +import pyarrow.types + +_ARROW_SCALAR_IDS_TO_BQ = { + # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes + pyarrow.bool_().id: "BOOLEAN", + pyarrow.int8().id: "INTEGER", + pyarrow.int16().id: "INTEGER", + pyarrow.int32().id: "INTEGER", + pyarrow.int64().id: "INTEGER", + pyarrow.uint8().id: "INTEGER", + pyarrow.uint16().id: "INTEGER", + pyarrow.uint32().id: "INTEGER", + pyarrow.uint64().id: "INTEGER", + pyarrow.float16().id: "FLOAT", + pyarrow.float32().id: "FLOAT", + pyarrow.float64().id: "FLOAT", + pyarrow.time32("ms").id: "TIME", + pyarrow.time64("ns").id: "TIME", + pyarrow.timestamp("ns").id: "TIMESTAMP", + pyarrow.date32().id: "DATE", + pyarrow.date64().id: "DATETIME", # because millisecond resolution + pyarrow.binary().id: "BYTES", + pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() + pyarrow.large_string().id: "STRING", + # The exact decimal's scale and precision are not important, as only + # the type ID matters, and it's the same for all decimal256 instances. + pyarrow.decimal128(38, scale=9).id: "NUMERIC", + pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC", +} + + +def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]: + detected_type = _ARROW_SCALAR_IDS_TO_BQ.get(type_.id, None) + if detected_type is not None: + return schema.SchemaField(name, detected_type) + + if pyarrow.types.is_list(type_): + return arrow_list_type_to_bigquery(name, type_) + + if pyarrow.types.is_struct(type_): + inner_fields: list[pyarrow.Field] = [] + struct_type = cast(pyarrow.StructType, type_) + for field_index in range(struct_type.num_fields): + field = struct_type[field_index] + inner_fields.append(arrow_type_to_bigquery_field(field.name, field.type)) + + return schema.SchemaField(name, "RECORD", fields=inner_fields) + + return None + + +def arrow_list_type_to_bigquery(name, type_) -> Optional[schema.SchemaField]: + inner_field = arrow_type_to_bigquery_field(name, type_.value_type) + if inner_field is None: + return None + + return schema.SchemaField( + name, inner_field.field_type, mode="REPEATED", fields=inner_field.fields + ) diff --git a/setup.py b/setup.py index df793e59..a4127a55 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,8 @@ "bqstorage": [ "google-cloud-bigquery-storage >=2.16.2, <3.0.0dev", ], - "tqdm": "tqdm>=4.23.0", + "tqdm": ["tqdm>=4.23.0"], + "geopandas": ["geopandas>=0.9.0", "Shapely>=1.8.4"], } # Setup boilerplate below this line. 
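The new ``geopandas`` extra feeds the GEOGRAPHY detection added above. A
minimal sketch of what it enables (the project, dataset, and table names are
placeholders):

    import geopandas
    import pandas_gbq
    from shapely import wkt

    df = geopandas.GeoDataFrame(
        {"name": ["foo"], "geo": [wkt.loads("POINT(1 1)")]},
        geometry="geo",
    )
    # The "geometry" dtype maps to GEOGRAPHY in _PANDAS_DTYPE_TO_BQ, so no
    # explicit table_schema is needed.
    pandas_gbq.to_gbq(df, "my_dataset.my_table", project_id="my-project")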
diff --git a/testing/constraints-3.8.txt b/testing/constraints-3.8.txt index e551d17e..068480fd 100644 --- a/testing/constraints-3.8.txt +++ b/testing/constraints-3.8.txt @@ -7,6 +7,7 @@ # Then this file should have foo==1.14.0 # protobuf==3.19.5 db-dtypes==1.0.4 +geopandas==0.9.0 google-api-core==2.10.2 google-auth==2.13.0 google-auth-oauthlib==0.7.0 @@ -16,5 +17,6 @@ numpy==1.18.1 pandas==1.1.4 pyarrow==3.0.0 pydata-google-auth==1.5.0 +Shapely==1.8.4 tqdm==4.23.0 -packaging==22.0.0 \ No newline at end of file +packaging==22.0.0 diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index 2e7245d5..0bcd8780 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -10,6 +10,7 @@ import db_dtypes import pandas import pandas.testing +import pyarrow import pytest pytest.importorskip("google.cloud.bigquery", minversion="1.24.0") @@ -125,6 +126,37 @@ def test_series_round_trip( ) DATAFRAME_ROUND_TRIPS = [ + # Ensure that a BOOLEAN column can be written with bool, boolean, and + # object dtypes. See: + # https://github.com/googleapis/python-bigquery-pandas/issues/105 + pytest.param( + *DataFrameRoundTripTestCase( + input_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "bool_col": pandas.Series( + [True, False, True], + dtype="bool", + ), + "boolean_col": pandas.Series( + [None, True, False], + dtype="boolean", + ), + "object_col": pandas.Series( + [False, None, True], + dtype="object", + ), + } + ), + table_schema=[ + {"name": "bool_col", "type": "BOOLEAN"}, + {"name": "boolean_col", "type": "BOOLEAN"}, + {"name": "object_col", "type": "BOOLEAN"}, + ], + api_methods={"load_csv", "load_parquet"}, + ), + id="boolean", + ), # Ensure that a DATE column can be written with datetime64[ns] dtype # data. See: # https://github.com/googleapis/python-bigquery-pandas/issues/362 @@ -176,6 +208,96 @@ def test_series_round_trip( {"name": "date_col", "type": "DATE"}, ], ), + # Loading an INTEGER column should work for any integer dtype. See: + # https://github.com/googleapis/python-bigquery-pandas/issues/616 + pytest.param( + *DataFrameRoundTripTestCase( + input_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "object": pandas.Series( + [None, 1, -2], + dtype="object", + ), + "nullable_int64": pandas.Series( + [3, None, -4], + dtype="Int64", + ), + "int8": pandas.Series( + [5, -6, 7], + dtype="int8", + ), + "int16": pandas.Series( + [-8, 9, -10], + dtype="int16", + ), + "int32": pandas.Series( + [11, -12, 13], + dtype="int32", + ), + "int64": pandas.Series( + [-14, 15, -16], + dtype="int64", + ), + "uint8": pandas.Series( + [0, 1, 2], + dtype="uint8", + ), + "uint16": pandas.Series( + [3, 4, 5], + dtype="uint16", + ), + "uint32": pandas.Series( + [6, 7, 8], + dtype="uint32", + ), + } + ), + expected_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "object": pandas.Series( + [None, 1, -2], + dtype="Int64", + ), + "nullable_int64": pandas.Series( + [3, None, -4], + dtype="Int64", + ), + "int8": pandas.Series( + [5, -6, 7], + dtype="Int64", + ), + "int16": pandas.Series( + [-8, 9, -10], + dtype="Int64", + ), + "int32": pandas.Series( + [11, -12, 13], + dtype="Int64", + ), + "int64": pandas.Series( + [-14, 15, -16], + dtype="Int64", + ), + "uint8": pandas.Series( + [0, 1, 2], + dtype="Int64", + ), + "uint16": pandas.Series( + [3, 4, 5], + dtype="Int64", + ), + "uint32": pandas.Series( + [6, 7, 8], + dtype="Int64", + ), + } + ), + api_methods={"load_csv", "load_parquet"}, + ), + id="integer", + ), # Loading a NUMERIC column should work for floating point objects. 
See: # https://github.com/googleapis/python-bigquery-pandas/issues/421 DataFrameRoundTripTestCase( @@ -240,6 +362,115 @@ def test_series_round_trip( ), id="issue365-extreme-datetimes", ), + pytest.param( + # Load STRUCT and ARRAY using either object column or ArrowDtype. + # See: https://github.com/googleapis/python-bigquery-pandas/issues/452 + *DataFrameRoundTripTestCase( + input_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "object_struct": pandas.Series( + [{"test": "str1"}, {"test": "str2"}, {"test": "str3"}], + dtype="object", + ), + # Array of DATETIME requires inspection into list elements. + # See: + # https://github.com/googleapis/python-bigquery/pull/1061 + "object_array_datetime": pandas.Series( + [[], [datetime.datetime(1998, 9, 4, 12, 0, 0)], []], + dtype="object", + ), + "object_array_of_struct": pandas.Series( + [[], [{"test": "str4"}], []], dtype="object" + ), + "arrow_struct": pandas.Series( + [ + {"version": 1, "project": "pandas"}, + {"version": 2, "project": "pandas"}, + {"version": 1, "project": "numpy"}, + ], + dtype=pandas.ArrowDtype( + pyarrow.struct( + [ + ("version", pyarrow.int64()), + ("project", pyarrow.string()), + ] + ) + ), + ), + "arrow_array": pandas.Series( + [[1, 2, 3], None, [4, 5, 6]], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.int64()), + ), + ), + "arrow_array_of_struct": pandas.Series( + [ + [{"test": "str5"}], + None, + [{"test": "str6"}, {"test": "str7"}], + ], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), + ), + ), + }, + ), + expected_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "object_struct": pandas.Series( + [{"test": "str1"}, {"test": "str2"}, {"test": "str3"}], + dtype=pandas.ArrowDtype( + pyarrow.struct([("test", pyarrow.string())]), + ), + ), + # Array of DATETIME requires inspection into list elements. + # See: + # https://github.com/googleapis/python-bigquery/pull/1061 + "object_array_datetime": pandas.Series( + [[], [datetime.datetime(1998, 9, 4, 12, 0, 0)], []], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.timestamp("us"))), + ), + "object_array_of_struct": pandas.Series( + [[], [{"test": "str4"}], []], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), + ), + ), + "arrow_struct": pandas.Series( + [ + {"version": 1, "project": "pandas"}, + {"version": 2, "project": "pandas"}, + {"version": 1, "project": "numpy"}, + ], + dtype=pandas.ArrowDtype( + pyarrow.struct( + [ + ("version", pyarrow.int64()), + ("project", pyarrow.string()), + ] + ) + ), + ), + "arrow_array": pandas.Series( + [[1, 2, 3], [], [4, 5, 6]], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.int64()), + ), + ), + "arrow_array_of_struct": pandas.Series( + [[{"test": "str5"}], [], [{"test": "str6"}, {"test": "str7"}]], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), + ), + ), + }, + ), + api_methods={"load_parquet"}, + ), + id="struct", + ), ] @@ -264,13 +495,20 @@ def test_dataframe_round_trip_with_table_schema( method_under_test( input_df, table_id, table_schema=table_schema, api_method=api_method ) - round_trip = read_gbq( - table_id, - dtypes=dict(zip(expected_df.columns, expected_df.dtypes)), - # BigQuery Storage API is required to avoid out-of-bound due to extra - # day from rounding error which was fixed in google-cloud-bigquery - # 2.6.0. 
https://github.com/googleapis/python-bigquery/pull/402 - use_bqstorage_api=True, + round_trip = ( + read_gbq( + table_id, + dtypes=dict(zip(expected_df.columns, expected_df.dtypes)), + # BigQuery Storage API is required to avoid out-of-bound due to extra + # day from rounding error which was fixed in google-cloud-bigquery + # 2.6.0. https://github.com/googleapis/python-bigquery/pull/402 + use_bqstorage_api=True, + ) + .set_index("row_num") + .sort_index() + ) + + # TODO(tswast): Support writing index columns if to_gbq(index=True). + pandas.testing.assert_frame_equal( + expected_df.set_index("row_num").sort_index(), round_trip ) - round_trip.sort_values("row_num", inplace=True) - pandas.testing.assert_frame_equal(expected_df, round_trip) diff --git a/tests/unit/schema/__init__.py b/tests/unit/schema/__init__.py new file mode 100644 index 00000000..02d26e8e --- /dev/null +++ b/tests/unit/schema/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. diff --git a/tests/unit/schema/test_pandas_to_bigquery.py b/tests/unit/schema/test_pandas_to_bigquery.py new file mode 100644 index 00000000..f9b1ddf4 --- /dev/null +++ b/tests/unit/schema/test_pandas_to_bigquery.py @@ -0,0 +1,156 @@ +# Copyright (c) 2019 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import collections +import datetime +import operator + +import pandas +import pytest +from google.cloud.bigquery import schema + + +@pytest.fixture +def module_under_test(): + from pandas_gbq.schema import pandas_to_bigquery + + return pandas_to_bigquery + + +def test_dataframe_to_bigquery_fields_w_named_index(module_under_test): + df_data = collections.OrderedDict( + [ + ("str_column", ["hello", "world"]), + ("int_column", [42, 8]), + ("bool_column", [True, False]), + ] + ) + index = pandas.Index(["a", "b"], name="str_index") + dataframe = pandas.DataFrame(df_data, index=index) + + returned_schema = module_under_test.dataframe_to_bigquery_fields( + dataframe, [], index=True + ) + + expected_schema = ( + schema.SchemaField("str_index", "STRING", "NULLABLE"), + schema.SchemaField("str_column", "STRING", "NULLABLE"), + schema.SchemaField("int_column", "INTEGER", "NULLABLE"), + schema.SchemaField("bool_column", "BOOLEAN", "NULLABLE"), + ) + assert returned_schema == expected_schema + + +def test_dataframe_to_bigquery_fields_w_multiindex(module_under_test): + df_data = collections.OrderedDict( + [ + ("str_column", ["hello", "world"]), + ("int_column", [42, 8]), + ("bool_column", [True, False]), + ] + ) + index = pandas.MultiIndex.from_tuples( + [ + ("a", 0, datetime.datetime(1999, 12, 31, 23, 59, 59, 999999)), + ("a", 0, datetime.datetime(2000, 1, 1, 0, 0, 0)), + ], + names=["str_index", "int_index", "dt_index"], + ) + dataframe = pandas.DataFrame(df_data, index=index) + + returned_schema = module_under_test.dataframe_to_bigquery_fields( + dataframe, [], index=True + ) + + expected_schema = ( + schema.SchemaField("str_index", "STRING", "NULLABLE"), + schema.SchemaField("int_index", "INTEGER", "NULLABLE"), + schema.SchemaField("dt_index", "DATETIME", "NULLABLE"), + schema.SchemaField("str_column", "STRING", "NULLABLE"), + schema.SchemaField("int_column", "INTEGER", "NULLABLE"), + schema.SchemaField("bool_column", "BOOLEAN", "NULLABLE"), + ) + assert returned_schema == expected_schema + + +def 
test_dataframe_to_bigquery_fields_w_bq_schema(module_under_test): + df_data = collections.OrderedDict( + [ + ("str_column", ["hello", "world"]), + ("int_column", [42, 8]), + ("bool_column", [True, False]), + ] + ) + dataframe = pandas.DataFrame(df_data) + + dict_schema = [ + {"name": "str_column", "type": "STRING", "mode": "NULLABLE"}, + {"name": "bool_column", "type": "BOOL", "mode": "REQUIRED"}, + ] + + returned_schema = module_under_test.dataframe_to_bigquery_fields( + dataframe, dict_schema + ) + + expected_schema = ( + schema.SchemaField("str_column", "STRING", "NULLABLE"), + schema.SchemaField("int_column", "INTEGER", "NULLABLE"), + schema.SchemaField("bool_column", "BOOL", "REQUIRED"), + ) + assert returned_schema == expected_schema + + +def test_dataframe_to_bigquery_fields_fallback_needed_w_pyarrow(module_under_test): + dataframe = pandas.DataFrame( + data=[ + {"id": 10, "status": "FOO", "created_at": datetime.date(2019, 5, 10)}, + {"id": 20, "status": "BAR", "created_at": datetime.date(2018, 9, 12)}, + ] + ) + + detected_schema = module_under_test.dataframe_to_bigquery_fields( + dataframe, override_bigquery_fields=[] + ) + expected_schema = ( + schema.SchemaField("id", "INTEGER", mode="NULLABLE"), + schema.SchemaField("status", "STRING", mode="NULLABLE"), + schema.SchemaField("created_at", "DATE", mode="NULLABLE"), + ) + by_name = operator.attrgetter("name") + assert sorted(detected_schema, key=by_name) == sorted(expected_schema, key=by_name) + + +def test_dataframe_to_bigquery_fields_w_extra_fields(module_under_test): + with pytest.raises(ValueError) as exc_context: + module_under_test.dataframe_to_bigquery_fields( + pandas.DataFrame(), + override_bigquery_fields=(schema.SchemaField("not_in_df", "STRING"),), + ) + message = str(exc_context.value) + assert ( + "Provided BigQuery fields contain field(s) not present in DataFrame:" in message + ) + assert "not_in_df" in message + + +def test_dataframe_to_bigquery_fields_geography(module_under_test): + geopandas = pytest.importorskip("geopandas") + from shapely import wkt + + df = geopandas.GeoDataFrame( + pandas.DataFrame( + dict( + name=["foo", "bar"], + geo1=[None, None], + geo2=[None, wkt.loads("Point(1 1)")], + ) + ), + geometry="geo1", + ) + bq_schema = module_under_test.dataframe_to_bigquery_fields(df, []) + assert bq_schema == ( + schema.SchemaField("name", "STRING"), + schema.SchemaField("geo1", "GEOGRAPHY"), + schema.SchemaField("geo2", "GEOGRAPHY"), + ) diff --git a/tests/unit/schema/test_pyarrow_to_bigquery.py b/tests/unit/schema/test_pyarrow_to_bigquery.py new file mode 100644 index 00000000..9a20e342 --- /dev/null +++ b/tests/unit/schema/test_pyarrow_to_bigquery.py @@ -0,0 +1,25 @@ +# Copyright (c) 2024 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import pyarrow + +from pandas_gbq.schema import pyarrow_to_bigquery + + +def test_arrow_type_to_bigquery_field_unknown(): + # Default types should be picked at a higher layer. + assert ( + pyarrow_to_bigquery.arrow_type_to_bigquery_field("test_name", pyarrow.null()) + is None + ) + + +def test_arrow_type_to_bigquery_field_list_of_unknown(): + # Default types should be picked at a higher layer. 
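+    # pyarrow.list_(pyarrow.null()) carries no usable element type, so the
+    # helper returns None rather than guessing a REPEATED type.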
+ assert ( + pyarrow_to_bigquery.arrow_type_to_bigquery_field( + "test_name", pyarrow.list_(pyarrow.null()) + ) + is None + ) diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py index 45c73533..bb611781 100644 --- a/tests/unit/test_load.py +++ b/tests/unit/test_load.py @@ -165,11 +165,8 @@ def test_load_csv_from_file_generates_schema(mock_bigquery_client): assert sent_schema[2].field_type == "FLOAT" assert sent_schema[3].name == "string_col" assert sent_schema[3].field_type == "STRING" - # TODO: Disambiguate TIMESTAMP from DATETIME based on if column is - # localized or at least use field type from table metadata. See: - # https://github.com/googleapis/python-bigquery-pandas/issues/450 assert sent_schema[4].name == "datetime_col" - assert sent_schema[4].field_type == "TIMESTAMP" + assert sent_schema[4].field_type == "DATETIME" assert sent_schema[5].name == "timestamp_col" assert sent_schema[5].field_type == "TIMESTAMP" diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 7fdc616c..48e8862a 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -7,14 +7,12 @@ import google.cloud.bigquery import pandas +import pyarrow import pytest - -@pytest.fixture -def module_under_test(): - import pandas_gbq.schema - - return pandas_gbq.schema +import pandas_gbq +import pandas_gbq.gbq +import pandas_gbq.schema @pytest.mark.parametrize( @@ -45,17 +43,15 @@ def module_under_test(): ), ], ) -def test_schema_is_subset_passes_if_subset( - module_under_test, original_fields, dataframe_fields -): +def test_schema_is_subset_passes_if_subset(original_fields, dataframe_fields): # Issue #24 schema_is_subset indicates whether the schema of the # dataframe is a subset of the schema of the bigquery table table_schema = {"fields": original_fields} tested_schema = {"fields": dataframe_fields} - assert module_under_test.schema_is_subset(table_schema, tested_schema) + assert pandas_gbq.schema.schema_is_subset(table_schema, tested_schema) -def test_schema_is_subset_fails_if_not_subset(module_under_test): +def test_schema_is_subset_fails_if_not_subset(): table_schema = { "fields": [ {"name": "A", "type": "FLOAT"}, @@ -66,12 +62,17 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test): tested_schema = { "fields": [{"name": "A", "type": "FLOAT"}, {"name": "C", "type": "FLOAT"}] } - assert not module_under_test.schema_is_subset(table_schema, tested_schema) + assert not pandas_gbq.schema.schema_is_subset(table_schema, tested_schema) @pytest.mark.parametrize( "dataframe,expected_schema", [ + pytest.param( + pandas.DataFrame(data={"col1": [object()]}), + {"fields": [{"name": "col1", "type": "STRING"}]}, + id="default-type-fails-pyarrow-conversion", + ), ( pandas.DataFrame(data={"col1": [1, 2, 3]}), {"fields": [{"name": "col1", "type": "INTEGER"}]}, @@ -88,13 +89,39 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test): pandas.DataFrame(data={"col1": ["hello", "world"]}), {"fields": [{"name": "col1", "type": "STRING"}]}, ), - ( - pandas.DataFrame(data={"col1": [datetime.datetime.now()]}), - {"fields": [{"name": "col1", "type": "TIMESTAMP"}]}, + pytest.param( + # No time zone -> DATETIME, + # Time zone -> TIMESTAMP + # See: https://github.com/googleapis/python-bigquery-pandas/issues/450 + pandas.DataFrame( + data={ + "object1": pandas.Series([datetime.datetime.now()], dtype="object"), + "object2": pandas.Series( + [datetime.datetime.now(datetime.timezone.utc)], dtype="object" + ), + "datetime1": pandas.Series( + [datetime.datetime.now()], 
dtype="datetime64[ns]" + ), + "datetime2": pandas.Series( + [datetime.datetime.now(datetime.timezone.utc)], + dtype="datetime64[ns, UTC]", + ), + } + ), + { + "fields": [ + {"name": "object1", "type": "DATETIME"}, + {"name": "object2", "type": "TIMESTAMP"}, + {"name": "datetime1", "type": "DATETIME"}, + {"name": "datetime2", "type": "TIMESTAMP"}, + ] + }, + id="issue450-datetime", ), ( pandas.DataFrame( data={ + "col0": [datetime.datetime.now(datetime.timezone.utc)], "col1": [datetime.datetime.now()], "col2": ["hello"], "col3": [3.14], @@ -104,7 +131,8 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test): ), { "fields": [ - {"name": "col1", "type": "TIMESTAMP"}, + {"name": "col0", "type": "TIMESTAMP"}, + {"name": "col1", "type": "DATETIME"}, {"name": "col2", "type": "STRING"}, {"name": "col3", "type": "FLOAT"}, {"name": "col4", "type": "BOOLEAN"}, @@ -112,10 +140,83 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test): ] }, ), + pytest.param( + # uint8, which is the result from get_dummies, should be INTEGER. + # https://github.com/googleapis/python-bigquery-pandas/issues/616 + pandas.DataFrame({"col": [0, 1]}, dtype="uint8"), + {"fields": [{"name": "col", "type": "INTEGER"}]}, + id="issue616-uint8", + ), + pytest.param( + # object column containing dictionaries should load to STRUCT. + # https://github.com/googleapis/python-bigquery-pandas/issues/452 + pandas.DataFrame( + { + "my_struct": pandas.Series( + [{"test": "str1"}, {"test": "str2"}, {"test": "str3"}], + dtype="object", + ), + } + ), + { + "fields": [ + { + "name": "my_struct", + "type": "RECORD", + "fields": [ + {"name": "test", "type": "STRING", "mode": "NULLABLE"} + ], + } + ] + }, + id="issue452-struct", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series([[], ["abc"], []], dtype="object"), + "list": pandas.Series( + [[], [1, 2, 3], []], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.int64())) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + "list_of_struct": pandas.Series( + [[], [{"test": "abc"}], []], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])) + ) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + } + ), + { + "fields": [ + {"name": "object", "type": "STRING", "mode": "REPEATED"}, + {"name": "list", "type": "INTEGER", "mode": "REPEATED"}, + { + "name": "list_of_struct", + "type": "RECORD", + "mode": "REPEATED", + "fields": [ + {"name": "test", "type": "STRING", "mode": "NULLABLE"}, + ], + }, + ], + }, + id="array", + ), ], ) -def test_generate_bq_schema(module_under_test, dataframe, expected_schema): - schema = module_under_test.generate_bq_schema(dataframe) +def test_generate_bq_schema(dataframe, expected_schema): + schema = pandas_gbq.gbq._generate_bq_schema(dataframe) + + # NULLABLE is the default mode. 
+ for field in expected_schema["fields"]: + if "mode" not in field: + field["mode"] = "NULLABLE" + assert schema == expected_schema @@ -156,8 +257,8 @@ def test_generate_bq_schema(module_under_test, dataframe, expected_schema): ), ], ) -def test_update_schema(module_under_test, schema_old, schema_new, expected_output): - output = module_under_test.update_schema(schema_old, schema_new) +def test_update_schema(schema_old, schema_new, expected_output): + output = pandas_gbq.schema.update_schema(schema_old, schema_new) assert output == expected_output From a61265822d7c9e6f260d4cd17b773d39af59314d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 20 Sep 2024 16:43:27 -0500 Subject: [PATCH 2/3] fix tests on older pandas --- tests/system/test_to_gbq.py | 36 +++++++++++++++----- tests/unit/schema/test_pandas_to_bigquery.py | 2 +- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index 0bcd8780..6352fbd7 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -396,13 +396,17 @@ def test_series_round_trip( ("project", pyarrow.string()), ] ) - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), "arrow_array": pandas.Series( [[1, 2, 3], None, [4, 5, 6]], dtype=pandas.ArrowDtype( pyarrow.list_(pyarrow.int64()), - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), "arrow_array_of_struct": pandas.Series( [ @@ -412,7 +416,9 @@ def test_series_round_trip( ], dtype=pandas.ArrowDtype( pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), }, ), @@ -423,20 +429,26 @@ def test_series_round_trip( [{"test": "str1"}, {"test": "str2"}, {"test": "str3"}], dtype=pandas.ArrowDtype( pyarrow.struct([("test", pyarrow.string())]), - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), # Array of DATETIME requires inspection into list elements. 
# See: # https://github.com/googleapis/python-bigquery/pull/1061 "object_array_datetime": pandas.Series( [[], [datetime.datetime(1998, 9, 4, 12, 0, 0)], []], - dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.timestamp("us"))), + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.timestamp("us"))) + if hasattr(pandas, "ArrowDtype") + else "object", ), "object_array_of_struct": pandas.Series( [[], [{"test": "str4"}], []], dtype=pandas.ArrowDtype( pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), "arrow_struct": pandas.Series( [ @@ -451,19 +463,25 @@ def test_series_round_trip( ("project", pyarrow.string()), ] ) - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), "arrow_array": pandas.Series( [[1, 2, 3], [], [4, 5, 6]], dtype=pandas.ArrowDtype( pyarrow.list_(pyarrow.int64()), - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), "arrow_array_of_struct": pandas.Series( [[{"test": "str5"}], [], [{"test": "str6"}, {"test": "str7"}]], dtype=pandas.ArrowDtype( pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), }, ), diff --git a/tests/unit/schema/test_pandas_to_bigquery.py b/tests/unit/schema/test_pandas_to_bigquery.py index f9b1ddf4..924ce1ee 100644 --- a/tests/unit/schema/test_pandas_to_bigquery.py +++ b/tests/unit/schema/test_pandas_to_bigquery.py @@ -6,9 +6,9 @@ import datetime import operator +from google.cloud.bigquery import schema import pandas import pytest -from google.cloud.bigquery import schema @pytest.fixture From 39cb3017aea3e1038f158834785ebb098b020f07 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 23 Sep 2024 10:12:05 -0500 Subject: [PATCH 3/3] deps: min pyarrow is now 4.0.0 to support compliant nested types --- setup.py | 2 +- testing/constraints-3.8.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a4127a55..10d97733 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ "db-dtypes >=1.0.4,<2.0.0", "numpy >=1.18.1", "pandas >=1.1.4", - "pyarrow >=3.0.0", + "pyarrow >=4.0.0", "pydata-google-auth >=1.5.0", # Note: google-api-core and google-auth are also included via transitive # dependency on google-cloud-bigquery, but this library also uses them diff --git a/testing/constraints-3.8.txt b/testing/constraints-3.8.txt index 068480fd..8d6ef4f4 100644 --- a/testing/constraints-3.8.txt +++ b/testing/constraints-3.8.txt @@ -15,7 +15,7 @@ google-cloud-bigquery==3.4.2 google-cloud-bigquery-storage==2.16.2 numpy==1.18.1 pandas==1.1.4 -pyarrow==3.0.0 +pyarrow==4.0.0 pydata-google-auth==1.5.0 Shapely==1.8.4 tqdm==4.23.0
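Taken together, the behavior changes in this series can be checked without a
BigQuery round trip by calling the new detection helper directly. A minimal
sketch (column names are arbitrary):

    import datetime

    import pandas

    from pandas_gbq.schema import pandas_to_bigquery

    df = pandas.DataFrame(
        {
            "uint8_col": pandas.Series([0, 1], dtype="uint8"),
            "naive_dt": pandas.Series(
                [datetime.datetime(2024, 1, 1)] * 2, dtype="datetime64[ns]"
            ),
            "bool_obj": pandas.Series([True, False], dtype="object"),
            "struct_col": pandas.Series([{"test": "a"}, {"test": "b"}]),
        }
    )
    fields = pandas_to_bigquery.dataframe_to_bigquery_fields(df, [])
    # Per the unit tests above: uint8_col -> INTEGER, naive_dt -> DATETIME,
    # bool_obj -> BOOLEAN, struct_col -> RECORD. Before this series these
    # columns loaded as STRING (and naive_dt as TIMESTAMP).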