From d12b1bd171a9df2812bd26107425718b93806571 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 20 Sep 2024 14:15:45 -0500
Subject: [PATCH 1/3] fix!: `to_gbq` loads `uint8` columns to BigQuery INT64
 instead of STRING

fix!: `to_gbq` loads naive (no timezone) columns to BigQuery DATETIME
instead of TIMESTAMP
fix!: `to_gbq` loads object column containing bool values to BOOLEAN
instead of STRING
fix!: `to_gbq` loads object column containing dictionary values to STRUCT
instead of STRING
---
 noxfile.py                                    |   1 +
 owlbot.py                                     |   2 +-
 pandas_gbq/core/__init__.py                   |   3 +
 pandas_gbq/core/pandas.py                     |  70 +++++
 pandas_gbq/gbq.py                             |  12 +-
 pandas_gbq/load.py                            |  10 +-
 pandas_gbq/{schema.py => schema/__init__.py}  |  31 ---
 pandas_gbq/schema/bigquery.py                 |  44 +++
 pandas_gbq/schema/pandas_to_bigquery.py       | 218 +++++++++++++++
 pandas_gbq/schema/pyarrow_to_bigquery.py      |  67 +++++
 setup.py                                      |   3 +-
 testing/constraints-3.8.txt                   |   4 +-
 tests/system/test_to_gbq.py                   | 256 +++++++++++++++++-
 tests/unit/schema/__init__.py                 |   3 +
 tests/unit/schema/test_pandas_to_bigquery.py  | 156 +++++++++++
 tests/unit/schema/test_pyarrow_to_bigquery.py |  25 ++
 tests/unit/test_load.py                       |   5 +-
 tests/unit/test_schema.py                     | 141 ++++++++--
 18 files changed, 977 insertions(+), 74 deletions(-)
 create mode 100644 pandas_gbq/core/__init__.py
 create mode 100644 pandas_gbq/core/pandas.py
 rename pandas_gbq/{schema.py => schema/__init__.py} (85%)
 create mode 100644 pandas_gbq/schema/bigquery.py
 create mode 100644 pandas_gbq/schema/pandas_to_bigquery.py
 create mode 100644 pandas_gbq/schema/pyarrow_to_bigquery.py
 create mode 100644 tests/unit/schema/__init__.py
 create mode 100644 tests/unit/schema/test_pandas_to_bigquery.py
 create mode 100644 tests/unit/schema/test_pyarrow_to_bigquery.py

diff --git a/noxfile.py b/noxfile.py
index d316dac8..02cd052d 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -51,6 +51,7 @@
 UNIT_TEST_EXTRAS = [
     "bqstorage",
     "tqdm",
+    "geopandas",
 ]
 UNIT_TEST_EXTRAS_BY_PYTHON = {
     "3.9": [],
diff --git a/owlbot.py b/owlbot.py
index 916a7074..190298a6 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -32,7 +32,7 @@
     # Use a middle version of Python to test when no extras are installed.
     "3.9": []
 }
-extras = ["tqdm"]
+extras = ["tqdm", "geopandas"]
 templated_files = common.py_library(
     unit_test_python_versions=["3.8", "3.9", "3.10", "3.11", "3.12"],
     system_test_python_versions=["3.8", "3.9", "3.10", "3.11", "3.12"],
diff --git a/pandas_gbq/core/__init__.py b/pandas_gbq/core/__init__.py
new file mode 100644
index 00000000..02d26e8e
--- /dev/null
+++ b/pandas_gbq/core/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) 2024 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
diff --git a/pandas_gbq/core/pandas.py b/pandas_gbq/core/pandas.py
new file mode 100644
index 00000000..37557adf
--- /dev/null
+++ b/pandas_gbq/core/pandas.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2019 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+import itertools
+
+import pandas
+
+
+def list_columns_and_indexes(dataframe, index=True):
+    """Return all index and column names with dtypes.
+
+    Returns:
+        Sequence[Tuple[str, dtype]]:
+            Returns a sorted list of indexes and column names with
+            corresponding dtypes. If an index is missing a name or has the
+            same name as a column, the index is omitted.
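+
+    Example (an illustrative sketch, not a doctest; names are arbitrary)::
+
+        df = pandas.DataFrame({"a": [1]}, index=pandas.Index(["x"], name="idx"))
+        list_columns_and_indexes(df, index=True)
+        # -> [('idx', dtype('O')), ('a', dtype('int64'))]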
+ """ + column_names = frozenset(dataframe.columns) + columns_and_indexes = [] + if index: + if isinstance(dataframe.index, pandas.MultiIndex): + for name in dataframe.index.names: + if name and name not in column_names: + values = dataframe.index.get_level_values(name) + columns_and_indexes.append((name, values.dtype)) + else: + if dataframe.index.name and dataframe.index.name not in column_names: + columns_and_indexes.append( + (dataframe.index.name, dataframe.index.dtype) + ) + + columns_and_indexes += zip(dataframe.columns, dataframe.dtypes) + return columns_and_indexes + + +def first_valid(series): + first_valid_index = series.first_valid_index() + if first_valid_index is not None: + return series.at[first_valid_index] + + +def first_array_valid(series): + """Return the first "meaningful" element from the array series. + + Here, "meaningful" means the first non-None element in one of the arrays that can + be used for type detextion. + """ + first_valid_index = series.first_valid_index() + if first_valid_index is None: + return None + + valid_array = series.at[first_valid_index] + valid_item = next((item for item in valid_array if not pandas.isna(item)), None) + + if valid_item is not None: + return valid_item + + # Valid item is None because all items in the "valid" array are invalid. Try + # to find a true valid array manually. + for array in itertools.islice(series, first_valid_index + 1, None): + try: + array_iter = iter(array) + except TypeError: + continue # Not an array, apparently, e.g. None, thus skip. + valid_item = next((item for item in array_iter if not pandas.isna(item)), None) + if valid_item is not None: + break + + return valid_item diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 19c42a6b..06b6bbf2 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -25,6 +25,7 @@ from pandas_gbq.features import FEATURES import pandas_gbq.query import pandas_gbq.schema +import pandas_gbq.schema.pandas_to_bigquery import pandas_gbq.timestamp try: @@ -1219,9 +1220,16 @@ def _generate_bq_schema(df, default_type="STRING"): be overridden: https://github.com/pydata/pandas-gbq/issues/218, this method can be removed after there is time to migrate away from this method.""" - from pandas_gbq import schema + fields = pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields( + df, + default_type=default_type, + ) + fields_json = [] + + for field in fields: + fields_json.append(field.to_api_repr()) - return schema.generate_bq_schema(df, default_type=default_type) + return {"fields": fields_json} class _Table(GbqConnector): diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py index 45e474b2..567899df 100644 --- a/pandas_gbq/load.py +++ b/pandas_gbq/load.py @@ -15,6 +15,8 @@ from pandas_gbq import exceptions import pandas_gbq.schema +import pandas_gbq.schema.bigquery +import pandas_gbq.schema.pandas_to_bigquery def encode_chunk(dataframe): @@ -214,11 +216,9 @@ def load_csv_from_file( This method is needed for writing with google-cloud-bigquery versions that don't implment load_table_from_dataframe with the CSV serialization format. 
""" - if schema is None: - schema = pandas_gbq.schema.generate_bq_schema(dataframe) - - schema = pandas_gbq.schema.remove_policy_tags(schema) - bq_schema = pandas_gbq.schema.to_google_cloud_bigquery(schema) + bq_schema = pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields( + dataframe, schema + ) def load_chunk(chunk, job_config): try: diff --git a/pandas_gbq/schema.py b/pandas_gbq/schema/__init__.py similarity index 85% rename from pandas_gbq/schema.py rename to pandas_gbq/schema/__init__.py index b60fdeda..350a1d2e 100644 --- a/pandas_gbq/schema.py +++ b/pandas_gbq/schema/__init__.py @@ -92,37 +92,6 @@ def schema_is_subset(schema_remote, schema_local): return all(field in fields_remote for field in fields_local) -def generate_bq_schema(dataframe, default_type="STRING"): - """Given a passed dataframe, generate the associated Google BigQuery schema. - - Arguments: - dataframe (pandas.DataFrame): D - default_type : string - The default big query type in case the type of the column - does not exist in the schema. - """ - - # If you update this mapping, also update the table at - # `docs/source/writing.rst`. - type_mapping = { - "i": "INTEGER", - "b": "BOOLEAN", - "f": "FLOAT", - "O": "STRING", - "S": "STRING", - "U": "STRING", - "M": "TIMESTAMP", - } - - fields = [] - for column_name, dtype in dataframe.dtypes.items(): - fields.append( - {"name": column_name, "type": type_mapping.get(dtype.kind, default_type)} - ) - - return {"fields": fields} - - def update_schema(schema_old, schema_new): """ Given an old BigQuery schema, update it with a new one. diff --git a/pandas_gbq/schema/bigquery.py b/pandas_gbq/schema/bigquery.py new file mode 100644 index 00000000..0de21978 --- /dev/null +++ b/pandas_gbq/schema/bigquery.py @@ -0,0 +1,44 @@ +# Copyright (c) 2019 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import collections + +import google.cloud.bigquery + + +def to_schema_fields(schema): + """Coerce `schema` to a list of schema field instances. + + Args: + schema(Sequence[Union[ \ + :class:`~google.cloud.bigquery.schema.SchemaField`, \ + Mapping[str, Any] \ + ]]): + Table schema to convert. If some items are passed as mappings, + their content must be compatible with + :meth:`~google.cloud.bigquery.schema.SchemaField.from_api_repr`. + + Returns: + Sequence[:class:`~google.cloud.bigquery.schema.SchemaField`] + + Raises: + Exception: If ``schema`` is not a sequence, or if any item in the + sequence is not a :class:`~google.cloud.bigquery.schema.SchemaField` + instance or a compatible mapping representation of the field. + """ + for field in schema: + if not isinstance( + field, (google.cloud.bigquery.SchemaField, collections.abc.Mapping) + ): + raise ValueError( + "Schema items must either be fields or compatible " + "mapping representations." + ) + + return [ + field + if isinstance(field, google.cloud.bigquery.SchemaField) + else google.cloud.bigquery.SchemaField.from_api_repr(field) + for field in schema + ] diff --git a/pandas_gbq/schema/pandas_to_bigquery.py b/pandas_gbq/schema/pandas_to_bigquery.py new file mode 100644 index 00000000..5a979a12 --- /dev/null +++ b/pandas_gbq/schema/pandas_to_bigquery.py @@ -0,0 +1,218 @@ +# Copyright (c) 2019 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. 
+
+import collections.abc
+import datetime
+from typing import Optional, Tuple
+import warnings
+
+import db_dtypes
+from google.cloud.bigquery import schema
+import pandas
+import pyarrow
+
+import pandas_gbq.core.pandas
+import pandas_gbq.schema.bigquery
+import pandas_gbq.schema.pyarrow_to_bigquery
+
+try:
+    # _BaseGeometry is used to detect shapely objects in `bq_to_arrow_array`
+    from shapely.geometry.base import BaseGeometry as _BaseGeometry  # type: ignore
+except ImportError:
+    # No shapely, use NoneType for _BaseGeometry as a placeholder.
+    _BaseGeometry = type(None)
+
+
+# If you update this mapping, also update the table at
+# `docs/source/writing.rst`.
+_PANDAS_DTYPE_TO_BQ = {
+    "bool": "BOOLEAN",
+    "datetime64[ns, UTC]": "TIMESTAMP",
+    "datetime64[ns]": "DATETIME",
+    "float32": "FLOAT",
+    "float64": "FLOAT",
+    "int8": "INTEGER",
+    "int16": "INTEGER",
+    "int32": "INTEGER",
+    "int64": "INTEGER",
+    "uint8": "INTEGER",
+    "uint16": "INTEGER",
+    "uint32": "INTEGER",
+    "geometry": "GEOGRAPHY",
+    db_dtypes.DateDtype.name: "DATE",
+    db_dtypes.TimeDtype.name: "TIME",
+    # TODO(tswast): Add support for JSON.
+}
+
+
+def dataframe_to_bigquery_fields(
+    dataframe,
+    override_bigquery_fields=None,
+    default_type="STRING",
+    index=False,
+) -> Tuple[schema.SchemaField]:
+    """Convert a pandas DataFrame schema to a BigQuery schema.
+
+    Args:
+        dataframe (pandas.DataFrame):
+            DataFrame for which the client determines the BigQuery schema.
+        override_bigquery_fields (Sequence[Union[ \
+            :class:`~google.cloud.bigquery.schema.SchemaField`, \
+            Mapping[str, Any] \
+        ]]):
+            A BigQuery schema. Use this argument to override the autodetected
+            type for some or all of the DataFrame columns.
+        default_type (str):
+            Type used for any column whose type cannot be determined.
+        index (bool):
+            If True, include the DataFrame's index (or MultiIndex levels) in
+            the returned fields.
+
+    Returns:
+        Tuple[google.cloud.bigquery.schema.SchemaField]:
+            The automatically determined schema. Columns whose type cannot
+            be determined fall back to ``default_type``, and a warning
+            listing them is emitted.
+    """
+    if override_bigquery_fields:
+        override_bigquery_fields = pandas_gbq.schema.bigquery.to_schema_fields(
+            override_bigquery_fields
+        )
+        override_fields_by_name = {
+            field.name: field for field in override_bigquery_fields
+        }
+        override_fields_unused = set(override_fields_by_name.keys())
+    else:
+        override_fields_by_name = {}
+        override_fields_unused = set()
+
+    bq_schema_out = []
+    unknown_type_fields = []
+
+    # TODO(tswast): Support index=True in to_gbq.
+    for column, dtype in pandas_gbq.core.pandas.list_columns_and_indexes(
+        dataframe, index=index
+    ):
+        # Use provided type from schema, if present.
+        bq_field = override_fields_by_name.get(column)
+        if bq_field:
+            bq_schema_out.append(bq_field)
+            override_fields_unused.discard(bq_field.name)
+            continue
+
+        # Try to automatically determine the type based on the pandas dtype.
+        bq_field = dtype_to_bigquery_field(column, dtype)
+        if bq_field:
+            bq_schema_out.append(bq_field)
+            continue
+
+        # Try to automatically determine the type based on a few rows of the data.
+        values = dataframe.reset_index()[column]
+        bq_field = values_to_bigquery_field(column, values)
+
+        if bq_field:
+            bq_schema_out.append(bq_field)
+            continue
+
+        # Try to automatically determine the type based on the arrow conversion.
+        try:
+            arrow_value = pyarrow.array(values)
+            bq_field = (
+                pandas_gbq.schema.pyarrow_to_bigquery.arrow_type_to_bigquery_field(
+                    column, arrow_value.type
+                )
+            )
+
+            if bq_field:
+                bq_schema_out.append(bq_field)
+                continue
+        except pyarrow.lib.ArrowInvalid:
+            # TODO(tswast): Better error message if conversion to arrow fails.
+            pass
+
+        # Unknown field type.
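+        # None of the detection stages succeeded, so fall back to the
+        # caller-provided ``default_type`` and remember the column so the
+        # warning below can list it.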
+        bq_field = schema.SchemaField(column, default_type)
+        bq_schema_out.append(bq_field)
+        unknown_type_fields.append(bq_field)
+
+    # Catch any schema mismatch. The developer explicitly asked to serialize a
+    # column, but it was not found.
+    if override_fields_unused:
+        raise ValueError(
+            "Provided BigQuery fields contain field(s) not present in DataFrame: {}".format(
+                override_fields_unused
+            )
+        )
+
+    # If schema detection was not successful for all columns, warn so that
+    # the caller knows which columns fell back to the default type.
+    if unknown_type_fields:
+        msg = "Could not determine the type of columns: {}".format(
+            ", ".join(field.name for field in unknown_type_fields)
+        )
+        warnings.warn(msg)
+
+    return tuple(bq_schema_out)
+
+
+def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
+    bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
+
+    if bq_type is not None:
+        return schema.SchemaField(name, bq_type)
+
+    if hasattr(pandas, "ArrowDtype") and isinstance(dtype, pandas.ArrowDtype):
+        return pandas_gbq.schema.pyarrow_to_bigquery.arrow_type_to_bigquery_field(
+            name, dtype.pyarrow_dtype
+        )
+
+    return None
+
+
+def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
+    if isinstance(value, str):
+        return schema.SchemaField(name, "STRING")
+
+    # For timezone-naive datetimes, the pyarrow conversion that we use later
+    # to learn the type adds a timezone to such datetimes, causing them to be
+    # recognized as TIMESTAMP type. We thus additionally check the actual data
+    # to see if we need to overrule that and choose DATETIME instead.
+    #
+    # See: https://github.com/googleapis/python-bigquery/issues/985
+    # and https://github.com/googleapis/python-bigquery/pull/1061
+    # and https://github.com/googleapis/python-bigquery-pandas/issues/450
+    if isinstance(value, datetime.datetime):
+        if value.tzinfo is not None:
+            return schema.SchemaField(name, "TIMESTAMP")
+        else:
+            return schema.SchemaField(name, "DATETIME")
+
+    if _BaseGeometry is not None and isinstance(value, _BaseGeometry):
+        return schema.SchemaField(name, "GEOGRAPHY")
+
+    return None
+
+
+def values_to_bigquery_field(name, values) -> Optional[schema.SchemaField]:
+    value = pandas_gbq.core.pandas.first_valid(values)
+
+    # All NULL, type not determinable.
+    if value is None:
+        return None
+
+    field = value_to_bigquery_field(name, value)
+    if field is not None:
+        return field
+
+    if isinstance(value, str):
+        return schema.SchemaField(name, "STRING")
+
+    # Check plain ARRAY values here. Let STRUCT get determined by pyarrow,
+    # which can examine more values to determine all keys.
+    if isinstance(value, collections.abc.Iterable) and not isinstance(
+        value, collections.abc.Mapping
+    ):
+        # It could be that this value contains all None or is empty, so get the
+        # first non-None value we can find.
+        valid_item = pandas_gbq.core.pandas.first_array_valid(values)
+        field = value_to_bigquery_field(name, valid_item)
+
+        if field is not None:
+            return schema.SchemaField(name, field.field_type, mode="REPEATED")
+
+    return None
diff --git a/pandas_gbq/schema/pyarrow_to_bigquery.py b/pandas_gbq/schema/pyarrow_to_bigquery.py
new file mode 100644
index 00000000..c63559eb
--- /dev/null
+++ b/pandas_gbq/schema/pyarrow_to_bigquery.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2023 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
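+#
+# Note: a pyarrow type ID identifies the type class, not its parameters:
+# every ``timestamp`` shares one ID regardless of unit or time zone, and
+# likewise for each of ``time32``, ``time64``, ``decimal128``, and
+# ``decimal256``. A single representative instance per class is therefore
+# enough to build the lookup table below.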
+ +from typing import Optional, cast + +from google.cloud.bigquery import schema +import pyarrow +import pyarrow.types + +_ARROW_SCALAR_IDS_TO_BQ = { + # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes + pyarrow.bool_().id: "BOOLEAN", + pyarrow.int8().id: "INTEGER", + pyarrow.int16().id: "INTEGER", + pyarrow.int32().id: "INTEGER", + pyarrow.int64().id: "INTEGER", + pyarrow.uint8().id: "INTEGER", + pyarrow.uint16().id: "INTEGER", + pyarrow.uint32().id: "INTEGER", + pyarrow.uint64().id: "INTEGER", + pyarrow.float16().id: "FLOAT", + pyarrow.float32().id: "FLOAT", + pyarrow.float64().id: "FLOAT", + pyarrow.time32("ms").id: "TIME", + pyarrow.time64("ns").id: "TIME", + pyarrow.timestamp("ns").id: "TIMESTAMP", + pyarrow.date32().id: "DATE", + pyarrow.date64().id: "DATETIME", # because millisecond resolution + pyarrow.binary().id: "BYTES", + pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() + pyarrow.large_string().id: "STRING", + # The exact decimal's scale and precision are not important, as only + # the type ID matters, and it's the same for all decimal256 instances. + pyarrow.decimal128(38, scale=9).id: "NUMERIC", + pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC", +} + + +def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]: + detected_type = _ARROW_SCALAR_IDS_TO_BQ.get(type_.id, None) + if detected_type is not None: + return schema.SchemaField(name, detected_type) + + if pyarrow.types.is_list(type_): + return arrow_list_type_to_bigquery(name, type_) + + if pyarrow.types.is_struct(type_): + inner_fields: list[pyarrow.Field] = [] + struct_type = cast(pyarrow.StructType, type_) + for field_index in range(struct_type.num_fields): + field = struct_type[field_index] + inner_fields.append(arrow_type_to_bigquery_field(field.name, field.type)) + + return schema.SchemaField(name, "RECORD", fields=inner_fields) + + return None + + +def arrow_list_type_to_bigquery(name, type_) -> Optional[schema.SchemaField]: + inner_field = arrow_type_to_bigquery_field(name, type_.value_type) + if inner_field is None: + return None + + return schema.SchemaField( + name, inner_field.field_type, mode="REPEATED", fields=inner_field.fields + ) diff --git a/setup.py b/setup.py index df793e59..a4127a55 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,8 @@ "bqstorage": [ "google-cloud-bigquery-storage >=2.16.2, <3.0.0dev", ], - "tqdm": "tqdm>=4.23.0", + "tqdm": ["tqdm>=4.23.0"], + "geopandas": ["geopandas>=0.9.0", "Shapely>=1.8.4"], } # Setup boilerplate below this line. 
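The new ``geopandas`` extra feeds the GEOGRAPHY detection added above. A
minimal sketch of what it enables (the project, dataset, and table names are
placeholders):

    import geopandas
    import pandas_gbq
    from shapely import wkt

    df = geopandas.GeoDataFrame(
        {"name": ["foo"], "geo": [wkt.loads("POINT(1 1)")]},
        geometry="geo",
    )
    # The "geometry" dtype maps to GEOGRAPHY in _PANDAS_DTYPE_TO_BQ, so no
    # explicit table_schema is needed.
    pandas_gbq.to_gbq(df, "my_dataset.my_table", project_id="my-project")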
diff --git a/testing/constraints-3.8.txt b/testing/constraints-3.8.txt index e551d17e..068480fd 100644 --- a/testing/constraints-3.8.txt +++ b/testing/constraints-3.8.txt @@ -7,6 +7,7 @@ # Then this file should have foo==1.14.0 # protobuf==3.19.5 db-dtypes==1.0.4 +geopandas==0.9.0 google-api-core==2.10.2 google-auth==2.13.0 google-auth-oauthlib==0.7.0 @@ -16,5 +17,6 @@ numpy==1.18.1 pandas==1.1.4 pyarrow==3.0.0 pydata-google-auth==1.5.0 +Shapely==1.8.4 tqdm==4.23.0 -packaging==22.0.0 \ No newline at end of file +packaging==22.0.0 diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index 2e7245d5..0bcd8780 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -10,6 +10,7 @@ import db_dtypes import pandas import pandas.testing +import pyarrow import pytest pytest.importorskip("google.cloud.bigquery", minversion="1.24.0") @@ -125,6 +126,37 @@ def test_series_round_trip( ) DATAFRAME_ROUND_TRIPS = [ + # Ensure that a BOOLEAN column can be written with bool, boolean, and + # object dtypes. See: + # https://github.com/googleapis/python-bigquery-pandas/issues/105 + pytest.param( + *DataFrameRoundTripTestCase( + input_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "bool_col": pandas.Series( + [True, False, True], + dtype="bool", + ), + "boolean_col": pandas.Series( + [None, True, False], + dtype="boolean", + ), + "object_col": pandas.Series( + [False, None, True], + dtype="object", + ), + } + ), + table_schema=[ + {"name": "bool_col", "type": "BOOLEAN"}, + {"name": "boolean_col", "type": "BOOLEAN"}, + {"name": "object_col", "type": "BOOLEAN"}, + ], + api_methods={"load_csv", "load_parquet"}, + ), + id="boolean", + ), # Ensure that a DATE column can be written with datetime64[ns] dtype # data. See: # https://github.com/googleapis/python-bigquery-pandas/issues/362 @@ -176,6 +208,96 @@ def test_series_round_trip( {"name": "date_col", "type": "DATE"}, ], ), + # Loading an INTEGER column should work for any integer dtype. See: + # https://github.com/googleapis/python-bigquery-pandas/issues/616 + pytest.param( + *DataFrameRoundTripTestCase( + input_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "object": pandas.Series( + [None, 1, -2], + dtype="object", + ), + "nullable_int64": pandas.Series( + [3, None, -4], + dtype="Int64", + ), + "int8": pandas.Series( + [5, -6, 7], + dtype="int8", + ), + "int16": pandas.Series( + [-8, 9, -10], + dtype="int16", + ), + "int32": pandas.Series( + [11, -12, 13], + dtype="int32", + ), + "int64": pandas.Series( + [-14, 15, -16], + dtype="int64", + ), + "uint8": pandas.Series( + [0, 1, 2], + dtype="uint8", + ), + "uint16": pandas.Series( + [3, 4, 5], + dtype="uint16", + ), + "uint32": pandas.Series( + [6, 7, 8], + dtype="uint32", + ), + } + ), + expected_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "object": pandas.Series( + [None, 1, -2], + dtype="Int64", + ), + "nullable_int64": pandas.Series( + [3, None, -4], + dtype="Int64", + ), + "int8": pandas.Series( + [5, -6, 7], + dtype="Int64", + ), + "int16": pandas.Series( + [-8, 9, -10], + dtype="Int64", + ), + "int32": pandas.Series( + [11, -12, 13], + dtype="Int64", + ), + "int64": pandas.Series( + [-14, 15, -16], + dtype="Int64", + ), + "uint8": pandas.Series( + [0, 1, 2], + dtype="Int64", + ), + "uint16": pandas.Series( + [3, 4, 5], + dtype="Int64", + ), + "uint32": pandas.Series( + [6, 7, 8], + dtype="Int64", + ), + } + ), + api_methods={"load_csv", "load_parquet"}, + ), + id="integer", + ), # Loading a NUMERIC column should work for floating point objects. 
See: # https://github.com/googleapis/python-bigquery-pandas/issues/421 DataFrameRoundTripTestCase( @@ -240,6 +362,115 @@ def test_series_round_trip( ), id="issue365-extreme-datetimes", ), + pytest.param( + # Load STRUCT and ARRAY using either object column or ArrowDtype. + # See: https://github.com/googleapis/python-bigquery-pandas/issues/452 + *DataFrameRoundTripTestCase( + input_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "object_struct": pandas.Series( + [{"test": "str1"}, {"test": "str2"}, {"test": "str3"}], + dtype="object", + ), + # Array of DATETIME requires inspection into list elements. + # See: + # https://github.com/googleapis/python-bigquery/pull/1061 + "object_array_datetime": pandas.Series( + [[], [datetime.datetime(1998, 9, 4, 12, 0, 0)], []], + dtype="object", + ), + "object_array_of_struct": pandas.Series( + [[], [{"test": "str4"}], []], dtype="object" + ), + "arrow_struct": pandas.Series( + [ + {"version": 1, "project": "pandas"}, + {"version": 2, "project": "pandas"}, + {"version": 1, "project": "numpy"}, + ], + dtype=pandas.ArrowDtype( + pyarrow.struct( + [ + ("version", pyarrow.int64()), + ("project", pyarrow.string()), + ] + ) + ), + ), + "arrow_array": pandas.Series( + [[1, 2, 3], None, [4, 5, 6]], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.int64()), + ), + ), + "arrow_array_of_struct": pandas.Series( + [ + [{"test": "str5"}], + None, + [{"test": "str6"}, {"test": "str7"}], + ], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), + ), + ), + }, + ), + expected_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "object_struct": pandas.Series( + [{"test": "str1"}, {"test": "str2"}, {"test": "str3"}], + dtype=pandas.ArrowDtype( + pyarrow.struct([("test", pyarrow.string())]), + ), + ), + # Array of DATETIME requires inspection into list elements. + # See: + # https://github.com/googleapis/python-bigquery/pull/1061 + "object_array_datetime": pandas.Series( + [[], [datetime.datetime(1998, 9, 4, 12, 0, 0)], []], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.timestamp("us"))), + ), + "object_array_of_struct": pandas.Series( + [[], [{"test": "str4"}], []], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), + ), + ), + "arrow_struct": pandas.Series( + [ + {"version": 1, "project": "pandas"}, + {"version": 2, "project": "pandas"}, + {"version": 1, "project": "numpy"}, + ], + dtype=pandas.ArrowDtype( + pyarrow.struct( + [ + ("version", pyarrow.int64()), + ("project", pyarrow.string()), + ] + ) + ), + ), + "arrow_array": pandas.Series( + [[1, 2, 3], [], [4, 5, 6]], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.int64()), + ), + ), + "arrow_array_of_struct": pandas.Series( + [[{"test": "str5"}], [], [{"test": "str6"}, {"test": "str7"}]], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), + ), + ), + }, + ), + api_methods={"load_parquet"}, + ), + id="struct", + ), ] @@ -264,13 +495,20 @@ def test_dataframe_round_trip_with_table_schema( method_under_test( input_df, table_id, table_schema=table_schema, api_method=api_method ) - round_trip = read_gbq( - table_id, - dtypes=dict(zip(expected_df.columns, expected_df.dtypes)), - # BigQuery Storage API is required to avoid out-of-bound due to extra - # day from rounding error which was fixed in google-cloud-bigquery - # 2.6.0. 
https://github.com/googleapis/python-bigquery/pull/402 - use_bqstorage_api=True, + round_trip = ( + read_gbq( + table_id, + dtypes=dict(zip(expected_df.columns, expected_df.dtypes)), + # BigQuery Storage API is required to avoid out-of-bound due to extra + # day from rounding error which was fixed in google-cloud-bigquery + # 2.6.0. https://github.com/googleapis/python-bigquery/pull/402 + use_bqstorage_api=True, + ) + .set_index("row_num") + .sort_index() + ) + + # TODO(tswast): Support writing index columns if to_gbq(index=True). + pandas.testing.assert_frame_equal( + expected_df.set_index("row_num").sort_index(), round_trip ) - round_trip.sort_values("row_num", inplace=True) - pandas.testing.assert_frame_equal(expected_df, round_trip) diff --git a/tests/unit/schema/__init__.py b/tests/unit/schema/__init__.py new file mode 100644 index 00000000..02d26e8e --- /dev/null +++ b/tests/unit/schema/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. diff --git a/tests/unit/schema/test_pandas_to_bigquery.py b/tests/unit/schema/test_pandas_to_bigquery.py new file mode 100644 index 00000000..f9b1ddf4 --- /dev/null +++ b/tests/unit/schema/test_pandas_to_bigquery.py @@ -0,0 +1,156 @@ +# Copyright (c) 2019 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import collections +import datetime +import operator + +import pandas +import pytest +from google.cloud.bigquery import schema + + +@pytest.fixture +def module_under_test(): + from pandas_gbq.schema import pandas_to_bigquery + + return pandas_to_bigquery + + +def test_dataframe_to_bigquery_fields_w_named_index(module_under_test): + df_data = collections.OrderedDict( + [ + ("str_column", ["hello", "world"]), + ("int_column", [42, 8]), + ("bool_column", [True, False]), + ] + ) + index = pandas.Index(["a", "b"], name="str_index") + dataframe = pandas.DataFrame(df_data, index=index) + + returned_schema = module_under_test.dataframe_to_bigquery_fields( + dataframe, [], index=True + ) + + expected_schema = ( + schema.SchemaField("str_index", "STRING", "NULLABLE"), + schema.SchemaField("str_column", "STRING", "NULLABLE"), + schema.SchemaField("int_column", "INTEGER", "NULLABLE"), + schema.SchemaField("bool_column", "BOOLEAN", "NULLABLE"), + ) + assert returned_schema == expected_schema + + +def test_dataframe_to_bigquery_fields_w_multiindex(module_under_test): + df_data = collections.OrderedDict( + [ + ("str_column", ["hello", "world"]), + ("int_column", [42, 8]), + ("bool_column", [True, False]), + ] + ) + index = pandas.MultiIndex.from_tuples( + [ + ("a", 0, datetime.datetime(1999, 12, 31, 23, 59, 59, 999999)), + ("a", 0, datetime.datetime(2000, 1, 1, 0, 0, 0)), + ], + names=["str_index", "int_index", "dt_index"], + ) + dataframe = pandas.DataFrame(df_data, index=index) + + returned_schema = module_under_test.dataframe_to_bigquery_fields( + dataframe, [], index=True + ) + + expected_schema = ( + schema.SchemaField("str_index", "STRING", "NULLABLE"), + schema.SchemaField("int_index", "INTEGER", "NULLABLE"), + schema.SchemaField("dt_index", "DATETIME", "NULLABLE"), + schema.SchemaField("str_column", "STRING", "NULLABLE"), + schema.SchemaField("int_column", "INTEGER", "NULLABLE"), + schema.SchemaField("bool_column", "BOOLEAN", "NULLABLE"), + ) + assert returned_schema == expected_schema + + +def 
test_dataframe_to_bigquery_fields_w_bq_schema(module_under_test): + df_data = collections.OrderedDict( + [ + ("str_column", ["hello", "world"]), + ("int_column", [42, 8]), + ("bool_column", [True, False]), + ] + ) + dataframe = pandas.DataFrame(df_data) + + dict_schema = [ + {"name": "str_column", "type": "STRING", "mode": "NULLABLE"}, + {"name": "bool_column", "type": "BOOL", "mode": "REQUIRED"}, + ] + + returned_schema = module_under_test.dataframe_to_bigquery_fields( + dataframe, dict_schema + ) + + expected_schema = ( + schema.SchemaField("str_column", "STRING", "NULLABLE"), + schema.SchemaField("int_column", "INTEGER", "NULLABLE"), + schema.SchemaField("bool_column", "BOOL", "REQUIRED"), + ) + assert returned_schema == expected_schema + + +def test_dataframe_to_bigquery_fields_fallback_needed_w_pyarrow(module_under_test): + dataframe = pandas.DataFrame( + data=[ + {"id": 10, "status": "FOO", "created_at": datetime.date(2019, 5, 10)}, + {"id": 20, "status": "BAR", "created_at": datetime.date(2018, 9, 12)}, + ] + ) + + detected_schema = module_under_test.dataframe_to_bigquery_fields( + dataframe, override_bigquery_fields=[] + ) + expected_schema = ( + schema.SchemaField("id", "INTEGER", mode="NULLABLE"), + schema.SchemaField("status", "STRING", mode="NULLABLE"), + schema.SchemaField("created_at", "DATE", mode="NULLABLE"), + ) + by_name = operator.attrgetter("name") + assert sorted(detected_schema, key=by_name) == sorted(expected_schema, key=by_name) + + +def test_dataframe_to_bigquery_fields_w_extra_fields(module_under_test): + with pytest.raises(ValueError) as exc_context: + module_under_test.dataframe_to_bigquery_fields( + pandas.DataFrame(), + override_bigquery_fields=(schema.SchemaField("not_in_df", "STRING"),), + ) + message = str(exc_context.value) + assert ( + "Provided BigQuery fields contain field(s) not present in DataFrame:" in message + ) + assert "not_in_df" in message + + +def test_dataframe_to_bigquery_fields_geography(module_under_test): + geopandas = pytest.importorskip("geopandas") + from shapely import wkt + + df = geopandas.GeoDataFrame( + pandas.DataFrame( + dict( + name=["foo", "bar"], + geo1=[None, None], + geo2=[None, wkt.loads("Point(1 1)")], + ) + ), + geometry="geo1", + ) + bq_schema = module_under_test.dataframe_to_bigquery_fields(df, []) + assert bq_schema == ( + schema.SchemaField("name", "STRING"), + schema.SchemaField("geo1", "GEOGRAPHY"), + schema.SchemaField("geo2", "GEOGRAPHY"), + ) diff --git a/tests/unit/schema/test_pyarrow_to_bigquery.py b/tests/unit/schema/test_pyarrow_to_bigquery.py new file mode 100644 index 00000000..9a20e342 --- /dev/null +++ b/tests/unit/schema/test_pyarrow_to_bigquery.py @@ -0,0 +1,25 @@ +# Copyright (c) 2024 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import pyarrow + +from pandas_gbq.schema import pyarrow_to_bigquery + + +def test_arrow_type_to_bigquery_field_unknown(): + # Default types should be picked at a higher layer. + assert ( + pyarrow_to_bigquery.arrow_type_to_bigquery_field("test_name", pyarrow.null()) + is None + ) + + +def test_arrow_type_to_bigquery_field_list_of_unknown(): + # Default types should be picked at a higher layer. 
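+    # pyarrow.list_(pyarrow.null()) carries no usable element type, so the
+    # helper returns None rather than guessing a REPEATED type.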
+ assert ( + pyarrow_to_bigquery.arrow_type_to_bigquery_field( + "test_name", pyarrow.list_(pyarrow.null()) + ) + is None + ) diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py index 45c73533..bb611781 100644 --- a/tests/unit/test_load.py +++ b/tests/unit/test_load.py @@ -165,11 +165,8 @@ def test_load_csv_from_file_generates_schema(mock_bigquery_client): assert sent_schema[2].field_type == "FLOAT" assert sent_schema[3].name == "string_col" assert sent_schema[3].field_type == "STRING" - # TODO: Disambiguate TIMESTAMP from DATETIME based on if column is - # localized or at least use field type from table metadata. See: - # https://github.com/googleapis/python-bigquery-pandas/issues/450 assert sent_schema[4].name == "datetime_col" - assert sent_schema[4].field_type == "TIMESTAMP" + assert sent_schema[4].field_type == "DATETIME" assert sent_schema[5].name == "timestamp_col" assert sent_schema[5].field_type == "TIMESTAMP" diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 7fdc616c..48e8862a 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -7,14 +7,12 @@ import google.cloud.bigquery import pandas +import pyarrow import pytest - -@pytest.fixture -def module_under_test(): - import pandas_gbq.schema - - return pandas_gbq.schema +import pandas_gbq +import pandas_gbq.gbq +import pandas_gbq.schema @pytest.mark.parametrize( @@ -45,17 +43,15 @@ def module_under_test(): ), ], ) -def test_schema_is_subset_passes_if_subset( - module_under_test, original_fields, dataframe_fields -): +def test_schema_is_subset_passes_if_subset(original_fields, dataframe_fields): # Issue #24 schema_is_subset indicates whether the schema of the # dataframe is a subset of the schema of the bigquery table table_schema = {"fields": original_fields} tested_schema = {"fields": dataframe_fields} - assert module_under_test.schema_is_subset(table_schema, tested_schema) + assert pandas_gbq.schema.schema_is_subset(table_schema, tested_schema) -def test_schema_is_subset_fails_if_not_subset(module_under_test): +def test_schema_is_subset_fails_if_not_subset(): table_schema = { "fields": [ {"name": "A", "type": "FLOAT"}, @@ -66,12 +62,17 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test): tested_schema = { "fields": [{"name": "A", "type": "FLOAT"}, {"name": "C", "type": "FLOAT"}] } - assert not module_under_test.schema_is_subset(table_schema, tested_schema) + assert not pandas_gbq.schema.schema_is_subset(table_schema, tested_schema) @pytest.mark.parametrize( "dataframe,expected_schema", [ + pytest.param( + pandas.DataFrame(data={"col1": [object()]}), + {"fields": [{"name": "col1", "type": "STRING"}]}, + id="default-type-fails-pyarrow-conversion", + ), ( pandas.DataFrame(data={"col1": [1, 2, 3]}), {"fields": [{"name": "col1", "type": "INTEGER"}]}, @@ -88,13 +89,39 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test): pandas.DataFrame(data={"col1": ["hello", "world"]}), {"fields": [{"name": "col1", "type": "STRING"}]}, ), - ( - pandas.DataFrame(data={"col1": [datetime.datetime.now()]}), - {"fields": [{"name": "col1", "type": "TIMESTAMP"}]}, + pytest.param( + # No time zone -> DATETIME, + # Time zone -> TIMESTAMP + # See: https://github.com/googleapis/python-bigquery-pandas/issues/450 + pandas.DataFrame( + data={ + "object1": pandas.Series([datetime.datetime.now()], dtype="object"), + "object2": pandas.Series( + [datetime.datetime.now(datetime.timezone.utc)], dtype="object" + ), + "datetime1": pandas.Series( + [datetime.datetime.now()], 
dtype="datetime64[ns]" + ), + "datetime2": pandas.Series( + [datetime.datetime.now(datetime.timezone.utc)], + dtype="datetime64[ns, UTC]", + ), + } + ), + { + "fields": [ + {"name": "object1", "type": "DATETIME"}, + {"name": "object2", "type": "TIMESTAMP"}, + {"name": "datetime1", "type": "DATETIME"}, + {"name": "datetime2", "type": "TIMESTAMP"}, + ] + }, + id="issue450-datetime", ), ( pandas.DataFrame( data={ + "col0": [datetime.datetime.now(datetime.timezone.utc)], "col1": [datetime.datetime.now()], "col2": ["hello"], "col3": [3.14], @@ -104,7 +131,8 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test): ), { "fields": [ - {"name": "col1", "type": "TIMESTAMP"}, + {"name": "col0", "type": "TIMESTAMP"}, + {"name": "col1", "type": "DATETIME"}, {"name": "col2", "type": "STRING"}, {"name": "col3", "type": "FLOAT"}, {"name": "col4", "type": "BOOLEAN"}, @@ -112,10 +140,83 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test): ] }, ), + pytest.param( + # uint8, which is the result from get_dummies, should be INTEGER. + # https://github.com/googleapis/python-bigquery-pandas/issues/616 + pandas.DataFrame({"col": [0, 1]}, dtype="uint8"), + {"fields": [{"name": "col", "type": "INTEGER"}]}, + id="issue616-uint8", + ), + pytest.param( + # object column containing dictionaries should load to STRUCT. + # https://github.com/googleapis/python-bigquery-pandas/issues/452 + pandas.DataFrame( + { + "my_struct": pandas.Series( + [{"test": "str1"}, {"test": "str2"}, {"test": "str3"}], + dtype="object", + ), + } + ), + { + "fields": [ + { + "name": "my_struct", + "type": "RECORD", + "fields": [ + {"name": "test", "type": "STRING", "mode": "NULLABLE"} + ], + } + ] + }, + id="issue452-struct", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series([[], ["abc"], []], dtype="object"), + "list": pandas.Series( + [[], [1, 2, 3], []], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.int64())) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + "list_of_struct": pandas.Series( + [[], [{"test": "abc"}], []], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])) + ) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + } + ), + { + "fields": [ + {"name": "object", "type": "STRING", "mode": "REPEATED"}, + {"name": "list", "type": "INTEGER", "mode": "REPEATED"}, + { + "name": "list_of_struct", + "type": "RECORD", + "mode": "REPEATED", + "fields": [ + {"name": "test", "type": "STRING", "mode": "NULLABLE"}, + ], + }, + ], + }, + id="array", + ), ], ) -def test_generate_bq_schema(module_under_test, dataframe, expected_schema): - schema = module_under_test.generate_bq_schema(dataframe) +def test_generate_bq_schema(dataframe, expected_schema): + schema = pandas_gbq.gbq._generate_bq_schema(dataframe) + + # NULLABLE is the default mode. 
+ for field in expected_schema["fields"]: + if "mode" not in field: + field["mode"] = "NULLABLE" + assert schema == expected_schema @@ -156,8 +257,8 @@ def test_generate_bq_schema(module_under_test, dataframe, expected_schema): ), ], ) -def test_update_schema(module_under_test, schema_old, schema_new, expected_output): - output = module_under_test.update_schema(schema_old, schema_new) +def test_update_schema(schema_old, schema_new, expected_output): + output = pandas_gbq.schema.update_schema(schema_old, schema_new) assert output == expected_output From a61265822d7c9e6f260d4cd17b773d39af59314d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 20 Sep 2024 16:43:27 -0500 Subject: [PATCH 2/3] fix tests on older pandas --- tests/system/test_to_gbq.py | 36 +++++++++++++++----- tests/unit/schema/test_pandas_to_bigquery.py | 2 +- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index 0bcd8780..6352fbd7 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -396,13 +396,17 @@ def test_series_round_trip( ("project", pyarrow.string()), ] ) - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), "arrow_array": pandas.Series( [[1, 2, 3], None, [4, 5, 6]], dtype=pandas.ArrowDtype( pyarrow.list_(pyarrow.int64()), - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), "arrow_array_of_struct": pandas.Series( [ @@ -412,7 +416,9 @@ def test_series_round_trip( ], dtype=pandas.ArrowDtype( pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), }, ), @@ -423,20 +429,26 @@ def test_series_round_trip( [{"test": "str1"}, {"test": "str2"}, {"test": "str3"}], dtype=pandas.ArrowDtype( pyarrow.struct([("test", pyarrow.string())]), - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), # Array of DATETIME requires inspection into list elements. 
# See: # https://github.com/googleapis/python-bigquery/pull/1061 "object_array_datetime": pandas.Series( [[], [datetime.datetime(1998, 9, 4, 12, 0, 0)], []], - dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.timestamp("us"))), + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.timestamp("us"))) + if hasattr(pandas, "ArrowDtype") + else "object", ), "object_array_of_struct": pandas.Series( [[], [{"test": "str4"}], []], dtype=pandas.ArrowDtype( pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), "arrow_struct": pandas.Series( [ @@ -451,19 +463,25 @@ def test_series_round_trip( ("project", pyarrow.string()), ] ) - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), "arrow_array": pandas.Series( [[1, 2, 3], [], [4, 5, 6]], dtype=pandas.ArrowDtype( pyarrow.list_(pyarrow.int64()), - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), "arrow_array_of_struct": pandas.Series( [[{"test": "str5"}], [], [{"test": "str6"}, {"test": "str7"}]], dtype=pandas.ArrowDtype( pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), - ), + ) + if hasattr(pandas, "ArrowDtype") + else "object", ), }, ), diff --git a/tests/unit/schema/test_pandas_to_bigquery.py b/tests/unit/schema/test_pandas_to_bigquery.py index f9b1ddf4..924ce1ee 100644 --- a/tests/unit/schema/test_pandas_to_bigquery.py +++ b/tests/unit/schema/test_pandas_to_bigquery.py @@ -6,9 +6,9 @@ import datetime import operator +from google.cloud.bigquery import schema import pandas import pytest -from google.cloud.bigquery import schema @pytest.fixture From 39cb3017aea3e1038f158834785ebb098b020f07 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 23 Sep 2024 10:12:05 -0500 Subject: [PATCH 3/3] deps: min pyarrow is now 4.0.0 to support compliant nested types --- setup.py | 2 +- testing/constraints-3.8.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a4127a55..10d97733 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ "db-dtypes >=1.0.4,<2.0.0", "numpy >=1.18.1", "pandas >=1.1.4", - "pyarrow >=3.0.0", + "pyarrow >=4.0.0", "pydata-google-auth >=1.5.0", # Note: google-api-core and google-auth are also included via transitive # dependency on google-cloud-bigquery, but this library also uses them diff --git a/testing/constraints-3.8.txt b/testing/constraints-3.8.txt index 068480fd..8d6ef4f4 100644 --- a/testing/constraints-3.8.txt +++ b/testing/constraints-3.8.txt @@ -15,7 +15,7 @@ google-cloud-bigquery==3.4.2 google-cloud-bigquery-storage==2.16.2 numpy==1.18.1 pandas==1.1.4 -pyarrow==3.0.0 +pyarrow==4.0.0 pydata-google-auth==1.5.0 Shapely==1.8.4 tqdm==4.23.0
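Taken together, the behavior changes in this series can be checked without a
BigQuery round trip by calling the new detection helper directly. A minimal
sketch (column names are arbitrary):

    import datetime

    import pandas

    from pandas_gbq.schema import pandas_to_bigquery

    df = pandas.DataFrame(
        {
            "uint8_col": pandas.Series([0, 1], dtype="uint8"),
            "naive_dt": pandas.Series(
                [datetime.datetime(2024, 1, 1)] * 2, dtype="datetime64[ns]"
            ),
            "bool_obj": pandas.Series([True, False], dtype="object"),
            "struct_col": pandas.Series([{"test": "a"}, {"test": "b"}]),
        }
    )
    fields = pandas_to_bigquery.dataframe_to_bigquery_fields(df, [])
    # Per the unit tests above: uint8_col -> INTEGER, naive_dt -> DATETIME,
    # bool_obj -> BOOLEAN, struct_col -> RECORD. Before this series these
    # columns loaded as STRING (and naive_dt as TIMESTAMP).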