Skip to content

Commit 13a2eae

Browse files
author
Tyler Jewell
committed
added validate_schema parameter to read_parquet
1 parent a4d23dc commit 13a2eae

File tree

1 file changed

+8
-1
lines changed

1 file changed

+8
-1
lines changed

awswrangler/s3.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1200,6 +1200,7 @@ def _read_parquet_init(
12001200
use_threads: bool = True,
12011201
boto3_session: Optional[boto3.Session] = None,
12021202
s3_additional_kwargs: Optional[Dict[str, str]] = None,
1203+
validate_schema: bool = True,
12031204
) -> pyarrow.parquet.ParquetDataset:
12041205
"""Encapsulate all initialization before the use of the pyarrow.parquet.ParquetDataset."""
12051206
if dataset is False:
@@ -1212,7 +1213,7 @@ def _read_parquet_init(
12121213
fs: s3fs.S3FileSystem = _utils.get_fs(session=boto3_session, s3_additional_kwargs=s3_additional_kwargs)
12131214
cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
12141215
data: pyarrow.parquet.ParquetDataset = pyarrow.parquet.ParquetDataset(
1215-
path_or_paths=path_or_paths, filesystem=fs, metadata_nthreads=cpus, filters=filters, read_dictionary=categories
1216+
path_or_paths=path_or_paths, filesystem=fs, metadata_nthreads=cpus, filters=filters, read_dictionary=categories, validate_schema=validate_schema
12161217
)
12171218
return data
12181219

@@ -1227,6 +1228,7 @@ def read_parquet(
12271228
use_threads: bool = True,
12281229
boto3_session: Optional[boto3.Session] = None,
12291230
s3_additional_kwargs: Optional[Dict[str, str]] = None,
1231+
validate_schema: bool = True
12301232
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
12311233
"""Read Apache Parquet file(s) from a received S3 prefix or list of S3 object paths.
12321234
@@ -1261,6 +1263,10 @@ def read_parquet(
12611263
s3_additional_kwargs:
12621264
Forward to s3fs, useful for server side encryption
12631265
https://s3fs.readthedocs.io/en/latest/#serverside-encryption
1266+
validate_schema:
1267+
Check that individual file schemas are all the same / compatible. Schemas within a
1268+
folder prefix should all be the same. Set this to False if you have files with
1269+
differing schemas and want to skip this validation.
12641270
12651271
Returns
12661272
-------
@@ -1306,6 +1312,7 @@ def read_parquet(
13061312
use_threads=use_threads,
13071313
boto3_session=boto3_session,
13081314
s3_additional_kwargs=s3_additional_kwargs,
1315+
validate_schema=validate_schema
13091316
)
13101317
if chunked is False:
13111318
return _read_parquet(data=data, columns=columns, categories=categories, use_threads=use_threads)

0 commit comments

Comments
 (0)