@@ -1200,6 +1200,7 @@ def _read_parquet_init(
12001200 use_threads : bool = True ,
12011201 boto3_session : Optional [boto3 .Session ] = None ,
12021202 s3_additional_kwargs : Optional [Dict [str , str ]] = None ,
1203+ validate_schema : bool = True ,
12031204) -> pyarrow .parquet .ParquetDataset :
12041205 """Encapsulate all initialization before the use of the pyarrow.parquet.ParquetDataset."""
12051206 if dataset is False :
@@ -1212,7 +1213,7 @@ def _read_parquet_init(
12121213 fs : s3fs .S3FileSystem = _utils .get_fs (session = boto3_session , s3_additional_kwargs = s3_additional_kwargs )
12131214 cpus : int = _utils .ensure_cpu_count (use_threads = use_threads )
12141215 data : pyarrow .parquet .ParquetDataset = pyarrow .parquet .ParquetDataset (
1215- path_or_paths = path_or_paths , filesystem = fs , metadata_nthreads = cpus , filters = filters , read_dictionary = categories
1216+ path_or_paths = path_or_paths , filesystem = fs , metadata_nthreads = cpus , filters = filters , read_dictionary = categories , validate_schema = validate_schema
12161217 )
12171218 return data
12181219
@@ -1227,6 +1228,7 @@ def read_parquet(
12271228 use_threads : bool = True ,
12281229 boto3_session : Optional [boto3 .Session ] = None ,
12291230 s3_additional_kwargs : Optional [Dict [str , str ]] = None ,
1231+ validate_schema : bool = True
12301232) -> Union [pd .DataFrame , Iterator [pd .DataFrame ]]:
12311233 """Read Apache Parquet file(s) from from a received S3 prefix or list of S3 objects paths.
12321234
@@ -1261,6 +1263,10 @@ def read_parquet(
12611263 s3_additional_kwargs:
12621264 Forward to s3fs, useful for server side encryption
12631265 https://s3fs.readthedocs.io/en/latest/#serverside-encryption
1266+ validate_schema:
1267+ Check that the individual file schemas are all the same / compatible with each
1268+ other. Schemas within a folder prefix should normally all be the same. Disable
1269+ this check if your files have differing schemas and you want to read them anyway.
12641270
12651271 Returns
12661272 -------
@@ -1306,6 +1312,7 @@ def read_parquet(
13061312 use_threads = use_threads ,
13071313 boto3_session = boto3_session ,
13081314 s3_additional_kwargs = s3_additional_kwargs ,
1315+ validate_schema = validate_schema
13091316 )
13101317 if chunked is False :
13111318 return _read_parquet (data = data , columns = columns , categories = categories , use_threads = use_threads )
0 commit comments