@@ -1196,11 +1196,11 @@ def _read_parquet_init(
     path: Union[str, List[str]],
     filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
     categories: List[str] = None,
+    validate_schema: bool = True,
     dataset: bool = False,
     use_threads: bool = True,
     boto3_session: Optional[boto3.Session] = None,
     s3_additional_kwargs: Optional[Dict[str, str]] = None,
-    validate_schema: bool = True,
 ) -> pyarrow.parquet.ParquetDataset:
     """Encapsulate all initialization before the use of the pyarrow.parquet.ParquetDataset."""
     if dataset is False:
@@ -1227,13 +1227,13 @@ def read_parquet(
     path: Union[str, List[str]],
     filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None,
     columns: Optional[List[str]] = None,
+    validate_schema: bool = True,
     chunked: bool = False,
     dataset: bool = False,
     categories: List[str] = None,
     use_threads: bool = True,
     boto3_session: Optional[boto3.Session] = None,
     s3_additional_kwargs: Optional[Dict[str, str]] = None,
-    validate_schema: bool = True,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     """Read Apache Parquet file(s) from a received S3 prefix or list of S3 objects paths.

@@ -1251,7 +1251,11 @@ def read_parquet(
     filters: Union[List[Tuple], List[List[Tuple]]], optional
         List of filters to apply, like ``[[('x', '=', 0), ...], ...]``.
     columns : List[str], optional
-        Names of columns to read from the file(s)
+        Names of columns to read from the file(s).
+    validate_schema:
+        Check that individual file schemas are all the same / compatible. Schemas within a
+        folder prefix should all be the same. Set to False to skip this check when the files
+        are known to have different schemas.
     chunked : bool
         If True will break the data into smaller DataFrames (non-deterministic number of lines).
         Otherwise return a single DataFrame with the whole data.
@@ -1268,10 +1272,6 @@ def read_parquet(
     s3_additional_kwargs:
         Forward to s3fs, useful for server side encryption
         https://s3fs.readthedocs.io/en/latest/#serverside-encryption
-    validate_schema:
-        Check that individual file schemas are all the same / compatible. Schemas within a
-        folder prefix should all be the same. Disable if you have schemas that are different
-        and want to disable this check.

     Returns
     -------
@@ -1320,7 +1320,9 @@ def read_parquet(
         validate_schema=validate_schema,
     )
     if chunked is False:
-        return _read_parquet(data=data, columns=columns, categories=categories, use_threads=use_threads)
+        return _read_parquet(
+            data=data, columns=columns, categories=categories, use_threads=use_threads, validate_schema=validate_schema
+        )
     return _read_parquet_chunked(data=data, columns=columns, categories=categories, use_threads=use_threads)


@@ -1329,14 +1331,16 @@ def _read_parquet(
     columns: Optional[List[str]] = None,
     categories: List[str] = None,
     use_threads: bool = True,
+    validate_schema: bool = True,
 ) -> pd.DataFrame:
     tables: List[pa.Table] = []
     for piece in data.pieces:
         table: pa.Table = piece.read(
             columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False
         )
         tables.append(table)
-    table = pa.lib.concat_tables(tables)
+    promote: bool = not validate_schema
+    table = pa.lib.concat_tables(tables, promote=promote)
     return table.to_pandas(
         use_threads=use_threads,
         split_blocks=True,
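
The `promote` flag passed to `pa.lib.concat_tables` in the last hunk is what makes `validate_schema=False` effective: with promote enabled, pyarrow unifies mismatched schemas by null-filling the columns a table is missing instead of raising on the first mismatch. A minimal sketch of that behavior (the `promote` keyword matches the pyarrow releases this code targets; newer pyarrow versions replaced it with `promote_options`):

import pyarrow as pa

t1 = pa.table({"x": [1, 2]})           # no "y" column
t2 = pa.table({"x": [3], "y": ["a"]})  # extra "y" column

# promote=False (the default) raises ArrowInvalid on mismatched schemas;
# promote=True unifies the schemas and fills missing columns with nulls.
merged = pa.concat_tables([t1, t2], promote=True)
print(merged.to_pandas())
#    x     y
# 0  1  None
# 1  2  None
# 2  3     a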
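From the caller's side, assuming this is awswrangler's `wr.s3.read_parquet` (the bucket and prefix below are hypothetical), the reordered parameter would be used like this:

import awswrangler as wr

# Hypothetical prefix whose Parquet files do not all share one schema.
df = wr.s3.read_parquet(
    path="s3://some-bucket/some-prefix/",
    dataset=True,
    validate_schema=False,  # skip the per-file schema equality check
)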