@@ -251,6 +251,7 @@ def _read_parquet(
251251 data : pyarrow .parquet .ParquetDataset ,
252252 columns : Optional [List [str ]] = None ,
253253 categories : List [str ] = None ,
254+ safe : bool = True ,
254255 use_threads : bool = True ,
255256 validate_schema : bool = True ,
256257) -> pd .DataFrame :
@@ -274,6 +275,7 @@ def _read_parquet(
274275 date_as_object = True ,
275276 ignore_metadata = True ,
276277 categories = categories ,
278+ safe = safe ,
277279 types_mapper = _data_types .pyarrow2pandas_extension ,
278280 )
279281
@@ -282,6 +284,7 @@ def _read_parquet_chunked(
282284 data : pyarrow .parquet .ParquetDataset ,
283285 columns : Optional [List [str ]] = None ,
284286 categories : List [str ] = None ,
287+ safe : bool = True ,
285288 chunked : Union [bool , int ] = True ,
286289 use_threads : bool = True ,
287290) -> Iterator [pd .DataFrame ]:
@@ -292,6 +295,7 @@ def _read_parquet_chunked(
292295 columns = columns , use_threads = use_threads , partitions = data .partitions , use_pandas_metadata = False
293296 ),
294297 categories = categories ,
298+ safe = safe ,
295299 use_threads = use_threads ,
296300 )
297301 if chunked is True :
@@ -310,7 +314,9 @@ def _read_parquet_chunked(
310314 yield next_slice
311315
312316
313- def _table2df (table : pa .Table , categories : List [str ] = None , use_threads : bool = True ) -> pd .DataFrame :
317+ def _table2df (
318+ table : pa .Table , categories : List [str ] = None , safe : bool = True , use_threads : bool = True
319+ ) -> pd .DataFrame :
314320 return table .to_pandas (
315321 use_threads = use_threads ,
316322 split_blocks = True ,
@@ -319,6 +325,7 @@ def _table2df(table: pa.Table, categories: List[str] = None, use_threads: bool =
319325 date_as_object = True ,
320326 ignore_metadata = True ,
321327 categories = categories ,
328+ safe = safe ,
322329 types_mapper = _data_types .pyarrow2pandas_extension ,
323330 )
324331
@@ -644,6 +651,7 @@ def read_parquet(
644651 chunked : Union [bool , int ] = False ,
645652 dataset : bool = False ,
646653 categories : List [str ] = None ,
654+ safe : bool = True ,
647655 use_threads : bool = True ,
648656 last_modified_begin : Optional [datetime .datetime ] = None ,
649657 last_modified_end : Optional [datetime .datetime ] = None ,
@@ -700,6 +708,11 @@ def read_parquet(
700708 categories: List[str], optional
701709 List of columns names that should be returned as pandas.Categorical.
702710 Recommended for memory restricted environments.
711+ safe : bool, default True
712+ For certain data types, a cast is needed in order to store the
713+ data in a pandas DataFrame or Series (e.g. timestamps are always
714+ stored as nanoseconds in pandas). This option controls whether it
715+ is a safe cast or not.
703716 use_threads : bool
704717 True to enable concurrent requests, False to disable multiple threads.
705718 If enabled os.cpu_count() will be used as the max number of threads.
@@ -773,10 +786,15 @@ def read_parquet(
773786 _logger .debug ("pyarrow.parquet.ParquetDataset initialized." )
774787 if chunked is False :
775788 return _read_parquet (
776- data = data , columns = columns , categories = categories , use_threads = use_threads , validate_schema = validate_schema
789+ data = data ,
790+ columns = columns ,
791+ categories = categories ,
792+ safe = safe ,
793+ use_threads = use_threads ,
794+ validate_schema = validate_schema ,
777795 )
778796 return _read_parquet_chunked (
779- data = data , columns = columns , categories = categories , chunked = chunked , use_threads = use_threads
797+ data = data , columns = columns , categories = categories , safe = safe , chunked = chunked , use_threads = use_threads
780798 )
781799
782800
@@ -863,6 +881,7 @@ def read_parquet_table(
863881 columns : Optional [List [str ]] = None ,
864882 validate_schema : bool = True ,
865883 categories : List [str ] = None ,
884+ safe : bool = True ,
866885 chunked : Union [bool , int ] = False ,
867886 use_threads : bool = True ,
868887 boto3_session : Optional [boto3 .Session ] = None ,
@@ -908,6 +927,11 @@ def read_parquet_table(
908927 categories: List[str], optional
909928 List of columns names that should be returned as pandas.Categorical.
910929 Recommended for memory restricted environments.
930+ safe : bool, default True
931+ For certain data types, a cast is needed in order to store the
932+ data in a pandas DataFrame or Series (e.g. timestamps are always
933+ stored as nanoseconds in pandas). This option controls whether it
934+ is a safe cast or not.
911935 chunked : bool
912936 If True will break the data in smaller DataFrames (Non deterministic number of lines).
913937 Otherwise return a single DataFrame with the whole data.
@@ -966,6 +990,7 @@ def read_parquet_table(
966990 columns = columns ,
967991 validate_schema = validate_schema ,
968992 categories = categories ,
993+ safe = safe ,
969994 chunked = chunked ,
970995 dataset = True ,
971996 use_threads = use_threads ,