Commit 94bc11b

Add safe arg to read_parquet(). #296
1 parent 71d720b commit 94bc11b

File tree

2 files changed: +30 -5 lines changed


awswrangler/s3/_read.py

Lines changed: 28 additions & 3 deletions
@@ -251,6 +251,7 @@ def _read_parquet(
     data: pyarrow.parquet.ParquetDataset,
     columns: Optional[List[str]] = None,
     categories: List[str] = None,
+    safe: bool = True,
     use_threads: bool = True,
     validate_schema: bool = True,
 ) -> pd.DataFrame:
@@ -274,6 +275,7 @@ def _read_parquet(
         date_as_object=True,
         ignore_metadata=True,
         categories=categories,
+        safe=safe,
         types_mapper=_data_types.pyarrow2pandas_extension,
     )

@@ -282,6 +284,7 @@ def _read_parquet_chunked(
     data: pyarrow.parquet.ParquetDataset,
     columns: Optional[List[str]] = None,
     categories: List[str] = None,
+    safe: bool = True,
     chunked: Union[bool, int] = True,
     use_threads: bool = True,
 ) -> Iterator[pd.DataFrame]:
@@ -292,6 +295,7 @@ def _read_parquet_chunked(
                 columns=columns, use_threads=use_threads, partitions=data.partitions, use_pandas_metadata=False
             ),
             categories=categories,
+            safe=safe,
             use_threads=use_threads,
         )
         if chunked is True:
@@ -310,7 +314,9 @@ def _read_parquet_chunked(
         yield next_slice


-def _table2df(table: pa.Table, categories: List[str] = None, use_threads: bool = True) -> pd.DataFrame:
+def _table2df(
+    table: pa.Table, categories: List[str] = None, safe: bool = True, use_threads: bool = True
+) -> pd.DataFrame:
     return table.to_pandas(
         use_threads=use_threads,
         split_blocks=True,
@@ -319,6 +325,7 @@ def _table2df(table: pa.Table, categories: List[str] = None, use_threads: bool =
         date_as_object=True,
         ignore_metadata=True,
         categories=categories,
+        safe=safe,
         types_mapper=_data_types.pyarrow2pandas_extension,
     )

@@ -644,6 +651,7 @@ def read_parquet(
     chunked: Union[bool, int] = False,
     dataset: bool = False,
     categories: List[str] = None,
+    safe: bool = True,
     use_threads: bool = True,
     last_modified_begin: Optional[datetime.datetime] = None,
     last_modified_end: Optional[datetime.datetime] = None,
@@ -700,6 +708,11 @@ def read_parquet(
     categories: List[str], optional
         List of columns names that should be returned as pandas.Categorical.
         Recommended for memory restricted environments.
+    safe : bool, default True
+        For certain data types, a cast is needed in order to store the
+        data in a pandas DataFrame or Series (e.g. timestamps are always
+        stored as nanoseconds in pandas). This option controls whether it
+        is a safe cast or not.
     use_threads : bool
         True to enable concurrent requests, False to disable multiple threads.
         If enabled os.cpu_count() will be used as the max number of threads.
@@ -773,10 +786,15 @@ def read_parquet(
     _logger.debug("pyarrow.parquet.ParquetDataset initialized.")
     if chunked is False:
         return _read_parquet(
-            data=data, columns=columns, categories=categories, use_threads=use_threads, validate_schema=validate_schema
+            data=data,
+            columns=columns,
+            categories=categories,
+            safe=safe,
+            use_threads=use_threads,
+            validate_schema=validate_schema,
         )
     return _read_parquet_chunked(
-        data=data, columns=columns, categories=categories, chunked=chunked, use_threads=use_threads
+        data=data, columns=columns, categories=categories, safe=safe, chunked=chunked, use_threads=use_threads
     )


@@ -863,6 +881,7 @@ def read_parquet_table(
     columns: Optional[List[str]] = None,
     validate_schema: bool = True,
     categories: List[str] = None,
+    safe: bool = True,
     chunked: Union[bool, int] = False,
     use_threads: bool = True,
     boto3_session: Optional[boto3.Session] = None,
@@ -908,6 +927,11 @@ def read_parquet_table(
     categories: List[str], optional
         List of columns names that should be returned as pandas.Categorical.
         Recommended for memory restricted environments.
+    safe : bool, default True
+        For certain data types, a cast is needed in order to store the
+        data in a pandas DataFrame or Series (e.g. timestamps are always
+        stored as nanoseconds in pandas). This option controls whether it
+        is a safe cast or not.
     chunked : bool
         If True will break the data in smaller DataFrames (Non deterministic number of lines).
         Otherwise return a single DataFrame with the whole data.
@@ -966,6 +990,7 @@ def read_parquet_table(
         columns=columns,
         validate_schema=validate_schema,
         categories=categories,
+        safe=safe,
         chunked=chunked,
         dataset=True,
         use_threads=use_threads,
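
With this change, the pyarrow safe-cast check that runs when an Arrow table is converted to pandas can be switched off from the awswrangler API, e.g. for timestamps that overflow pandas' nanosecond range. A minimal usage sketch of the new argument (the S3 path is hypothetical):

import awswrangler as wr

# safe=True (the default) keeps pyarrow's lossy-cast check, which raises on
# values such as timestamps that cannot be represented as pandas nanoseconds;
# safe=False asks pyarrow to perform the conversion anyway.
df = wr.s3.read_parquet(path="s3://my-bucket/my-dataset/", safe=False)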

requirements-dev.txt

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@ awscli>=1.18.0,<2.0.0
 black~=19.3b0
 pylint~=2.5.3
 flake8~=3.8.3
-mypy~=0.781
+mypy~=0.782
 isort~=4.3.21
 pydocstyle~=5.0.2
 doc8~=0.8.1
@@ -19,4 +19,4 @@ twine~=3.1.1
 sphinx~=3.1.1
 sphinx_bootstrap_theme~=0.7.1
 moto~=1.3.14
-jupyterlab~=2.1.4
+jupyterlab~=2.1.5
