Skip to content

Commit 4ac86d5

Browse files
authored [author name lost in page extraction]
Feature: Add flag to disable pyarrow DataTypes conversion (#590)
1 parent 14b240c commit 4ac86d5

File tree

2 files changed

+28
-1
lines changed

2 files changed

+28
-1
lines changed

awswrangler/s3/_read_parquet.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ def _arrowtable2df(
238238
table: pa.Table,
239239
categories: Optional[List[str]],
240240
safe: bool,
241+
map_types: bool,
241242
use_threads: bool,
242243
dataset: bool,
243244
path: str,
@@ -257,7 +258,7 @@ def _arrowtable2df(
257258
strings_to_categorical=False,
258259
safe=safe,
259260
categories=categories,
260-
types_mapper=_data_types.pyarrow2pandas_extension,
261+
types_mapper=_data_types.pyarrow2pandas_extension if map_types else None,
261262
),
262263
dataset=dataset,
263264
path=path,
@@ -279,6 +280,7 @@ def _read_parquet_chunked(
279280
columns: Optional[List[str]],
280281
categories: Optional[List[str]],
281282
safe: bool,
283+
map_types: bool,
282284
boto3_session: boto3.Session,
283285
dataset: bool,
284286
path_root: Optional[str],
@@ -325,6 +327,7 @@ def _read_parquet_chunked(
325327
),
326328
categories=categories,
327329
safe=safe,
330+
map_types=map_types,
328331
use_threads=use_threads,
329332
dataset=dataset,
330333
path=path,
@@ -404,6 +407,7 @@ def _read_parquet(
404407
columns: Optional[List[str]],
405408
categories: Optional[List[str]],
406409
safe: bool,
410+
map_types: bool,
407411
boto3_session: boto3.Session,
408412
dataset: bool,
409413
path_root: Optional[str],
@@ -421,6 +425,7 @@ def _read_parquet(
421425
),
422426
categories=categories,
423427
safe=safe,
428+
map_types=map_types,
424429
use_threads=use_threads,
425430
dataset=dataset,
426431
path=path,
@@ -441,6 +446,7 @@ def read_parquet(
441446
dataset: bool = False,
442447
categories: Optional[List[str]] = None,
443448
safe: bool = True,
449+
map_types: bool = True,
444450
use_threads: bool = True,
445451
last_modified_begin: Optional[datetime.datetime] = None,
446452
last_modified_end: Optional[datetime.datetime] = None,
@@ -524,6 +530,10 @@ def read_parquet(
524530
data in a pandas DataFrame or Series (e.g. timestamps are always
525531
stored as nanoseconds in pandas). This option controls whether it
526532
is a safe cast or not.
533+
map_types : bool, default True
534+
True to convert pyarrow DataTypes to pandas ExtensionDtypes. It is
535+
used to override the default pandas type for conversion of built-in
536+
pyarrow types or in absence of pandas_metadata in the Table schema.
527537
use_threads : bool
528538
True to enable concurrent requests, False to disable multiple threads.
529539
If enabled os.cpu_count() will be used as the max number of threads.
@@ -597,6 +607,7 @@ def read_parquet(
597607
"columns": columns,
598608
"categories": categories,
599609
"safe": safe,
610+
"map_types": map_types,
600611
"boto3_session": session,
601612
"dataset": dataset,
602613
"path_root": path_root,
@@ -633,6 +644,7 @@ def read_parquet_table(
633644
validate_schema: bool = True,
634645
categories: Optional[List[str]] = None,
635646
safe: bool = True,
647+
map_types: bool = True,
636648
chunked: Union[bool, int] = False,
637649
use_threads: bool = True,
638650
boto3_session: Optional[boto3.Session] = None,
@@ -699,6 +711,10 @@ def read_parquet_table(
699711
data in a pandas DataFrame or Series (e.g. timestamps are always
700712
stored as nanoseconds in pandas). This option controls whether it
701713
is a safe cast or not.
714+
map_types : bool, default True
715+
True to convert pyarrow DataTypes to pandas ExtensionDtypes. It is
716+
used to override the default pandas type for conversion of built-in
717+
pyarrow types or in absence of pandas_metadata in the Table schema.
702718
chunked : bool
703719
If True will break the data in smaller DataFrames (Non deterministic number of lines).
704720
Otherwise return a single DataFrame with the whole data.
@@ -767,6 +783,7 @@ def read_parquet_table(
767783
validate_schema=validate_schema,
768784
categories=categories,
769785
safe=safe,
786+
map_types=map_types,
770787
chunked=chunked,
771788
dataset=True,
772789
use_threads=use_threads,

tests/test_s3_parquet.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,16 @@ def test_to_parquet_file_dtype(path, use_threads):
192192
assert str(df2.c1.dtype) == "string"
193193

194194

195+
def test_read_parquet_map_types(path):
196+
df = pd.DataFrame({"c0": [0, 1, 1, 2]}, dtype=np.int8)
197+
file_path = f"{path}0.parquet"
198+
wr.s3.to_parquet(df, file_path)
199+
df2 = wr.s3.read_parquet(file_path)
200+
assert str(df2.c0.dtype) == "Int8"
201+
df3 = wr.s3.read_parquet(file_path, map_types=False)
202+
assert str(df3.c0.dtype) == "int8"
203+
204+
195205
@pytest.mark.parametrize("use_threads", [True, False])
196206
@pytest.mark.parametrize("max_rows_by_file", [None, 0, 40, 250, 1000])
197207
def test_parquet_with_size(path, use_threads, max_rows_by_file):

0 commit comments

Comments (0)