Skip to content

Commit 5e04df0

Browse files
authored
Return DataFrame with unique index for Athena CTAS queries (#527)
1 parent 9382eb0 commit 5e04df0

File tree

2 files changed

+18
-7
lines changed

2 files changed

+18
-7
lines changed

awswrangler/athena/_read.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,12 @@ def _fetch_parquet_result(
227227
if not paths:
228228
return _empty_dataframe_response(bool(chunked), query_metadata)
229229
ret = s3.read_parquet(
230-
path=paths, use_threads=use_threads, boto3_session=boto3_session, chunked=chunked, categories=categories
230+
path=paths,
231+
use_threads=use_threads,
232+
boto3_session=boto3_session,
233+
chunked=chunked,
234+
categories=categories,
235+
ignore_index=True,
231236
)
232237
if chunked is False:
233238
ret = _apply_query_metadata(df=ret, query_metadata=query_metadata)

awswrangler/s3/_read_parquet.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -273,9 +273,10 @@ def _arrowtable2df(
273273
def _read_parquet_chunked(
274274
paths: List[str],
275275
chunked: Union[bool, int],
276+
validate_schema: bool,
277+
ignore_index: Optional[bool],
276278
columns: Optional[List[str]],
277279
categories: Optional[List[str]],
278-
validate_schema: bool,
279280
safe: bool,
280281
boto3_session: boto3.Session,
281282
dataset: bool,
@@ -331,7 +332,7 @@ def _read_parquet_chunked(
331332
yield df
332333
elif isinstance(chunked, int) and chunked > 0:
333334
if next_slice is not None:
334-
df = _union(dfs=[next_slice, df], ignore_index=None)
335+
df = _union(dfs=[next_slice, df], ignore_index=ignore_index)
335336
while len(df.index) >= chunked:
336337
yield df.iloc[:chunked]
337338
df = df.iloc[chunked:]
@@ -430,6 +431,7 @@ def read_parquet(
430431
path_suffix: Union[str, List[str], None] = None,
431432
path_ignore_suffix: Union[str, List[str], None] = None,
432433
ignore_empty: bool = True,
434+
ignore_index: Optional[bool] = None,
433435
partition_filter: Optional[Callable[[Dict[str, str]], bool]] = None,
434436
columns: Optional[List[str]] = None,
435437
validate_schema: bool = False,
@@ -489,6 +491,8 @@ def read_parquet(
489491
If None, will try to read all files. (default)
490492
ignore_empty: bool
491493
Ignore files with 0 bytes.
494+
ignore_index: Optional[bool]
495+
Ignore the index when combining multiple parquet files into one DataFrame.
492496
partition_filter: Optional[Callable[[Dict[str, str]], bool]]
493497
Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter).
494498
This function MUST receive a single argument (Dict[str, str]) where keys are partitions
@@ -596,7 +600,9 @@ def read_parquet(
596600
}
597601
_logger.debug("args:\n%s", pprint.pformat(args))
598602
if chunked is not False:
599-
return _read_parquet_chunked(paths=paths, chunked=chunked, validate_schema=validate_schema, **args)
603+
return _read_parquet_chunked(
604+
paths=paths, chunked=chunked, validate_schema=validate_schema, ignore_index=ignore_index, **args
605+
)
600606
if len(paths) == 1:
601607
return _read_parquet(path=paths[0], **args)
602608
if validate_schema is True:
@@ -607,7 +613,7 @@ def read_parquet(
607613
boto3_session=boto3_session,
608614
s3_additional_kwargs=s3_additional_kwargs,
609615
)
610-
return _union(dfs=[_read_parquet(path=p, **args) for p in paths], ignore_index=None)
616+
return _union(dfs=[_read_parquet(path=p, **args) for p in paths], ignore_index=ignore_index)
611617

612618

613619
@apply_configs
@@ -657,10 +663,10 @@ def read_parquet_table(
657663
AWS Glue Catalog table name.
658664
database : str
659665
AWS Glue Catalog database name.
660-
path_suffix: Union[str, List[str], None]
666+
filename_suffix: Union[str, List[str], None]
661667
Suffix or List of suffixes to be read (e.g. [".gz.parquet", ".snappy.parquet"]).
662668
If None, will try to read all files. (default)
663-
path_ignore_suffix: Union[str, List[str], None]
669+
filename_ignore_suffix: Union[str, List[str], None]
664670
Suffix or List of suffixes for S3 keys to be ignored (e.g. [".csv", "_SUCCESS"]).
665671
If None, will try to read all files. (default)
666672
catalog_id : str, optional

0 commit comments

Comments (0)