From 43161f961e3b62d811297f0d8d3e5e35220ef601 Mon Sep 17 00:00:00 2001 From: kukushking Date: Mon, 7 Apr 2025 16:10:38 +0100 Subject: [PATCH 1/2] fix: concat with union categories --- awswrangler/s3/_read.py | 3 ++- awswrangler/s3/_read_parquet.py | 3 ++- awswrangler/s3/_read_text.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/awswrangler/s3/_read.py b/awswrangler/s3/_read.py index 53bc6f045..8efb4559b 100644 --- a/awswrangler/s3/_read.py +++ b/awswrangler/s3/_read.py @@ -116,7 +116,8 @@ def _extract_partitions_dtypes_from_table_details(response: "GetTableResponseTyp return dtypes -def _union(dfs: list[pd.DataFrame], ignore_index: bool) -> pd.DataFrame: +def _concat_union_categoricals(dfs: list[pd.DataFrame], ignore_index: bool) -> pd.DataFrame: + """Concatenate dataframes with union of categorical columns.""" cats: tuple[set[str], ...] = tuple(set(df.select_dtypes(include="category").columns) for df in dfs) for col in set.intersection(*cats): cat = union_categoricals([df[col] for df in dfs]) diff --git a/awswrangler/s3/_read_parquet.py b/awswrangler/s3/_read_parquet.py index f9ee13376..d56967af1 100644 --- a/awswrangler/s3/_read_parquet.py +++ b/awswrangler/s3/_read_parquet.py @@ -38,6 +38,7 @@ _get_path_ignore_suffix, _get_path_root, _get_paths_for_glue_table, + _concat_union_categoricals, _InternalReadTableMetadataReturnValue, _TableMetadataReader, ) @@ -264,7 +265,7 @@ def _read_parquet_chunked( yield df else: if next_slice is not None: - df = pd.concat(objs=[next_slice, df], sort=False, copy=False) + df = _concat_union_categoricals(dfs=[next_slice, df], ignore_index=False) while len(df.index) >= chunked: yield df.iloc[:chunked, :].copy() df = df.iloc[chunked:, :] diff --git a/awswrangler/s3/_read_text.py b/awswrangler/s3/_read_text.py index 055a6b153..dff81e2e8 100644 --- a/awswrangler/s3/_read_text.py +++ b/awswrangler/s3/_read_text.py @@ -22,7 +22,7 @@ _get_num_output_blocks, _get_path_ignore_suffix, _get_path_root, - _union, + _concat_union_categoricals, ) from awswrangler.s3._read_text_core import _read_text_file, _read_text_files_chunked from awswrangler.typing import RaySettings @@ -70,7 +70,7 @@ def _read_text( itertools.repeat(s3_additional_kwargs), itertools.repeat(dataset), ) - return _union(dfs=tables, ignore_index=ignore_index) + return _concat_union_categoricals(dfs=tables, ignore_index=ignore_index) def _read_text_format( From 40d9ce477e94923af8a80e14839a722a248d3c92 Mon Sep 17 00:00:00 2001 From: kukushking Date: Mon, 7 Apr 2025 16:21:25 +0100 Subject: [PATCH 2/2] formatting --- awswrangler/s3/_read_parquet.py | 2 +- awswrangler/s3/_read_text.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/awswrangler/s3/_read_parquet.py b/awswrangler/s3/_read_parquet.py index d56967af1..3ffcb9149 100644 --- a/awswrangler/s3/_read_parquet.py +++ b/awswrangler/s3/_read_parquet.py @@ -33,12 +33,12 @@ from awswrangler.s3._read import ( _apply_partition_filter, _check_version_id, + _concat_union_categoricals, _extract_partitions_dtypes_from_table_details, _get_num_output_blocks, _get_path_ignore_suffix, _get_path_root, _get_paths_for_glue_table, - _concat_union_categoricals, _InternalReadTableMetadataReturnValue, _TableMetadataReader, ) diff --git a/awswrangler/s3/_read_text.py b/awswrangler/s3/_read_text.py index dff81e2e8..4d7f44891 100644 --- a/awswrangler/s3/_read_text.py +++ b/awswrangler/s3/_read_text.py @@ -19,10 +19,10 @@ from awswrangler.s3._read import ( _apply_partition_filter, _check_version_id, + _concat_union_categoricals, _get_num_output_blocks, _get_path_ignore_suffix, _get_path_root, - _concat_union_categoricals, ) from awswrangler.s3._read_text_core import _read_text_file, _read_text_files_chunked from awswrangler.typing import RaySettings