Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion awswrangler/s3/_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ def _extract_partitions_dtypes_from_table_details(response: "GetTableResponseTyp
return dtypes


def _union(dfs: list[pd.DataFrame], ignore_index: bool) -> pd.DataFrame:
def _concat_union_categoricals(dfs: list[pd.DataFrame], ignore_index: bool) -> pd.DataFrame:
"""Concatenate dataframes with union of categorical columns."""
cats: tuple[set[str], ...] = tuple(set(df.select_dtypes(include="category").columns) for df in dfs)
for col in set.intersection(*cats):
cat = union_categoricals([df[col] for df in dfs])
Expand Down
3 changes: 2 additions & 1 deletion awswrangler/s3/_read_parquet.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,48 @@
"""Amazon S3 Read PARQUET Module (PRIVATE)."""

from __future__ import annotations

import datetime
import functools
import itertools
import logging
import warnings
from typing import (
TYPE_CHECKING,
Any,
Callable,
Iterator,
)

import boto3
import pandas as pd
import pyarrow as pa
import pyarrow.dataset
import pyarrow.parquet
from packaging import version
from typing_extensions import Literal

from awswrangler import _data_types, _utils, exceptions
from awswrangler._arrow import _add_table_partitions, _table_to_df
from awswrangler._config import apply_configs
from awswrangler._distributed import engine
from awswrangler._executor import _BaseExecutor, _get_executor
from awswrangler.distributed.ray import ray_get # noqa: F401
from awswrangler.s3._fs import open_s3_object
from awswrangler.s3._list import _path2list
from awswrangler.s3._read import (
_apply_partition_filter,
_check_version_id,
_concat_union_categoricals,
_extract_partitions_dtypes_from_table_details,
_get_num_output_blocks,
_get_path_ignore_suffix,
_get_path_root,
_get_paths_for_glue_table,
_InternalReadTableMetadataReturnValue,
_TableMetadataReader,
)
from awswrangler.typing import ArrowDecryptionConfiguration, RayReadParquetSettings, _ReadTableMetadataReturnValue

Check failure on line 45 in awswrangler/s3/_read_parquet.py

View workflow job for this annotation

GitHub Actions / Check (3.9)

Ruff (I001)

awswrangler/s3/_read_parquet.py:3:1: I001 Import block is un-sorted or un-formatted

if TYPE_CHECKING:
from mypy_boto3_s3 import S3Client
Expand Down Expand Up @@ -264,7 +265,7 @@
yield df
else:
if next_slice is not None:
df = pd.concat(objs=[next_slice, df], sort=False, copy=False)
df = _concat_union_categoricals(dfs=[next_slice, df], ignore_index=False)
while len(df.index) >= chunked:
yield df.iloc[:chunked, :].copy()
df = df.iloc[chunked:, :]
Expand Down
4 changes: 2 additions & 2 deletions awswrangler/s3/_read_text.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,31 @@
"""Amazon S3 Read Module (PRIVATE)."""

from __future__ import annotations

import datetime
import itertools
import logging
import pprint
from typing import TYPE_CHECKING, Any, Callable, Iterator

import boto3
import pandas as pd
from typing_extensions import Literal

from awswrangler import _utils, exceptions
from awswrangler._distributed import engine
from awswrangler._executor import _BaseExecutor, _get_executor
from awswrangler.s3._list import _path2list
from awswrangler.s3._read import (
_apply_partition_filter,
_check_version_id,
_concat_union_categoricals,
_get_num_output_blocks,
_get_path_ignore_suffix,
_get_path_root,
_union,
)
from awswrangler.s3._read_text_core import _read_text_file, _read_text_files_chunked
from awswrangler.typing import RaySettings

Check failure on line 28 in awswrangler/s3/_read_text.py

View workflow job for this annotation

GitHub Actions / Check (3.9)

Ruff (I001)

awswrangler/s3/_read_text.py:3:1: I001 Import block is un-sorted or un-formatted

if TYPE_CHECKING:
from mypy_boto3_s3 import S3Client
Expand Down Expand Up @@ -70,7 +70,7 @@
itertools.repeat(s3_additional_kwargs),
itertools.repeat(dataset),
)
return _union(dfs=tables, ignore_index=ignore_index)
return _concat_union_categoricals(dfs=tables, ignore_index=ignore_index)


def _read_text_format(
Expand Down
Loading