Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion awswrangler/s3/_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ def _extract_partitions_dtypes_from_table_details(response: "GetTableResponseTyp
return dtypes


def _union(dfs: list[pd.DataFrame], ignore_index: bool) -> pd.DataFrame:
def _concat_union_categoricals(dfs: list[pd.DataFrame], ignore_index: bool) -> pd.DataFrame:
"""Concatenate dataframes with union of categorical columns."""
cats: tuple[set[str], ...] = tuple(set(df.select_dtypes(include="category").columns) for df in dfs)
for col in set.intersection(*cats):
cat = union_categoricals([df[col] for df in dfs])
Expand Down
3 changes: 2 additions & 1 deletion awswrangler/s3/_read_parquet.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,48 @@
"""Amazon S3 Read PARQUET Module (PRIVATE)."""

from __future__ import annotations

import datetime
import functools
import itertools
import logging
import warnings
from typing import (
TYPE_CHECKING,
Any,
Callable,
Iterator,
)

import boto3
import pandas as pd
import pyarrow as pa
import pyarrow.dataset
import pyarrow.parquet
from packaging import version
from typing_extensions import Literal

from awswrangler import _data_types, _utils, exceptions
from awswrangler._arrow import _add_table_partitions, _table_to_df
from awswrangler._config import apply_configs
from awswrangler._distributed import engine
from awswrangler._executor import _BaseExecutor, _get_executor
from awswrangler.distributed.ray import ray_get # noqa: F401
from awswrangler.s3._fs import open_s3_object
from awswrangler.s3._list import _path2list
from awswrangler.s3._read import (
_apply_partition_filter,
_check_version_id,
_concat_union_categoricals,
_extract_partitions_dtypes_from_table_details,
_get_num_output_blocks,
_get_path_ignore_suffix,
_get_path_root,
_get_paths_for_glue_table,
_InternalReadTableMetadataReturnValue,
_TableMetadataReader,
)
from awswrangler.typing import ArrowDecryptionConfiguration, RayReadParquetSettings, _ReadTableMetadataReturnValue

Check failure on line 45 in awswrangler/s3/_read_parquet.py

View workflow job for this annotation

GitHub Actions / Check (3.9)

Ruff (I001)

awswrangler/s3/_read_parquet.py:3:1: I001 Import block is un-sorted or un-formatted

if TYPE_CHECKING:
from mypy_boto3_s3 import S3Client
Expand Down Expand Up @@ -264,7 +265,7 @@
yield df
else:
if next_slice is not None:
df = pd.concat(objs=[next_slice, df], sort=False, copy=False)
df = _concat_union_categoricals(dfs=[next_slice, df], ignore_index=False)
while len(df.index) >= chunked:
yield df.iloc[:chunked, :].copy()
df = df.iloc[chunked:, :]
Expand Down
4 changes: 2 additions & 2 deletions awswrangler/s3/_read_text.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,31 @@
"""Amazon S3 Read Module (PRIVATE)."""

from __future__ import annotations

import datetime
import itertools
import logging
import pprint
from typing import TYPE_CHECKING, Any, Callable, Iterator

import boto3
import pandas as pd
from typing_extensions import Literal

from awswrangler import _utils, exceptions
from awswrangler._distributed import engine
from awswrangler._executor import _BaseExecutor, _get_executor
from awswrangler.s3._list import _path2list
from awswrangler.s3._read import (
_apply_partition_filter,
_check_version_id,
_concat_union_categoricals,
_get_num_output_blocks,
_get_path_ignore_suffix,
_get_path_root,
_union,
)
from awswrangler.s3._read_text_core import _read_text_file, _read_text_files_chunked
from awswrangler.typing import RaySettings

Check failure on line 28 in awswrangler/s3/_read_text.py

View workflow job for this annotation

GitHub Actions / Check (3.9)

Ruff (I001)

awswrangler/s3/_read_text.py:3:1: I001 Import block is un-sorted or un-formatted

if TYPE_CHECKING:
from mypy_boto3_s3 import S3Client
Expand Down Expand Up @@ -70,7 +70,7 @@
itertools.repeat(s3_additional_kwargs),
itertools.repeat(dataset),
)
return _union(dfs=tables, ignore_index=ignore_index)
return _concat_union_categoricals(dfs=tables, ignore_index=ignore_index)


def _read_text_format(
Expand Down
Loading