@@ -2,6 +2,7 @@
 
 import concurrent.futures
 import datetime
+import functools
 import itertools
 import json
 import logging
@@ -339,8 +340,8 @@ def _read_parquet_chunked(
             if next_slice is not None:
                 df = _union(dfs=[next_slice, df], ignore_index=ignore_index)
             while len(df.index) >= chunked:
-                yield df.iloc[:chunked]
-                df = df.iloc[chunked:]
+                yield df.iloc[:chunked, :].copy()
+                df = df.iloc[chunked:, :]
             if df.empty:
                 next_slice = None
             else:
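For orientation, here is a minimal standalone sketch of the buffering pattern this hunk touches (a hypothetical rechunk helper, not code from the repository). The .copy() matters because df.iloc[:chunked] returns a slice that can share memory with the parent frame: copying detaches each yielded chunk, so consumers can mutate it without SettingWithCopyWarning and without keeping already-consumed rows alive.

import pandas as pd

def rechunk(frames, chunked):
    """Buffer incoming frames and re-slice them into chunks of `chunked` rows."""
    leftover = None
    for df in frames:
        if leftover is not None:
            df = pd.concat([leftover, df], ignore_index=True)
        while len(df.index) >= chunked:
            # .copy() detaches the yielded chunk from the parent frame
            yield df.iloc[:chunked, :].copy()
            df = df.iloc[chunked:, :]
        leftover = None if df.empty else df
    if leftover is not None:
        yield leftover

frames = (pd.DataFrame({"a": range(i, i + 5)}) for i in range(0, 15, 5))
print([len(c.index) for c in rechunk(frames, chunked=4)])  # [4, 4, 4, 3]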
@@ -773,26 +774,32 @@ def read_parquet_table(
         path: str = res["Table"]["StorageDescriptor"]["Location"]
     except KeyError as ex:
         raise exceptions.InvalidTable(f"Missing s3 location for {database}.{table}.") from ex
-    return _data_types.cast_pandas_with_athena_types(
-        df=read_parquet(
-            path=path,
-            path_suffix=filename_suffix,
-            path_ignore_suffix=filename_ignore_suffix,
-            partition_filter=partition_filter,
-            columns=columns,
-            validate_schema=validate_schema,
-            categories=categories,
-            safe=safe,
-            map_types=map_types,
-            chunked=chunked,
-            dataset=True,
-            use_threads=use_threads,
-            boto3_session=boto3_session,
-            s3_additional_kwargs=s3_additional_kwargs,
-        ),
-        dtype=_extract_partitions_dtypes_from_table_details(response=res),
+    df = read_parquet(
+        path=path,
+        path_suffix=filename_suffix,
+        path_ignore_suffix=filename_ignore_suffix,
+        partition_filter=partition_filter,
+        columns=columns,
+        validate_schema=validate_schema,
+        categories=categories,
+        safe=safe,
+        map_types=map_types,
+        chunked=chunked,
+        dataset=True,
+        use_threads=use_threads,
+        boto3_session=boto3_session,
+        s3_additional_kwargs=s3_additional_kwargs,
+    )
+    partial_cast_function = functools.partial(
+        _data_types.cast_pandas_with_athena_types, dtype=_extract_partitions_dtypes_from_table_details(response=res)
     )
 
+    if isinstance(df, pd.DataFrame):
+        return partial_cast_function(df)
+
+    # df is a generator, so map is needed for casting dtypes
+    return map(partial_cast_function, df)
+
 
 @apply_configs
 def read_parquet_metadata(
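For context, a small standalone sketch (hypothetical names, not repository code) of the functools.partial + map pattern used in the read_parquet_table hunk above. Because map is lazy, the dtype cast is applied to each chunk only as the caller iterates, so the streaming behaviour of chunked reads is preserved while the non-chunked path still returns a plain DataFrame.

import functools
import pandas as pd

def cast(df: pd.DataFrame, dtype: dict) -> pd.DataFrame:
    return df.astype(dtype)

partial_cast = functools.partial(cast, dtype={"a": "int64"})
chunks = (pd.DataFrame({"a": ["1", "2"]}) for _ in range(3))  # stand-in generator

for chunk in map(partial_cast, chunks):  # lazy: one cast per iteration
    print(chunk["a"].dtype)  # int64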