Skip to content

Commit cc48750

Browse files
fix: read parquet file in chunked mode per row group
1 parent 7f52af9 commit cc48750

File tree

1 file changed

+23
-24
lines changed

1 file changed

+23
-24
lines changed

awswrangler/s3/_read_parquet.py

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -248,32 +248,31 @@ def _read_parquet_chunked(
248248
continue
249249

250250
use_threads_flag: bool = use_threads if isinstance(use_threads, bool) else bool(use_threads > 1)
251-
chunks = pq_file.iter_batches(
251+
for chunk in pq_file.iter_batches(
252252
batch_size=batch_size, columns=columns, use_threads=use_threads_flag, use_pandas_metadata=False
253-
)
254-
255-
schema = pq_file.schema.to_arrow_schema()
256-
if columns:
257-
schema = pa.schema([schema.field(column) for column in columns], schema.metadata)
258-
259-
table = _add_table_partitions(
260-
table=pa.Table.from_batches(chunks, schema=schema),
261-
path=path,
262-
path_root=path_root,
263-
)
264-
df = _table_to_df(table=table, kwargs=arrow_kwargs)
265-
if chunked is True:
266-
yield df
267-
else:
268-
if next_slice is not None:
269-
df = pd.concat(objs=[next_slice, df], sort=False, copy=False)
270-
while len(df.index) >= chunked:
271-
yield df.iloc[:chunked, :].copy()
272-
df = df.iloc[chunked:, :]
273-
if df.empty:
274-
next_slice = None
253+
):
254+
schema = pq_file.schema.to_arrow_schema()
255+
if columns:
256+
schema = pa.schema([schema.field(column) for column in columns], schema.metadata)
257+
258+
table = _add_table_partitions(
259+
table=pa.Table.from_batches([chunk], schema=schema),
260+
path=path,
261+
path_root=path_root,
262+
)
263+
df = _table_to_df(table=table, kwargs=arrow_kwargs)
264+
if chunked is True:
265+
yield df
275266
else:
276-
next_slice = df
267+
if next_slice is not None:
268+
df = pd.concat(objs=[next_slice, df], sort=False, copy=False)
269+
while len(df.index) >= chunked:
270+
yield df.iloc[:chunked, :].copy()
271+
df = df.iloc[chunked:, :]
272+
if df.empty:
273+
next_slice = None
274+
else:
275+
next_slice = df
277276
if next_slice is not None:
278277
yield next_slice
279278

0 commit comments

Comments
 (0)