fix: read parquet file in chunked mode per row group

FredericKayser · FredericKayser · commit cc487506f519 · 2024-11-11T18:16:43.000+01:00
diff --git a/awswrangler/s3/_read_parquet.py b/awswrangler/s3/_read_parquet.py
@@ -248,32 +248,31 @@ def _read_parquet_chunked(
                 continue
 
             use_threads_flag: bool = use_threads if isinstance(use_threads, bool) else bool(use_threads > 1)
-            chunks = pq_file.iter_batches(
+            for chunk in pq_file.iter_batches(
                 batch_size=batch_size, columns=columns, use_threads=use_threads_flag, use_pandas_metadata=False
-            )
-
-            schema = pq_file.schema.to_arrow_schema()
-            if columns:
-                schema = pa.schema([schema.field(column) for column in columns], schema.metadata)
-
-            table = _add_table_partitions(
-                table=pa.Table.from_batches(chunks, schema=schema),
-                path=path,
-                path_root=path_root,
-            )
-            df = _table_to_df(table=table, kwargs=arrow_kwargs)
-            if chunked is True:
-                yield df
-            else:
-                if next_slice is not None:
-                    df = pd.concat(objs=[next_slice, df], sort=False, copy=False)
-                while len(df.index) >= chunked:
-                    yield df.iloc[:chunked, :].copy()
-                    df = df.iloc[chunked:, :]
-                if df.empty:
-                    next_slice = None
+            ):
+                schema = pq_file.schema.to_arrow_schema()
+                if columns:
+                    schema = pa.schema([schema.field(column) for column in columns], schema.metadata)
+
+                table = _add_table_partitions(
+                    table=pa.Table.from_batches([chunk], schema=schema),
+                    path=path,
+                    path_root=path_root,
+                )
+                df = _table_to_df(table=table, kwargs=arrow_kwargs)
+                if chunked is True:
+                    yield df
                 else:
-                    next_slice = df
+                    if next_slice is not None:
+                        df = pd.concat(objs=[next_slice, df], sort=False, copy=False)
+                    while len(df.index) >= chunked:
+                        yield df.iloc[:chunked, :].copy()
+                        df = df.iloc[chunked:, :]
+                    if df.empty:
+                        next_slice = None
+                    else:
+                        next_slice = df
     if next_slice is not None:
         yield next_slice