Skip to content

Commit 25b922c

Browse files
committed
Move the RowGroup size calculation from the TUI into the reader logic, and work around a pyarrow bug in the uncompressed size computation
1 parent 6946b71 commit 25b922c

File tree

3 files changed

+113
-14
lines changed

3 files changed

+113
-14
lines changed

src/datanomy/reader/parquet.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,90 @@
77
from pyarrow.lib import ArrowInvalid
88

99

10+
class RowGroup:
    """
    Class to represent a Parquet row group.

    Thin wrapper around a row group metadata object (for example the value
    returned by ``parquet_file.metadata.row_group(index)``) that exposes the
    row/column counts and derived size information.
    """

    def __init__(self, row_group_metadata: Any) -> None:
        """
        Initialize the RowGroup.

        Parameters
        ----------
            row_group_metadata: Metadata for the row group
        """
        self._metadata = row_group_metadata

    @property
    def num_columns(self) -> int:
        """
        Get number of columns in the row group.

        Returns
        -------
            Number of columns
        """
        return int(self._metadata.num_columns)

    @property
    def num_rows(self) -> int:
        """
        Get number of rows in the row group.

        Returns
        -------
            Number of rows
        """
        return int(self._metadata.num_rows)

    def column(self, index: int) -> Any:
        """
        Get metadata for a specific column in the row group.

        Parameters
        ----------
            index: Column index

        Returns
        -------
            Column metadata
        """
        return self._metadata.column(index)

    @property
    def has_compression(self) -> bool:
        """
        Check if any column in the row group uses compression.

        A column counts as compressed when its ``compression`` attribute is
        anything other than ``"UNCOMPRESSED"``.

        Returns
        -------
            True if any column is compressed, False otherwise
        """
        return any(
            self.column(j).compression != "UNCOMPRESSED"
            for j in range(self.num_columns)
        )

    @property
    def total_sizes(self) -> tuple[int, int]:
        """
        Get total compressed and uncompressed size of the row group in bytes.

        Both totals are accumulated in a single pass so each column's
        metadata is looked up only once.

        Returns
        -------
            Total compressed and uncompressed size of the row group in bytes.
            Tuple of (compressed_size, uncompressed_size)
        """
        compressed_sum = 0
        # The uncompressed total should be self._metadata.total_byte_size, but
        # there's a bug on pyarrow 22.0.0, so the per-column values are summed
        # instead. See: https://github.com/apache/arrow/issues/48138
        uncompressed_sum = 0
        for j in range(self.num_columns):
            column_metadata = self.column(j)
            compressed_sum += column_metadata.total_compressed_size
            uncompressed_sum += column_metadata.total_uncompressed_size
        return compressed_sum, uncompressed_sum
92+
93+
1094
class ParquetReader:
1195
"""Main class to read and inspect Parquet files."""
1296

@@ -114,6 +198,20 @@ def get_row_group_info(self, index: int) -> Any:
114198
"""
115199
return self.parquet_file.metadata.row_group(index)
116200

201+
def get_row_group(self, index: int) -> RowGroup:
    """
    Build a RowGroup wrapper for the row group at the given position.

    Parameters
    ----------
        index: Row group index

    Returns
    -------
        RowGroup object wrapping the metadata of that row group
    """
    # Look up the raw row group metadata first, then wrap it.
    raw_metadata = self.parquet_file.metadata.row_group(index)
    return RowGroup(raw_metadata)
214+
117215
@property
118216
def metadata_size(self) -> int:
119217
"""

src/datanomy/tui/parquet.py

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -92,27 +92,16 @@ def _row_groups(self) -> list[Panel]:
9292
# Row groups
9393
row_group_panels: list[Panel] = []
9494
for i in range(self.reader.num_row_groups):
95-
rg = self.reader.get_row_group_info(i)
96-
97-
# Calculate compressed and uncompressed sizes
98-
compressed_sum = sum(
99-
rg.column(j).total_compressed_size for j in range(rg.num_columns)
100-
)
101-
uncompressed_sum = rg.total_byte_size # This is the uncompressed size
102-
103-
# Check if any column is compressed
104-
has_compression = any(
105-
rg.column(j).compression != "UNCOMPRESSED"
106-
for j in range(rg.num_columns)
107-
)
95+
rg = self.reader.get_row_group(i)
96+
compressed_sum, uncompressed_sum = rg.total_sizes
10897

10998
compressed_str = format_size(compressed_sum)
11099
uncompressed_str = format_size(uncompressed_sum)
111100

112101
# Summary info
113102
rg_summary = Text()
114103
rg_summary.append(f"Rows: {rg.num_rows:,}\n")
115-
if has_compression:
104+
if rg.has_compression:
116105
rg_summary.append(f"Compressed: {compressed_str}\n")
117106
rg_summary.append(f"Uncompressed: {uncompressed_str}\n")
118107
# Calculate compression ratio

tests/test_reader.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,15 @@ def test_reader_accepts_file_without_parquet_extension(
8686
reader = ParquetReader(parquet_without_extension)
8787
assert reader.num_rows == 3
8888
assert len(reader.schema_arrow) == 2
89+
90+
91+
def test_row_group_total_sizes(simple_parquet: Path) -> None:
    """Check has_compression and total_sizes of the first row group of the sample file."""
    first_group = ParquetReader(simple_parquet).get_row_group(0)

    assert first_group.has_compression

    # total_sizes is a (compressed_size, uncompressed_size) tuple in bytes.
    sizes = first_group.total_sizes
    assert sizes[0] == 474
    assert sizes[1] == 487

0 commit comments

Comments
 (0)