|
7 | 7 | from pyarrow.lib import ArrowInvalid |
8 | 8 |
|
9 | 9 |
|
class RowGroup:
    """Read-only view over the metadata of a single Parquet row group."""

    def __init__(self, row_group_metadata: Any) -> None:
        """
        Initialize the RowGroup.

        Parameters
        ----------
        row_group_metadata: Metadata for the row group. The object must
            expose ``num_columns``, ``num_rows`` and ``column(index)``.
        """
        self._metadata = row_group_metadata

    @property
    def num_columns(self) -> int:
        """
        Get number of columns in the row group.

        Returns
        -------
        Number of columns
        """
        return int(self._metadata.num_columns)

    @property
    def num_rows(self) -> int:
        """
        Get number of rows in the row group.

        Returns
        -------
        Number of rows
        """
        return int(self._metadata.num_rows)

    def column(self, index: int) -> Any:
        """
        Get metadata for a specific column in the row group.

        Parameters
        ----------
        index: Column index

        Returns
        -------
        Column metadata, exactly as returned by the wrapped metadata object
        """
        return self._metadata.column(index)

    @property
    def has_compression(self) -> bool:
        """
        Check if any column in the row group uses compression.

        A column counts as compressed when its ``compression`` attribute is
        anything other than the string ``"UNCOMPRESSED"``.

        Returns
        -------
        True if any column is compressed, False otherwise (including when the
        row group has no columns)
        """
        # any() stops at the first compressed column, so the remaining
        # column metadata objects are never fetched.
        return any(
            self.column(j).compression != "UNCOMPRESSED"
            for j in range(self.num_columns)
        )

    @property
    def total_sizes(self) -> tuple[int, int]:
        """
        Get total compressed and uncompressed size of the row group in bytes.

        Returns
        -------
        Tuple of (compressed_size, uncompressed_size), each being the sum of
        the per-column values; (0, 0) when the row group has no columns
        """
        compressed_sum = 0
        # This should be self._metadata.total_byte_size but there's a bug on
        # pyarrow 22.0.0, see: https://github.com/apache/arrow/issues/48138
        uncompressed_sum = 0
        for j in range(self.num_columns):
            # Fetch each column's metadata once and read both sizes from it.
            column_metadata = self.column(j)
            compressed_sum += column_metadata.total_compressed_size
            uncompressed_sum += column_metadata.total_uncompressed_size
        return compressed_sum, uncompressed_sum
| 92 | + |
| 93 | + |
10 | 94 | class ParquetReader: |
11 | 95 | """Main class to read and inspect Parquet files.""" |
12 | 96 |
|
@@ -114,6 +198,20 @@ def get_row_group_info(self, index: int) -> Any: |
114 | 198 | """ |
115 | 199 | return self.parquet_file.metadata.row_group(index) |
116 | 200 |
|
def get_row_group(self, index: int) -> RowGroup:
    """
    Wrap the metadata of one row group in a RowGroup object.

    Parameters
    ----------
    index: Row group index

    Returns
    -------
    RowGroup built from ``self.parquet_file.metadata.row_group(index)``
    """
    row_group_metadata = self.parquet_file.metadata.row_group(index)
    return RowGroup(row_group_metadata)
| 214 | + |
117 | 215 | @property |
118 | 216 | def metadata_size(self) -> int: |
119 | 217 | """ |
|
0 commit comments