|
16 | 16 | # under the License. |
17 | 17 | from __future__ import annotations |
18 | 18 |
|
| 19 | +import itertools |
19 | 20 | from datetime import datetime, timezone |
20 | | -from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple |
| 21 | +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple, Union |
21 | 22 |
|
22 | 23 | from pyiceberg.conversions import from_bytes |
23 | | -from pyiceberg.manifest import DataFileContent, ManifestContent, ManifestFile, PartitionFieldSummary |
| 24 | +from pyiceberg.expressions import AlwaysTrue, BooleanExpression |
| 25 | +from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, ManifestFile, PartitionFieldSummary |
24 | 26 | from pyiceberg.partitioning import PartitionSpec |
25 | 27 | from pyiceberg.table.snapshots import Snapshot, ancestors_of |
26 | 28 | from pyiceberg.types import PrimitiveType |
|
32 | 34 |
|
33 | 35 | from pyiceberg.table import Table |
34 | 36 |
|
| 37 | +ALWAYS_TRUE = AlwaysTrue() |
| 38 | + |
35 | 39 |
|
36 | 40 | class InspectTable: |
37 | 41 | tbl: Table |
@@ -255,10 +259,16 @@ def refs(self) -> "pa.Table": |
255 | 259 |
|
256 | 260 | return pa.Table.from_pylist(ref_results, schema=ref_schema) |
257 | 261 |
|
258 | | - def partitions(self, snapshot_id: Optional[int] = None) -> "pa.Table": |
| 262 | + def partitions( |
| 263 | + self, |
| 264 | + snapshot_id: Optional[int] = None, |
| 265 | + row_filter: Union[str, BooleanExpression] = ALWAYS_TRUE, |
| 266 | + case_sensitive: bool = True, |
| 267 | + ) -> "pa.Table": |
259 | 268 | import pyarrow as pa |
260 | 269 |
|
261 | 270 | from pyiceberg.io.pyarrow import schema_to_pyarrow |
| 271 | + from pyiceberg.table import DataScan |
262 | 272 |
|
263 | 273 | table_schema = pa.schema( |
264 | 274 | [ |
@@ -289,85 +299,74 @@ def partitions(self, snapshot_id: Optional[int] = None) -> "pa.Table": |
289 | 299 | table_schema = pa.unify_schemas([partitions_schema, table_schema]) |
290 | 300 |
|
291 | 301 | snapshot = self._get_snapshot(snapshot_id) |
292 | | - executor = ExecutorFactory.get_or_create() |
293 | | - local_partitions_maps = executor.map(self._process_manifest, snapshot.manifests(self.tbl.io)) |
294 | | - |
295 | | - partitions_map: Dict[Tuple[str, Any], Any] = {} |
296 | | - for local_map in local_partitions_maps: |
297 | | - for partition_record_key, partition_row in local_map.items(): |
298 | | - if partition_record_key not in partitions_map: |
299 | | - partitions_map[partition_record_key] = partition_row |
300 | | - else: |
301 | | - existing = partitions_map[partition_record_key] |
302 | | - existing["record_count"] += partition_row["record_count"] |
303 | | - existing["file_count"] += partition_row["file_count"] |
304 | | - existing["total_data_file_size_in_bytes"] += partition_row["total_data_file_size_in_bytes"] |
305 | | - existing["position_delete_record_count"] += partition_row["position_delete_record_count"] |
306 | | - existing["position_delete_file_count"] += partition_row["position_delete_file_count"] |
307 | | - existing["equality_delete_record_count"] += partition_row["equality_delete_record_count"] |
308 | | - existing["equality_delete_file_count"] += partition_row["equality_delete_file_count"] |
309 | | - |
310 | | - if partition_row["last_updated_at"] and ( |
311 | | - not existing["last_updated_at"] or partition_row["last_updated_at"] > existing["last_updated_at"] |
312 | | - ): |
313 | | - existing["last_updated_at"] = partition_row["last_updated_at"] |
314 | | - existing["last_updated_snapshot_id"] = partition_row["last_updated_snapshot_id"] |
315 | 302 |
|
316 | | - return pa.Table.from_pylist( |
317 | | - partitions_map.values(), |
318 | | - schema=table_schema, |
| 303 | + scan = DataScan( |
| 304 | + table_metadata=self.tbl.metadata, |
| 305 | + io=self.tbl.io, |
| 306 | + row_filter=row_filter, |
| 307 | + case_sensitive=case_sensitive, |
| 308 | + snapshot_id=snapshot.snapshot_id, |
319 | 309 | ) |
320 | 310 |
|
321 | | - def _process_manifest(self, manifest: ManifestFile) -> Dict[Tuple[str, Any], Any]: |
322 | 311 | partitions_map: Dict[Tuple[str, Any], Any] = {} |
323 | | - for entry in manifest.fetch_manifest_entry(io=self.tbl.io): |
| 312 | + |
| 313 | + for entry in itertools.chain.from_iterable(scan.scan_plan_helper()): |
324 | 314 | partition = entry.data_file.partition |
325 | 315 | partition_record_dict = { |
326 | | - field.name: partition[pos] |
327 | | - for pos, field in enumerate(self.tbl.metadata.specs()[manifest.partition_spec_id].fields) |
| 316 | + field.name: partition[pos] for pos, field in enumerate(self.tbl.metadata.specs()[entry.data_file.spec_id].fields) |
328 | 317 | } |
329 | 318 | entry_snapshot = self.tbl.snapshot_by_id(entry.snapshot_id) if entry.snapshot_id is not None else None |
| 319 | + self._update_partitions_map_from_manifest_entry( |
| 320 | + partitions_map, entry.data_file, partition_record_dict, entry_snapshot |
| 321 | + ) |
330 | 322 |
|
331 | | - partition_record_key = _convert_to_hashable_type(partition_record_dict) |
332 | | - if partition_record_key not in partitions_map: |
333 | | - partitions_map[partition_record_key] = { |
334 | | - "partition": partition_record_dict, |
335 | | - "spec_id": entry.data_file.spec_id, |
336 | | - "record_count": 0, |
337 | | - "file_count": 0, |
338 | | - "total_data_file_size_in_bytes": 0, |
339 | | - "position_delete_record_count": 0, |
340 | | - "position_delete_file_count": 0, |
341 | | - "equality_delete_record_count": 0, |
342 | | - "equality_delete_file_count": 0, |
343 | | - "last_updated_at": entry_snapshot.timestamp_ms if entry_snapshot else None, |
344 | | - "last_updated_snapshot_id": entry_snapshot.snapshot_id if entry_snapshot else None, |
345 | | - } |
| 323 | + return pa.Table.from_pylist( |
| 324 | + partitions_map.values(), |
| 325 | + schema=table_schema, |
| 326 | + ) |
346 | 327 |
|
347 | | - partition_row = partitions_map[partition_record_key] |
348 | | - |
349 | | - if entry_snapshot is not None: |
350 | | - if ( |
351 | | - partition_row["last_updated_at"] is None |
352 | | - or partition_row["last_updated_snapshot_id"] < entry_snapshot.timestamp_ms |
353 | | - ): |
354 | | - partition_row["last_updated_at"] = entry_snapshot.timestamp_ms |
355 | | - partition_row["last_updated_snapshot_id"] = entry_snapshot.snapshot_id |
356 | | - |
357 | | - if entry.data_file.content == DataFileContent.DATA: |
358 | | - partition_row["record_count"] += entry.data_file.record_count |
359 | | - partition_row["file_count"] += 1 |
360 | | - partition_row["total_data_file_size_in_bytes"] += entry.data_file.file_size_in_bytes |
361 | | - elif entry.data_file.content == DataFileContent.POSITION_DELETES: |
362 | | - partition_row["position_delete_record_count"] += entry.data_file.record_count |
363 | | - partition_row["position_delete_file_count"] += 1 |
364 | | - elif entry.data_file.content == DataFileContent.EQUALITY_DELETES: |
365 | | - partition_row["equality_delete_record_count"] += entry.data_file.record_count |
366 | | - partition_row["equality_delete_file_count"] += 1 |
367 | | - else: |
368 | | - raise ValueError(f"Unknown DataFileContent ({entry.data_file.content})") |
| 328 | + def _update_partitions_map_from_manifest_entry( |
| 329 | + self, |
| 330 | + partitions_map: Dict[Tuple[str, Any], Any], |
| 331 | + file: DataFile, |
| 332 | + partition_record_dict: Dict[str, Any], |
| 333 | + snapshot: Optional[Snapshot], |
| 334 | + ) -> None: |
| 335 | + partition_record_key = _convert_to_hashable_type(partition_record_dict)  # key for this partition's aggregated row |
| 336 | + if partition_record_key not in partitions_map: |
| 337 | + partitions_map[partition_record_key] = { |
| 338 | + "partition": partition_record_dict, |
| 339 | + "spec_id": file.spec_id, |
| 340 | + "record_count": 0, |
| 341 | + "file_count": 0, |
| 342 | + "total_data_file_size_in_bytes": 0, |
| 343 | + "position_delete_record_count": 0, |
| 344 | + "position_delete_file_count": 0, |
| 345 | + "equality_delete_record_count": 0, |
| 346 | + "equality_delete_file_count": 0, |
| 347 | + "last_updated_at": snapshot.timestamp_ms if snapshot else None, |
| 348 | + "last_updated_snapshot_id": snapshot.snapshot_id if snapshot else None, |
| 349 | + } |
369 | 350 |
|
370 | | - return partitions_map |
| 351 | + partition_row = partitions_map[partition_record_key] |
| 352 | + |
| 353 | + if snapshot is not None: |
| 354 | + if partition_row["last_updated_at"] is None or partition_row["last_updated_at"] < snapshot.timestamp_ms:  # compare timestamp to timestamp, not snapshot id to timestamp |
| 355 | + partition_row["last_updated_at"] = snapshot.timestamp_ms |
| 356 | + partition_row["last_updated_snapshot_id"] = snapshot.snapshot_id |
| 357 | + |
| 358 | + if file.content == DataFileContent.DATA:  # add the file to the counters matching its content type |
| 359 | + partition_row["record_count"] += file.record_count |
| 360 | + partition_row["file_count"] += 1 |
| 361 | + partition_row["total_data_file_size_in_bytes"] += file.file_size_in_bytes |
| 362 | + elif file.content == DataFileContent.POSITION_DELETES: |
| 363 | + partition_row["position_delete_record_count"] += file.record_count |
| 364 | + partition_row["position_delete_file_count"] += 1 |
| 365 | + elif file.content == DataFileContent.EQUALITY_DELETES: |
| 366 | + partition_row["equality_delete_record_count"] += file.record_count |
| 367 | + partition_row["equality_delete_file_count"] += 1 |
| 368 | + else: |
| 369 | + raise ValueError(f"Unknown DataFileContent ({file.content})") |
371 | 370 |
|
372 | 371 | def _get_manifests_schema(self) -> "pa.Schema": |
373 | 372 | import pyarrow as pa |
|
0 commit comments