 )
 from pyiceberg.table.name_mapping import (
     NameMapping,
-    parse_mapping_from_json,
     update_mapping,
 )
 from pyiceberg.table.refs import MAIN_BRANCH, SnapshotRef
@@ -1215,7 +1214,8 @@ def scan(
         limit: Optional[int] = None,
     ) -> DataScan:
         return DataScan(
-            table=self,
+            table_metadata=self.metadata,
+            io=self.io,
             row_filter=row_filter,
             selected_fields=selected_fields,
             case_sensitive=case_sensitive,
@@ -1312,10 +1312,7 @@ def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive
 
     def name_mapping(self) -> Optional[NameMapping]:
         """Return the table's field-id NameMapping."""
-        if name_mapping_json := self.properties.get(TableProperties.DEFAULT_NAME_MAPPING):
-            return parse_mapping_from_json(name_mapping_json)
-        else:
-            return None
+        return self.metadata.name_mapping()
 
     def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
         """
@@ -1468,7 +1465,8 @@ def _parse_row_filter(expr: Union[str, BooleanExpression]) -> BooleanExpression:
 
 
 class TableScan(ABC):
-    table: Table
+    table_metadata: TableMetadata
+    io: FileIO
     row_filter: BooleanExpression
     selected_fields: Tuple[str, ...]
     case_sensitive: bool
@@ -1478,15 +1476,17 @@ class TableScan(ABC):
 
     def __init__(
         self,
-        table: Table,
+        table_metadata: TableMetadata,
+        io: FileIO,
         row_filter: Union[str, BooleanExpression] = ALWAYS_TRUE,
         selected_fields: Tuple[str, ...] = ("*",),
         case_sensitive: bool = True,
         snapshot_id: Optional[int] = None,
         options: Properties = EMPTY_DICT,
         limit: Optional[int] = None,
     ):
-        self.table = table
+        self.table_metadata = table_metadata
+        self.io = io
         self.row_filter = _parse_row_filter(row_filter)
         self.selected_fields = selected_fields
         self.case_sensitive = case_sensitive
@@ -1496,19 +1496,20 @@ def __init__(
 
     def snapshot(self) -> Optional[Snapshot]:
         if self.snapshot_id:
-            return self.table.snapshot_by_id(self.snapshot_id)
-        return self.table.current_snapshot()
+            return self.table_metadata.snapshot_by_id(self.snapshot_id)
+        return self.table_metadata.current_snapshot()
 
     def projection(self) -> Schema:
-        current_schema = self.table.schema()
+        current_schema = self.table_metadata.schema()
         if self.snapshot_id is not None:
-            snapshot = self.table.snapshot_by_id(self.snapshot_id)
+            snapshot = self.table_metadata.snapshot_by_id(self.snapshot_id)
             if snapshot is not None:
                 if snapshot.schema_id is not None:
-                    snapshot_schema = self.table.schemas().get(snapshot.schema_id)
-                    if snapshot_schema is not None:
-                        current_schema = snapshot_schema
-                    else:
+                    try:
+                        current_schema = next(
+                            schema for schema in self.table_metadata.schemas if schema.schema_id == snapshot.schema_id
+                        )
+                    except StopIteration:
                         warnings.warn(f"Metadata does not contain schema with id: {snapshot.schema_id}")
             else:
                 raise ValueError(f"Snapshot not found: {self.snapshot_id}")
@@ -1534,7 +1535,7 @@ def update(self: S, **overrides: Any) -> S:
     def use_ref(self: S, name: str) -> S:
         if self.snapshot_id:
             raise ValueError(f"Cannot override ref, already set snapshot id={self.snapshot_id}")
-        if snapshot := self.table.snapshot_by_name(name):
+        if snapshot := self.table_metadata.snapshot_by_name(name):
             return self.update(snapshot_id=snapshot.snapshot_id)
 
         raise ValueError(f"Cannot scan unknown ref={name}")
@@ -1626,33 +1627,21 @@ def _match_deletes_to_data_file(data_entry: ManifestEntry, positional_delete_ent
 
 
 class DataScan(TableScan):
-    def __init__(
-        self,
-        table: Table,
-        row_filter: Union[str, BooleanExpression] = ALWAYS_TRUE,
-        selected_fields: Tuple[str, ...] = ("*",),
-        case_sensitive: bool = True,
-        snapshot_id: Optional[int] = None,
-        options: Properties = EMPTY_DICT,
-        limit: Optional[int] = None,
-    ):
-        super().__init__(table, row_filter, selected_fields, case_sensitive, snapshot_id, options, limit)
-
     def _build_partition_projection(self, spec_id: int) -> BooleanExpression:
-        project = inclusive_projection(self.table.schema(), self.table.specs()[spec_id])
+        project = inclusive_projection(self.table_metadata.schema(), self.table_metadata.specs()[spec_id])
         return project(self.row_filter)
 
     @cached_property
     def partition_filters(self) -> KeyDefaultDict[int, BooleanExpression]:
         return KeyDefaultDict(self._build_partition_projection)
 
     def _build_manifest_evaluator(self, spec_id: int) -> Callable[[ManifestFile], bool]:
-        spec = self.table.specs()[spec_id]
-        return manifest_evaluator(spec, self.table.schema(), self.partition_filters[spec_id], self.case_sensitive)
+        spec = self.table_metadata.specs()[spec_id]
+        return manifest_evaluator(spec, self.table_metadata.schema(), self.partition_filters[spec_id], self.case_sensitive)
 
     def _build_partition_evaluator(self, spec_id: int) -> Callable[[DataFile], bool]:
-        spec = self.table.specs()[spec_id]
-        partition_type = spec.partition_type(self.table.schema())
+        spec = self.table_metadata.specs()[spec_id]
+        partition_type = spec.partition_type(self.table_metadata.schema())
         partition_schema = Schema(*partition_type.fields)
         partition_expr = self.partition_filters[spec_id]
 
@@ -1687,16 +1676,14 @@ def plan_files(self) -> Iterable[FileScanTask]:
         if not snapshot:
             return iter([])
 
-        io = self.table.io
-
         # step 1: filter manifests using partition summaries
         # the filter depends on the partition spec used to write the manifest file, so create a cache of filters for each spec id
 
         manifest_evaluators: Dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator)
 
         manifests = [
             manifest_file
-            for manifest_file in snapshot.manifests(io)
+            for manifest_file in snapshot.manifests(self.io)
             if manifest_evaluators[manifest_file.partition_spec_id](manifest_file)
         ]
 
@@ -1705,7 +1692,7 @@ def plan_files(self) -> Iterable[FileScanTask]:
 
         partition_evaluators: Dict[int, Callable[[DataFile], bool]] = KeyDefaultDict(self._build_partition_evaluator)
         metrics_evaluator = _InclusiveMetricsEvaluator(
-            self.table.schema(), self.row_filter, self.case_sensitive, self.options.get("include_empty_files") == "true"
+            self.table_metadata.schema(), self.row_filter, self.case_sensitive, self.options.get("include_empty_files") == "true"
         ).eval
 
         min_data_sequence_number = _min_data_file_sequence_number(manifests)
@@ -1719,7 +1706,7 @@ def plan_files(self) -> Iterable[FileScanTask]:
                 lambda args: _open_manifest(*args),
                 [
                     (
-                        io,
+                        self.io,
                         manifest,
                         partition_evaluators[manifest.partition_spec_id],
                         metrics_evaluator,
@@ -1755,7 +1742,8 @@ def to_arrow(self) -> pa.Table:
 
         return project_table(
             self.plan_files(),
-            self.table,
+            self.table_metadata,
+            self.io,
             self.row_filter,
             self.projection(),
             case_sensitive=self.case_sensitive,
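With this refactor, a scan is parameterized by a TableMetadata plus a FileIO rather than a live Table handle. A minimal usage sketch, assuming tbl is a Table already loaded from a catalog and that the column names are placeholders (only DataScan's new table_metadata/io keywords come from the diff):

from pyiceberg.expressions import EqualTo
from pyiceberg.table import DataScan

# Build the scan directly from the table's immutable metadata and its FileIO;
# the filter and projected columns below are illustrative placeholders.
scan = DataScan(
    table_metadata=tbl.metadata,
    io=tbl.io,
    row_filter=EqualTo("symbol", "AAPL"),
    selected_fields=("symbol", "bid"),
)
arrow_table = scan.to_arrow()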