@@ -111,6 +111,7 @@
     DataFileContent,
     FileFormat,
 )
+from pyiceberg.partitioning import PartitionField, PartitionSpec, partition_record_value
 from pyiceberg.schema import (
     PartnerAccessor,
     PreOrderSchemaVisitor,
@@ -124,7 +125,7 @@
     visit,
     visit_with_partner,
 )
-from pyiceberg.table import AddFileTask, PropertyUtil, TableProperties, WriteTask
+from pyiceberg.table import PropertyUtil, TableProperties, WriteTask
 from pyiceberg.table.metadata import TableMetadata
 from pyiceberg.table.name_mapping import NameMapping
 from pyiceberg.transforms import TruncateTransform
@@ -1594,29 +1595,88 @@ def parquet_path_to_id_mapping(
     return result


-def fill_parquet_file_metadata(
-    data_file: DataFile,
+@dataclass(frozen=True)
+class DataFileStatistics:
+    record_count: int
+    column_sizes: Dict[int, int]
+    value_counts: Dict[int, int]
+    null_value_counts: Dict[int, int]
+    nan_value_counts: Dict[int, int]
+    column_aggregates: Dict[int, StatsAggregator]
+    split_offsets: List[int]
+
+    def _partition_value(self, partition_field: PartitionField, schema: Schema) -> Any:
+        if partition_field.source_id not in self.column_aggregates:
+            return None
+
+        if not partition_field.transform.preserves_order:
+            raise ValueError(
+                f"Cannot infer partition value from parquet metadata for a non-linear Partition Field: {partition_field.name} with transform {partition_field.transform}"
+            )
+
+        lower_value = partition_record_value(
+            partition_field=partition_field,
+            value=self.column_aggregates[partition_field.source_id].current_min,
+            schema=schema,
+        )
+        upper_value = partition_record_value(
+            partition_field=partition_field,
+            value=self.column_aggregates[partition_field.source_id].current_max,
+            schema=schema,
+        )
+        if lower_value != upper_value:
+            raise ValueError(
+                f"Cannot infer partition value from parquet metadata as there are more than one partition values for Partition Field: {partition_field.name}. {lower_value=}, {upper_value=}"
+            )
+        return lower_value
+
+    def partition(self, partition_spec: PartitionSpec, schema: Schema) -> Record:
+        return Record(**{field.name: self._partition_value(field, schema) for field in partition_spec.fields})
+
+    def to_serialized_dict(self) -> Dict[str, Any]:
+        lower_bounds = {}
+        upper_bounds = {}
+
+        for k, agg in self.column_aggregates.items():
+            _min = agg.min_as_bytes()
+            if _min is not None:
+                lower_bounds[k] = _min
+            _max = agg.max_as_bytes()
+            if _max is not None:
+                upper_bounds[k] = _max
+        return {
+            "record_count": self.record_count,
+            "column_sizes": self.column_sizes,
+            "value_counts": self.value_counts,
+            "null_value_counts": self.null_value_counts,
+            "nan_value_counts": self.nan_value_counts,
+            "lower_bounds": lower_bounds,
+            "upper_bounds": upper_bounds,
+            "split_offsets": self.split_offsets,
+        }
+
+
+def data_file_statistics_from_parquet_metadata(
     parquet_metadata: pq.FileMetaData,
     stats_columns: Dict[int, StatisticsCollector],
     parquet_column_mapping: Dict[str, int],
-) -> None:
+) -> DataFileStatistics:
     """
-    Compute and fill the following fields of the DataFile object.
+    Compute and return DataFileStatistics that includes the following.

-    - file_format
+    - record_count
     - column_sizes
     - value_counts
     - null_value_counts
     - nan_value_counts
-    - lower_bounds
-    - upper_bounds
+    - column_aggregates
     - split_offsets

     Args:
-        data_file (DataFile): A DataFile object representing the Parquet file for which metadata is to be filled.
         parquet_metadata (pyarrow.parquet.FileMetaData): A pyarrow metadata object.
         stats_columns (Dict[int, StatisticsCollector]): The statistics gathering plan. It is required to
             set the mode for column metrics collection
+        parquet_column_mapping (Dict[str, int]): The mapping of the parquet file name to the field ID
     """
     if parquet_metadata.num_columns != len(stats_columns):
         raise ValueError(
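
Note on the inference rule introduced above: _partition_value only accepts order-preserving transforms, and only when the file's column minimum and maximum project to the same partition value; a non-linear transform or a min/max mismatch raises ValueError. A minimal sketch of the transform check, using pyiceberg transform classes that are assumed here and are not part of this diff:

# Sketch only: illustrates the preserves_order gate used by _partition_value above.
from pyiceberg.transforms import BucketTransform, IdentityTransform

assert IdentityTransform().preserves_order                   # order-preserving: inference is allowed
assert not BucketTransform(num_buckets=16).preserves_order   # non-linear: _partition_value raises ValueError
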
@@ -1695,30 +1755,19 @@ def fill_parquet_file_metadata(
     split_offsets.sort()

-    lower_bounds = {}
-    upper_bounds = {}
-
-    for k, agg in col_aggs.items():
-        _min = agg.min_as_bytes()
-        if _min is not None:
-            lower_bounds[k] = _min
-        _max = agg.max_as_bytes()
-        if _max is not None:
-            upper_bounds[k] = _max
-
     for field_id in invalidate_col:
-        del lower_bounds[field_id]
-        del upper_bounds[field_id]
+        del col_aggs[field_id]
         del null_value_counts[field_id]

-    data_file.record_count = parquet_metadata.num_rows
-    data_file.column_sizes = column_sizes
-    data_file.value_counts = value_counts
-    data_file.null_value_counts = null_value_counts
-    data_file.nan_value_counts = nan_value_counts
-    data_file.lower_bounds = lower_bounds
-    data_file.upper_bounds = upper_bounds
-    data_file.split_offsets = split_offsets
+    return DataFileStatistics(
+        record_count=parquet_metadata.num_rows,
+        column_sizes=column_sizes,
+        value_counts=value_counts,
+        null_value_counts=null_value_counts,
+        nan_value_counts=nan_value_counts,
+        column_aggregates=col_aggs,
+        split_offsets=split_offsets,
+    )


 def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
@@ -1747,6 +1796,11 @@ def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
         with pq.ParquetWriter(fos, schema=arrow_file_schema, **parquet_writer_kwargs) as writer:
             writer.write_table(task.df, row_group_size=row_group_size)

+    statistics = data_file_statistics_from_parquet_metadata(
+        parquet_metadata=writer.writer.metadata,
+        stats_columns=compute_statistics_plan(schema, table_metadata.properties),
+        parquet_column_mapping=parquet_path_to_id_mapping(schema),
+    )
     data_file = DataFile(
         content=DataFileContent.DATA,
         file_path=file_path,
@@ -1761,47 +1815,41 @@ def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
         spec_id=table_metadata.default_spec_id,
         equality_ids=None,
         key_metadata=None,
+        **statistics.to_serialized_dict(),
     )

-    fill_parquet_file_metadata(
-        data_file=data_file,
-        parquet_metadata=writer.writer.metadata,
-        stats_columns=compute_statistics_plan(schema, table_metadata.properties),
-        parquet_column_mapping=parquet_path_to_id_mapping(schema),
-    )
     return iter([data_file])


-def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[AddFileTask]) -> Iterator[DataFile]:
-    for task in tasks:
-        input_file = io.new_input(task.file_path)
+def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_paths: Iterator[str]) -> Iterator[DataFile]:
+    for file_path in file_paths:
+        input_file = io.new_input(file_path)
         with input_file.open() as input_stream:
             parquet_metadata = pq.read_metadata(input_stream)

         if visit_pyarrow(parquet_metadata.schema.to_arrow_schema(), _HasIds()):
             raise NotImplementedError(
-                f"Cannot add file {task.file_path} because it has field IDs. `add_files` only supports addition of files without field_ids"
+                f"Cannot add file {file_path} because it has field IDs. `add_files` only supports addition of files without field_ids"
             )
-
         schema = table_metadata.schema()
+        statistics = data_file_statistics_from_parquet_metadata(
+            parquet_metadata=parquet_metadata,
+            stats_columns=compute_statistics_plan(schema, table_metadata.properties),
+            parquet_column_mapping=parquet_path_to_id_mapping(schema),
+        )
         data_file = DataFile(
             content=DataFileContent.DATA,
-            file_path=task.file_path,
+            file_path=file_path,
             file_format=FileFormat.PARQUET,
-            partition=task.partition_field_value,
-            record_count=parquet_metadata.num_rows,
+            partition=statistics.partition(table_metadata.spec(), table_metadata.schema()),
             file_size_in_bytes=len(input_file),
             sort_order_id=None,
             spec_id=table_metadata.default_spec_id,
             equality_ids=None,
             key_metadata=None,
+            **statistics.to_serialized_dict(),
         )
-        fill_parquet_file_metadata(
-            data_file=data_file,
-            parquet_metadata=parquet_metadata,
-            stats_columns=compute_statistics_plan(schema, table_metadata.properties),
-            parquet_column_mapping=parquet_path_to_id_mapping(schema),
-        )
+
         yield data_file
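
Beyond the two call sites updated above, the new helper can also be exercised directly. A minimal usage sketch, assuming `tbl` is a pyiceberg Table already loaded from a catalog and "/tmp/data.parquet" is a Parquet file whose columns match the table schema (both are assumptions, not part of this diff):

import pyarrow.parquet as pq

from pyiceberg.io.pyarrow import (
    compute_statistics_plan,
    data_file_statistics_from_parquet_metadata,
    parquet_path_to_id_mapping,
)

# Assumed inputs: a loaded Table `tbl` and a Parquet file path matching its schema.
table_metadata = tbl.metadata
schema = table_metadata.schema()

statistics = data_file_statistics_from_parquet_metadata(
    parquet_metadata=pq.read_metadata("/tmp/data.parquet"),
    stats_columns=compute_statistics_plan(schema, table_metadata.properties),
    parquet_column_mapping=parquet_path_to_id_mapping(schema),
)

# Counts and byte-encoded bounds, keyed by field ID, ready to splat into DataFile(...).
print(statistics.to_serialized_dict()["record_count"])
# Partition tuple inferred from column min/max (order-preserving transforms only).
print(statistics.partition(table_metadata.spec(), schema))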