@@ -2,7 +2,7 @@ use std::hash::Hash;
22use std:: sync:: Arc ;
33
44use polars_core:: frame:: DataFrame ;
5- use polars_core:: prelude:: Column ;
5+ use polars_core:: prelude:: { Column , DataType } ;
66use polars_error:: PolarsResult ;
77use polars_io:: hive:: HivePathFormatter ;
88use polars_io:: utils:: file:: Writeable ;
@@ -29,6 +29,7 @@ pub type FileProviderFunction = PlanCallback<FileProviderArgs, FileProviderRetur
2929#[ derive( Clone , Debug , Hash , PartialEq ) ]
3030pub enum FileProviderType {
3131 Hive ( HivePathProvider ) ,
32+ Iceberg ( IcebergPathProvider ) ,
3233 Function ( FileProviderFunction ) ,
3334}
3435
@@ -40,10 +41,28 @@ pub struct HivePathProvider {
4041}
4142
4243impl FileProviderType {
44+ /// Get a mutable reference to the file part prefix for this file provider.
45+ ///
46+ /// File part prefixes are inserted after the partition prefix, before the file part number.
47+ ///
48+ /// # Returns
49+ /// Returns `None` if this file provider does not support attaching file part prefixes.
50+ pub fn file_part_prefix_mut ( & mut self ) -> Option < & mut String > {
51+ use FileProviderType :: * ;
52+
53+ match self {
54+ Iceberg ( p) => Some ( p. file_part_prefix_mut ( ) ) ,
55+ Hive ( _) | Function ( _) => None ,
56+ }
57+ }
58+
4359 pub fn get_path_or_file ( & self , args : FileProviderArgs ) -> PolarsResult < FileProviderReturn > {
60+ use FileProviderType :: * ;
61+
4462 match self {
45- Self :: Hive ( v) => v. get_path ( args) . map ( FileProviderReturn :: Path ) ,
46- Self :: Function ( v) => v. get_path_or_file ( args) ,
63+ Hive ( p) => p. get_path ( args) . map ( FileProviderReturn :: Path ) ,
64+ Iceberg ( p) => p. get_path ( args) . map ( FileProviderReturn :: Path ) ,
65+ Function ( p) => p. get_path_or_file ( args) ,
4766 }
4867 }
4968}
@@ -59,22 +78,82 @@ impl HivePathProvider {
5978 partition_keys,
6079 } = args;
6180
62- let mut partition_parts = String :: new ( ) ;
81+ let mut path = String :: new ( ) ;
6382
6483 let partition_keys: & [ Column ] = partition_keys. columns ( ) ;
6584
66- write ! (
67- & mut partition_parts,
68- "{}" ,
69- HivePathFormatter :: new( partition_keys)
70- )
71- . unwrap ( ) ;
85+ write ! ( & mut path, "{}" , HivePathFormatter :: new( partition_keys) ) . unwrap ( ) ;
7286
7387 assert ! ( index_in_partition <= 0xffff_ffff ) ;
7488
75- write ! ( & mut partition_parts, "{index_in_partition:08x}.{extension}" ) . unwrap ( ) ;
89+ write ! ( & mut path, "{index_in_partition:08x}.{extension}" ) . unwrap ( ) ;
90+
91+ Ok ( path)
92+ }
93+ }
94+
95+ #[ cfg_attr( feature = "serde" , derive( serde:: Serialize , serde:: Deserialize ) ) ]
96+ #[ cfg_attr( feature = "dsl-schema" , derive( schemars:: JsonSchema ) ) ]
97+ #[ derive( Clone , Debug , Hash , PartialEq ) ]
98+ pub struct IcebergPathProvider {
99+ pub extension : PlSmallStr ,
100+ pub file_part_prefix : String ,
101+ }
102+
103+ impl IcebergPathProvider {
104+ pub fn file_part_prefix_mut ( & mut self ) -> & mut String {
105+ & mut self . file_part_prefix
106+ }
107+
108+ /// # Panics
109+ /// Panics if `self.file_part_prefix` is `None`.
110+ pub fn get_path ( & self , args : FileProviderArgs ) -> PolarsResult < String > {
111+ use std:: fmt:: Write ;
112+
113+ let IcebergPathProvider {
114+ extension,
115+ file_part_prefix,
116+ } = self ;
117+
118+ assert ! ( !file_part_prefix. is_empty( ) ) ;
119+
120+ let FileProviderArgs {
121+ index_in_partition,
122+ partition_keys,
123+ } = args;
124+
125+ let mut partition_keys_hash = None ;
126+
127+ if partition_keys. width ( ) != 0 {
128+ let mut hasher = blake3:: Hasher :: new ( ) ;
129+
130+ for column in partition_keys. columns ( ) {
131+ let column = column. cast ( & DataType :: String ) . unwrap ( ) ;
132+
133+ let value = column. str ( ) . unwrap ( ) . get ( 0 ) ;
134+
135+ hasher. update ( & [ value. is_some ( ) as u8 ] ) ;
136+ hasher. update ( value. unwrap_or_default ( ) . as_bytes ( ) ) ;
137+ }
138+
139+ partition_keys_hash = Some ( hasher. finalize ( ) . to_hex ( ) ) ;
140+ }
141+
142+ let partition_key_prefix: & str = partition_keys_hash. as_ref ( ) . map_or ( "" , |x| & x[ ..32 ] ) ;
143+
144+ let mut path = String :: with_capacity (
145+ partition_key_prefix. len ( ) + file_part_prefix. len ( ) + 8 + 1 + extension. len ( ) ,
146+ ) ;
147+
148+ assert ! ( index_in_partition <= 0xffff_ffff ) ;
149+
150+ write ! (
151+ & mut path,
152+ "{partition_key_prefix}{file_part_prefix}{index_in_partition:08x}.{extension}"
153+ )
154+ . unwrap ( ) ;
76155
77- Ok ( partition_parts )
156+ Ok ( path )
78157 }
79158}
80159
0 commit comments