//! This module provides `DataFileWriter`.

use arrow_array::RecordBatch;
- use itertools::Itertools;

- use crate::Result;
- use crate::spec::{DataContentType, DataFile, Struct};
- use crate::writer::file_writer::{FileWriter, FileWriterBuilder};
+ use crate::spec::{DataContentType, DataFile, PartitionKey};
+ use crate::writer::file_writer::FileWriterBuilder;
+ use crate::writer::file_writer::location_generator::{FileNameGenerator, LocationGenerator};
+ use crate::writer::file_writer::rolling_writer::{RollingFileWriter, RollingFileWriterBuilder};
use crate::writer::{CurrentFileStatus, IcebergWriter, IcebergWriterBuilder};
+ use crate::{Error, ErrorKind, Result};

/// Builder for `DataFileWriter`.
#[derive(Clone, Debug)]
- pub struct DataFileWriterBuilder<B: FileWriterBuilder> {
-     inner: B,
-     partition_value: Option<Struct>,
-     partition_spec_id: i32,
+ pub struct DataFileWriterBuilder<B: FileWriterBuilder, L: LocationGenerator, F: FileNameGenerator> {
+     inner: RollingFileWriterBuilder<B, L, F>,
+     partition_key: Option<PartitionKey>,
}

- impl<B: FileWriterBuilder> DataFileWriterBuilder<B> {
-     /// Create a new `DataFileWriterBuilder` using a `FileWriterBuilder`.
-     pub fn new(inner: B, partition_value: Option<Struct>, partition_spec_id: i32) -> Self {
+ impl<B, L, F> DataFileWriterBuilder<B, L, F>
+ where
+     B: FileWriterBuilder,
+     L: LocationGenerator,
+     F: FileNameGenerator,
+ {
+     /// Create a new `DataFileWriterBuilder` using a `RollingFileWriterBuilder`.
+     pub fn new(
+         inner_builder: RollingFileWriterBuilder<B, L, F>,
+         partition_key: Option<PartitionKey>,
+     ) -> Self {
        Self {
-             inner,
-             partition_value,
-             partition_spec_id,
+             inner: inner_builder,
+             partition_key,
        }
    }
}

#[async_trait::async_trait]
- impl<B: FileWriterBuilder> IcebergWriterBuilder for DataFileWriterBuilder<B> {
-     type R = DataFileWriter<B>;
+ impl<B, L, F> IcebergWriterBuilder for DataFileWriterBuilder<B, L, F>
+ where
+     B: FileWriterBuilder,
+     L: LocationGenerator,
+     F: FileNameGenerator,
+ {
+     type R = DataFileWriter<B, L, F>;

    async fn build(self) -> Result<Self::R> {
        Ok(DataFileWriter {
-             inner_writer: Some(self.inner.clone().build().await?),
-             partition_value: self.partition_value.unwrap_or(Struct::empty()),
-             partition_spec_id: self.partition_spec_id,
+             inner: Some(self.inner.clone().build()),
+             partition_key: self.partition_key,
        })
    }
}

/// A writer that writes data within one spec/partition.
#[derive(Debug)]
- pub struct DataFileWriter<B: FileWriterBuilder> {
-     inner_writer: Option<B::R>,
-     partition_value: Struct,
-     partition_spec_id: i32,
+ pub struct DataFileWriter<B: FileWriterBuilder, L: LocationGenerator, F: FileNameGenerator> {
+     inner: Option<RollingFileWriter<B, L, F>>,
+     partition_key: Option<PartitionKey>,
}

#[async_trait::async_trait]
- impl<B: FileWriterBuilder> IcebergWriter for DataFileWriter<B> {
+ impl<B, L, F> IcebergWriter for DataFileWriter<B, L, F>
+ where
+     B: FileWriterBuilder,
+     L: LocationGenerator,
+     F: FileNameGenerator,
+ {
    async fn write(&mut self, batch: RecordBatch) -> Result<()> {
-         self.inner_writer.as_mut().unwrap().write(&batch).await
+         if let Some(writer) = self.inner.as_mut() {
+             writer.write(&self.partition_key, &batch).await
+         } else {
+             Err(Error::new(
+                 ErrorKind::Unexpected,
+                 "Writer is not initialized!",
+             ))
+         }
    }

    async fn close(&mut self) -> Result<Vec<DataFile>> {
-         let writer = self.inner_writer.take().unwrap();
-         Ok(writer
-             .close()
-             .await?
-             .into_iter()
-             .map(|mut res| {
-                 res.content(DataContentType::Data);
-                 res.partition(self.partition_value.clone());
-                 res.partition_spec_id(self.partition_spec_id);
-                 res.build().expect("Guaranteed to be valid")
-             })
-             .collect_vec())
+         if let Some(writer) = self.inner.take() {
+             writer
+                 .close()
+                 .await?
+                 .into_iter()
+                 .map(|mut res| {
+                     res.content(DataContentType::Data);
+                     if let Some(pk) = self.partition_key.as_ref() {
+                         res.partition(pk.data().clone());
+                         res.partition_spec_id(pk.spec().spec_id());
+                     }
+                     res.build().map_err(|e| {
+                         Error::new(
+                             ErrorKind::DataInvalid,
+                             format!("Failed to build data file: {}", e),
+                         )
+                     })
+                 })
+                 .collect()
+         } else {
+             Err(Error::new(
+                 ErrorKind::Unexpected,
+                 "Data file writer has been closed.",
+             ))
+         }
    }
}

- impl<B: FileWriterBuilder> CurrentFileStatus for DataFileWriter<B> {
+ impl<B, L, F> CurrentFileStatus for DataFileWriter<B, L, F>
+ where
+     B: FileWriterBuilder,
+     L: LocationGenerator,
+     F: FileNameGenerator,
+ {
    fn current_file_path(&self) -> String {
-         self.inner_writer.as_ref().unwrap().current_file_path()
+         self.inner.as_ref().unwrap().current_file_path()
    }

    fn current_row_num(&self) -> usize {
-         self.inner_writer.as_ref().unwrap().current_row_num()
+         self.inner.as_ref().unwrap().current_row_num()
    }

    fn current_written_size(&self) -> usize {
-         self.inner_writer.as_ref().unwrap().current_written_size()
+         self.inner.as_ref().unwrap().current_written_size()
    }
}
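
Taken together, the changes above mean callers now hand the builder a `RollingFileWriterBuilder` plus an optional `PartitionKey`, instead of a bare `FileWriterBuilder`, a partition `Struct`, and a spec id. The following is a minimal construction sketch only, mirroring the test setup further below; the `file_io`, `location_gen`, `file_name_gen`, and `schema_ref` bindings are assumed to exist (built with the crate's `FileIOBuilder`, `DefaultLocationGenerator`, and `DefaultFileNameGenerator`), and the variable names are illustrative:

    // Sketch only: wiring the new builder shape. Bindings not shown here
    // (file_io, location_gen, file_name_gen, schema_ref) are assumed from context.
    let parquet_writer_builder =
        ParquetWriterBuilder::new(WriterProperties::builder().build(), schema_ref.clone());
    let rolling_writer_builder = RollingFileWriterBuilder::new_with_default_file_size(
        parquet_writer_builder,
        file_io.clone(),
        location_gen,
        file_name_gen,
    );

    // Unpartitioned write path: no PartitionKey.
    let mut unpartitioned_writer = DataFileWriterBuilder::new(rolling_writer_builder.clone(), None)
        .build()
        .await?;

    // Partitioned write path: the PartitionKey carries the partition data and its
    // spec, replacing the old (Struct, partition_spec_id) pair.
    let partition_key = PartitionKey::new(
        PartitionSpec::builder(schema_ref.clone()).build()?,
        schema_ref.clone(),
        Struct::from_iter([Some(Literal::int(1))]),
    );
    let mut partitioned_writer =
        DataFileWriterBuilder::new(rolling_writer_builder, Some(partition_key))
            .build()
            .await?;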

@@ -116,13 +156,15 @@ mod test {
    use crate::Result;
    use crate::io::FileIOBuilder;
    use crate::spec::{
-         DataContentType, DataFileFormat, Literal, NestedField, PrimitiveType, Schema, Struct, Type,
+         DataContentType, DataFileFormat, Literal, NestedField, PartitionKey, PartitionSpec,
+         PrimitiveType, Schema, Struct, Type,
    };
    use crate::writer::base_writer::data_file_writer::DataFileWriterBuilder;
    use crate::writer::file_writer::ParquetWriterBuilder;
    use crate::writer::file_writer::location_generator::{
        DefaultFileNameGenerator, DefaultLocationGenerator,
    };
+     use crate::writer::file_writer::rolling_writer::RollingFileWriterBuilder;
    use crate::writer::{IcebergWriter, IcebergWriterBuilder, RecordBatch};

    #[tokio::test]

@@ -143,16 +185,16 @@
        ])
        .build()?;

-         let pw = ParquetWriterBuilder::new(
-             WriterProperties::builder().build(),
-             Arc::new(schema),
-             None,
+         let pw = ParquetWriterBuilder::new(WriterProperties::builder().build(), Arc::new(schema));
+
+         let rolling_file_writer_builder = RollingFileWriterBuilder::new_with_default_file_size(
+             pw,
            file_io.clone(),
            location_gen,
            file_name_gen,
        );

-         let mut data_file_writer = DataFileWriterBuilder::new(pw, None, 0)
+         let mut data_file_writer = DataFileWriterBuilder::new(rolling_file_writer_builder, None)
            .build()
            .await
            .unwrap();

@@ -219,20 +261,27 @@
            NestedField::required(6, "name", Type::Primitive(PrimitiveType::String)).into(),
        ])
        .build()?;
+         let schema_ref = Arc::new(schema);

        let partition_value = Struct::from_iter([Some(Literal::int(1))]);
+         let partition_key = PartitionKey::new(
+             PartitionSpec::builder(schema_ref.clone()).build()?,
+             schema_ref.clone(),
+             partition_value.clone(),
+         );

-         let parquet_writer_builder = ParquetWriterBuilder::new(
-             WriterProperties::builder().build(),
-             Arc::new(schema.clone()),
-             None,
+         let parquet_writer_builder =
+             ParquetWriterBuilder::new(WriterProperties::builder().build(), schema_ref.clone());
+
+         let rolling_file_writer_builder = RollingFileWriterBuilder::new_with_default_file_size(
+             parquet_writer_builder,
            file_io.clone(),
            location_gen,
            file_name_gen,
        );

        let mut data_file_writer =
-             DataFileWriterBuilder::new(parquet_writer_builder, Some(partition_value.clone()), 0)
+             DataFileWriterBuilder::new(rolling_file_writer_builder, Some(partition_key))
                .build()
                .await?;
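
Read end to end, the rewritten writer is driven the same way as before. The snippet below is a usage sketch only, assuming `batch` is a `RecordBatch` built against the schema above:

    // Sketch only: `batch` is assumed to match the writer's schema.
    data_file_writer.write(batch.clone()).await?;

    // close() consumes the inner rolling writer and returns the finished DataFiles;
    // when a PartitionKey was supplied, its data and spec id are stamped onto each file.
    let data_files: Vec<DataFile> = data_file_writer.close().await?;

    // After close(), further write()/close() calls surface an Error
    // (ErrorKind::Unexpected) instead of panicking on a bare unwrap().
    assert!(data_file_writer.write(batch).await.is_err());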