19
19
20
20
use std:: {
21
21
fs:: { remove_file, File } ,
22
- io:: { self , BufReader , Read , Seek , SeekFrom } ,
22
+ io:: BufReader ,
23
23
path:: PathBuf ,
24
24
sync:: Arc ,
25
- vec:: IntoIter ,
26
25
} ;
27
26
28
27
use arrow_array:: { RecordBatch , TimestampMillisecondArray } ;
29
- use arrow_ipc:: { reader:: StreamReader , root_as_message_unchecked , MessageHeader } ;
28
+ use arrow_ipc:: reader:: FileReader ;
30
29
use arrow_schema:: Schema ;
31
- use byteorder:: { LittleEndian , ReadBytesExt } ;
32
30
use itertools:: kmerge_by;
33
31
use tracing:: error;
34
32
@@ -39,7 +37,7 @@ use crate::{
39
37
40
38
#[ derive( Debug ) ]
41
39
pub struct MergedRecordReader {
42
- pub readers : Vec < StreamReader < BufReader < File > > > ,
40
+ pub readers : Vec < FileReader < BufReader < File > > > ,
43
41
}
44
42
45
43
impl MergedRecordReader {
@@ -53,7 +51,7 @@ impl MergedRecordReader {
53
51
remove_file ( file) . unwrap ( ) ;
54
52
} else {
55
53
let Ok ( reader) =
56
- StreamReader :: try_new ( BufReader :: new ( File :: open ( file) . unwrap ( ) ) , None )
54
+ FileReader :: try_new ( BufReader :: new ( File :: open ( file) . unwrap ( ) ) , None )
57
55
else {
58
56
error ! ( "Invalid file detected, ignoring it: {:?}" , file) ;
59
57
continue ;
@@ -74,27 +72,6 @@ impl MergedRecordReader {
74
72
)
75
73
. unwrap ( )
76
74
}
77
- }
78
-
79
- #[ derive( Debug ) ]
80
- pub struct MergedReverseRecordReader {
81
- pub readers : Vec < StreamReader < BufReader < OffsetReader < File > > > > ,
82
- }
83
-
84
- impl MergedReverseRecordReader {
85
- pub fn try_new ( files : & [ PathBuf ] ) -> Self {
86
- let mut readers = Vec :: with_capacity ( files. len ( ) ) ;
87
- for file in files {
88
- let Ok ( reader) = get_reverse_reader ( File :: open ( file) . unwrap ( ) ) else {
89
- error ! ( "Invalid file detected, ignoring it: {:?}" , file) ;
90
- continue ;
91
- } ;
92
-
93
- readers. push ( reader) ;
94
- }
95
-
96
- Self { readers }
97
- }
98
75
99
76
pub fn merged_iter (
100
77
self ,
@@ -111,15 +88,6 @@ impl MergedReverseRecordReader {
111
88
. map ( |batch| reverse ( & batch) )
112
89
. map ( move |batch| adapt_batch ( & schema, & batch) )
113
90
}
114
-
115
- pub fn merged_schema ( & self ) -> Schema {
116
- Schema :: try_merge (
117
- self . readers
118
- . iter ( )
119
- . map ( |reader| reader. schema ( ) . as_ref ( ) . clone ( ) ) ,
120
- )
121
- . unwrap ( )
122
- }
123
91
}
124
92
125
93
fn get_timestamp_millis ( batch : & RecordBatch , time_partition : Option < String > ) -> i64 {
@@ -138,6 +106,7 @@ fn get_timestamp_millis(batch: &RecordBatch, time_partition: Option<String>) ->
138
106
None => get_default_timestamp_millis ( batch) ,
139
107
}
140
108
}
109
+
141
110
fn get_default_timestamp_millis ( batch : & RecordBatch ) -> i64 {
142
111
match batch
143
112
. column ( 0 )
@@ -157,172 +126,18 @@ fn get_default_timestamp_millis(batch: &RecordBatch) -> i64 {
157
126
}
158
127
}
159
128
160
- /// OffsetReader takes in a reader and list of offset and sizes and
161
- /// provides a reader over the file by reading only the offsets
162
- /// from start of the list to end.
163
- ///
164
- /// Safety Invariant: Reader is already validated and all offset and limit are valid to read.
165
- ///
166
- /// On empty list the reader returns no bytes read.
167
- pub struct OffsetReader < R : Read + Seek > {
168
- reader : R ,
169
- offset_list : IntoIter < ( u64 , usize ) > ,
170
- current_offset : u64 ,
171
- current_size : usize ,
172
- buffer : Vec < u8 > ,
173
- buffer_position : usize ,
174
- finished : bool ,
175
- }
176
-
177
- impl < R : Read + Seek > OffsetReader < R > {
178
- fn new ( reader : R , offset_list : Vec < ( u64 , usize ) > ) -> Self {
179
- let mut offset_list = offset_list. into_iter ( ) ;
180
- let mut finished = false ;
181
-
182
- let ( current_offset, current_size) = offset_list. next ( ) . unwrap_or_default ( ) ;
183
- if current_offset == 0 && current_size == 0 {
184
- finished = true
185
- }
186
-
187
- OffsetReader {
188
- reader,
189
- offset_list,
190
- current_offset,
191
- current_size,
192
- buffer : vec ! [ 0 ; 4096 ] ,
193
- buffer_position : 0 ,
194
- finished,
195
- }
196
- }
197
- }
198
-
199
- impl < R : Read + Seek > Read for OffsetReader < R > {
200
- fn read ( & mut self , buf : & mut [ u8 ] ) -> io:: Result < usize > {
201
- let offset = self . current_offset ;
202
- let size = self . current_size ;
203
-
204
- if self . finished {
205
- return Ok ( 0 ) ;
206
- }
207
- // on empty buffer load current data represented by
208
- // current_offset and current_size into self buffer
209
- if self . buffer_position == 0 {
210
- self . reader . seek ( SeekFrom :: Start ( offset) ) ?;
211
- // resize for current message
212
- if self . buffer . len ( ) < size {
213
- self . buffer . resize ( size, 0 )
214
- }
215
- self . reader . read_exact ( & mut self . buffer [ 0 ..size] ) ?;
216
- }
217
-
218
- let remaining_bytes = size - self . buffer_position ;
219
- let max_read = usize:: min ( remaining_bytes, buf. len ( ) ) ;
220
-
221
- // Copy data from the buffer to the provided buffer
222
- let read_data = & self . buffer [ self . buffer_position ..self . buffer_position + max_read] ;
223
- buf[ ..max_read] . copy_from_slice ( read_data) ;
224
-
225
- self . buffer_position += max_read;
226
-
227
- if self . buffer_position >= size {
228
- // If we've read the entire section, move to the next offset
229
- match self . offset_list . next ( ) {
230
- Some ( ( offset, size) ) => {
231
- self . current_offset = offset;
232
- self . current_size = size;
233
- self . buffer_position = 0 ;
234
- }
235
- None => {
236
- // iter is exhausted, no more read can be done
237
- self . finished = true
238
- }
239
- }
240
- }
241
-
242
- Ok ( max_read)
243
- }
244
- }
245
-
246
- pub fn get_reverse_reader < T : Read + Seek > (
247
- mut reader : T ,
248
- ) -> Result < StreamReader < BufReader < OffsetReader < T > > > , io:: Error > {
249
- let mut offset = 0 ;
250
- let mut messages = Vec :: new ( ) ;
251
-
252
- while let Some ( res) = find_limit_and_type ( & mut reader) . transpose ( ) {
253
- match res {
254
- Ok ( ( header, size) ) => {
255
- messages. push ( ( header, offset, size) ) ;
256
- offset += size;
257
- }
258
- Err ( err) if err. kind ( ) == io:: ErrorKind :: UnexpectedEof => break ,
259
- Err ( err) => return Err ( err) ,
260
- }
261
- }
262
-
263
- // reverse everything leaving the first because it has schema message.
264
- messages[ 1 ..] . reverse ( ) ;
265
- let messages = messages
266
- . into_iter ( )
267
- . map ( |( _, offset, size) | ( offset as u64 , size) )
268
- . collect ( ) ;
269
-
270
- // reset reader
271
- reader. rewind ( ) ?;
272
-
273
- Ok ( StreamReader :: try_new ( BufReader :: new ( OffsetReader :: new ( reader, messages) ) , None ) . unwrap ( ) )
274
- }
275
-
276
- // return limit for
277
- fn find_limit_and_type (
278
- reader : & mut ( impl Read + Seek ) ,
279
- ) -> Result < Option < ( MessageHeader , usize ) > , io:: Error > {
280
- let mut size = 0 ;
281
- let marker = reader. read_u32 :: < LittleEndian > ( ) ?;
282
- size += 4 ;
283
-
284
- if marker != 0xFFFFFFFF {
285
- return Err ( io:: Error :: new (
286
- io:: ErrorKind :: InvalidData ,
287
- "Invalid Continuation Marker" ,
288
- ) ) ;
289
- }
290
-
291
- let metadata_size = reader. read_u32 :: < LittleEndian > ( ) ? as usize ;
292
- size += 4 ;
293
-
294
- if metadata_size == 0x00000000 {
295
- return Ok ( None ) ;
296
- }
297
-
298
- let mut message = vec ! [ 0u8 ; metadata_size] ;
299
- reader. read_exact ( & mut message) ?;
300
- size += metadata_size;
301
-
302
- let message = unsafe { root_as_message_unchecked ( & message) } ;
303
- let header = message. header_type ( ) ;
304
- let message_size = message. bodyLength ( ) ;
305
- size += message_size as usize ;
306
-
307
- let padding = ( 8 - ( size % 8 ) ) % 8 ;
308
- reader. seek ( SeekFrom :: Current ( padding as i64 + message_size) ) ?;
309
- size += padding;
310
-
311
- Ok ( Some ( ( header, size) ) )
312
- }
313
-
314
129
#[ cfg( test) ]
315
130
mod tests {
316
- use std:: { io :: Cursor , sync:: Arc } ;
131
+ use std:: { fs :: File , path :: Path , sync:: Arc } ;
317
132
318
133
use arrow_array:: {
319
134
cast:: AsArray , types:: Int64Type , Array , Float64Array , Int64Array , RecordBatch , StringArray ,
320
135
} ;
321
- use arrow_ipc:: writer:: {
322
- write_message, DictionaryTracker , IpcDataGenerator , IpcWriteOptions , StreamWriter ,
136
+ use arrow_ipc:: {
137
+ reader:: FileReader ,
138
+ writer:: { write_message, DictionaryTracker , FileWriter , IpcDataGenerator , IpcWriteOptions } ,
323
139
} ;
324
-
325
- use super :: get_reverse_reader;
140
+ use temp_dir:: TempDir ;
326
141
327
142
fn rb ( rows : usize ) -> RecordBatch {
328
143
let array1: Arc < dyn Array > = Arc :: new ( Int64Array :: from_iter ( 0 ..( rows as i64 ) ) ) ;
@@ -339,42 +154,48 @@ mod tests {
339
154
. unwrap ( )
340
155
}
341
156
342
- fn write_mem ( rbs : & [ RecordBatch ] ) -> Vec < u8 > {
343
- let buf = Vec :: new ( ) ;
344
- let mut writer = StreamWriter :: try_new ( buf , & rbs[ 0 ] . schema ( ) ) . unwrap ( ) ;
157
+ fn write_file ( rbs : & [ RecordBatch ] , path : & Path ) {
158
+ let file = File :: create ( path ) . unwrap ( ) ;
159
+ let mut writer = FileWriter :: try_new_buffered ( file , & rbs[ 0 ] . schema ( ) ) . unwrap ( ) ;
345
160
346
161
for rb in rbs {
347
162
writer. write ( rb) . unwrap ( )
348
163
}
349
164
350
- writer. into_inner ( ) . unwrap ( )
165
+ writer. finish ( ) . unwrap ( ) ;
351
166
}
352
167
353
168
#[ test]
354
169
fn test_empty_row ( ) {
170
+ let temp_dir = TempDir :: new ( ) . unwrap ( ) ;
171
+ let path = temp_dir. path ( ) . join ( "test.arrows" ) ;
355
172
let rb = rb ( 0 ) ;
356
- let buf = write_mem ( & [ rb] ) ;
357
- let reader = Cursor :: new ( buf ) ;
358
- let mut reader = get_reverse_reader ( reader) . unwrap ( ) ;
173
+ write_file ( & [ rb] , & path ) ;
174
+ let reader = File :: open ( path ) . unwrap ( ) ;
175
+ let mut reader = FileReader :: try_new_buffered ( reader, None ) . unwrap ( ) ;
359
176
let rb = reader. next ( ) . unwrap ( ) . unwrap ( ) ;
360
177
assert_eq ! ( rb. num_rows( ) , 0 ) ;
361
178
}
362
179
363
180
#[ test]
364
181
fn test_one_row ( ) {
182
+ let temp_dir = TempDir :: new ( ) . unwrap ( ) ;
183
+ let path = temp_dir. path ( ) . join ( "test.arrows" ) ;
365
184
let rb = rb ( 1 ) ;
366
- let buf = write_mem ( & [ rb] ) ;
367
- let reader = Cursor :: new ( buf ) ;
368
- let mut reader = get_reverse_reader ( reader) . unwrap ( ) ;
185
+ write_file ( & [ rb] , & path ) ;
186
+ let reader = File :: open ( path ) . unwrap ( ) ;
187
+ let mut reader = FileReader :: try_new_buffered ( reader, None ) . unwrap ( ) ;
369
188
let rb = reader. next ( ) . unwrap ( ) . unwrap ( ) ;
370
189
assert_eq ! ( rb. num_rows( ) , 1 ) ;
371
190
}
372
191
373
192
#[ test]
374
193
fn test_multiple_row_multiple_rbs ( ) {
375
- let buf = write_mem ( & [ rb ( 1 ) , rb ( 2 ) , rb ( 3 ) ] ) ;
376
- let reader = Cursor :: new ( buf) ;
377
- let mut reader = get_reverse_reader ( reader) . unwrap ( ) ;
194
+ let temp_dir = TempDir :: new ( ) . unwrap ( ) ;
195
+ let path = temp_dir. path ( ) . join ( "test.arrows" ) ;
196
+ write_file ( & [ rb ( 1 ) , rb ( 2 ) , rb ( 3 ) ] , & path) ;
197
+ let reader = File :: open ( path) . unwrap ( ) ;
198
+ let mut reader = FileReader :: try_new_buffered ( reader, None ) . unwrap ( ) ;
378
199
let rb = reader. next ( ) . unwrap ( ) . unwrap ( ) ;
379
200
assert_eq ! ( rb. num_rows( ) , 3 ) ;
380
201
let col1_val: Vec < i64 > = rb
@@ -394,40 +215,42 @@ mod tests {
394
215
395
216
#[ test]
396
217
fn manual_write ( ) {
218
+ let temp_dir = TempDir :: new ( ) . unwrap ( ) ;
219
+ let path = temp_dir. path ( ) . join ( "test.arrows" ) ;
397
220
let error_on_replacement = true ;
398
221
let options = IpcWriteOptions :: default ( ) ;
399
222
let mut dictionary_tracker = DictionaryTracker :: new ( error_on_replacement) ;
400
223
let data_gen = IpcDataGenerator { } ;
401
-
402
- let mut buf = Vec :: new ( ) ;
224
+ let mut file = File :: create ( & path) . unwrap ( ) ;
403
225
let rb1 = rb ( 1 ) ;
404
226
405
227
let schema = data_gen. schema_to_bytes_with_dictionary_tracker (
406
228
& rb1. schema ( ) ,
407
229
& mut dictionary_tracker,
408
230
& options,
409
231
) ;
410
- write_message ( & mut buf , schema, & options) . unwrap ( ) ;
232
+ write_message ( & mut file , schema, & options) . unwrap ( ) ;
411
233
412
234
for i in ( 1 ..=3 ) . cycle ( ) . skip ( 1 ) . take ( 10000 ) {
413
235
let ( _, encoded_message) = data_gen
414
236
. encoded_batch ( & rb ( i) , & mut dictionary_tracker, & options)
415
237
. unwrap ( ) ;
416
- write_message ( & mut buf , encoded_message, & options) . unwrap ( ) ;
238
+ write_message ( & mut file , encoded_message, & options) . unwrap ( ) ;
417
239
}
418
240
419
241
let schema = data_gen. schema_to_bytes_with_dictionary_tracker (
420
242
& rb1. schema ( ) ,
421
243
& mut dictionary_tracker,
422
244
& options,
423
245
) ;
424
- write_message ( & mut buf , schema, & options) . unwrap ( ) ;
246
+ write_message ( & mut file , schema, & options) . unwrap ( ) ;
425
247
426
- let buf = Cursor :: new ( buf ) ;
427
- let reader = get_reverse_reader ( buf ) . unwrap ( ) . flatten ( ) ;
248
+ let reader = File :: open ( path ) . unwrap ( ) ;
249
+ let reader = FileReader :: try_new_buffered ( reader , None ) . unwrap ( ) ;
428
250
429
251
let mut sum = 0 ;
430
252
for rb in reader {
253
+ let rb = rb. unwrap ( ) ;
431
254
sum += 1 ;
432
255
assert ! ( rb. num_rows( ) > 0 ) ;
433
256
}
0 commit comments