@@ -27,7 +27,6 @@ use std::{
 };
 
 use arrow_array::RecordBatch;
-use arrow_ipc::writer::FileWriter;
 use arrow_schema::{Field, Fields, Schema};
 use chrono::{NaiveDateTime, Timelike, Utc};
 use derive_more::{Deref, DerefMut};
@@ -41,7 +40,7 @@ use parquet::{
 };
 use rand::distributions::DistString;
 use relative_path::RelativePathBuf;
-use tracing::{error, info, trace, warn};
+use tracing::{debug, error, info, trace, warn};
 
 use crate::{
     cli::Options,
@@ -57,16 +56,18 @@ use crate::{
 };
 
 use super::{
-    staging::{reader::MergedRecordReader, writer::Writer, StagingError},
+    staging::{
+        reader::MergedRecordReader,
+        writer::{DiskWriter, Writer},
+        StagingError,
+    },
     LogStream,
 };
 
 #[derive(Debug, thiserror::Error)]
 #[error("Stream not found: {0}")]
 pub struct StreamNotFound(pub String);
 
-const ARROW_FILE_EXTENSION: &str = "data.arrows";
-
 pub type StreamRef = Arc<Stream>;
 
 /// All state associated with a single logstream in Parseable.
@@ -116,22 +117,15 @@ impl Stream {
             }
             None => {
                 // entry is not present thus we create it
-                let file_path = self.path_by_current_time(
+                let path_prefix = self.path_prefix_by_current_time(
                     schema_key,
                     parsed_timestamp,
                     custom_partition_values,
                 );
-                std::fs::create_dir_all(&self.data_path)?;
-
-                let file = OpenOptions::new()
-                    .create(true)
-                    .append(true)
-                    .open(&file_path)?;
-
-                let mut writer = FileWriter::try_new_buffered(file, &record.schema())
-                    .expect("File and RecordBatch both are checked");
 
+                let mut writer = DiskWriter::new(path_prefix, &record.schema())?;
                 writer.write(record)?;
+
                 guard.disk.insert(schema_key.to_owned(), writer);
             }
         };
@@ -142,7 +136,7 @@ impl Stream {
         Ok(())
     }
 
-    pub fn path_by_current_time(
+    pub fn path_prefix_by_current_time(
         &self,
         stream_hash: &str,
         parsed_timestamp: NaiveDateTime,
@@ -153,7 +147,7 @@ impl Stream {
             hostname.push_str(id);
         }
         let filename = format!(
-            "{}{stream_hash}.date={}.hour={:02}.minute={}.{}{hostname}.{ARROW_FILE_EXTENSION}",
+            "{}{stream_hash}.date={}.hour={:02}.minute={}.{}{hostname}",
             Utc::now().format("%Y%m%dT%H%M"),
             parsed_timestamp.date(),
             parsed_timestamp.hour(),
@@ -345,16 +339,34 @@ impl Stream {
         Ok(())
     }
 
-    pub fn recordbatches_cloned(&self, schema: &Arc<Schema>) -> Vec<RecordBatch> {
-        self.writer.lock().unwrap().mem.recordbatch_cloned(schema)
+    /// Returns record batches as found in staging
+    pub fn recordbatches_cloned(
+        &self,
+        schema: &Arc<Schema>,
+        time_partition: Option<String>,
+    ) -> Vec<RecordBatch> {
+        // All records found in memory
+        let mut records = self.writer.lock().unwrap().mem.recordbatch_cloned(schema);
+        // Append record batches picked up from `.arrows` files
+        let arrow_files = self.arrow_files();
+        let record_reader = MergedRecordReader::new(&arrow_files);
+        if record_reader.readers.is_empty() {
+            // Nothing on disk, so return the in-memory batches alone
+            return records;
+        }
+        let mut from_file = record_reader
+            .merged_iter(schema.clone(), time_partition)
+            .collect();
+        records.append(&mut from_file);
+
+        records
     }
 
     pub fn clear(&self) {
         self.writer.lock().unwrap().mem.clear();
     }
 
     pub fn flush(&self) {
-        let mut disk_writers = {
+        let disk_writers = {
             let mut writer = self.writer.lock().unwrap();
             // Flush memory
             writer.mem.clear();
@@ -363,8 +375,12 @@ impl Stream {
         };
 
         // Flush disk
-        for writer in disk_writers.values_mut() {
-            _ = writer.finish();
+        for (_, writer) in disk_writers {
+            if let Err(err) = writer.finish() {
+                warn!("Couldn't finish `.arrows` file: {err}");
+            } else {
+                debug!("Finished `.arrows` file sync onto disk")
+            }
         }
     }
 
@@ -855,16 +871,19 @@ mod tests {
         );
 
         let expected_path = staging.data_path.join(format!(
-            "{}{stream_hash}.date={}.hour={:02}.minute={}.{}.{ARROW_FILE_EXTENSION}",
+            "{}{stream_hash}.date={}.hour={:02}.minute={}.{}",
             Utc::now().format("%Y%m%dT%H%M"),
             parsed_timestamp.date(),
             parsed_timestamp.hour(),
             minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(),
             hostname::get().unwrap().into_string().unwrap()
         ));
 
-        let generated_path =
-            staging.path_by_current_time(stream_hash, parsed_timestamp, &custom_partition_values);
+        let generated_path = staging.path_prefix_by_current_time(
+            stream_hash,
+            parsed_timestamp,
+            &custom_partition_values,
+        );
 
         assert_eq!(generated_path, expected_path);
     }
@@ -890,16 +909,19 @@ mod tests {
        );
 
        let expected_path = staging.data_path.join(format!(
-            "{}{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.{ARROW_FILE_EXTENSION}",
+            "{}{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}",
             Utc::now().format("%Y%m%dT%H%M"),
             parsed_timestamp.date(),
             parsed_timestamp.hour(),
             minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(),
             hostname::get().unwrap().into_string().unwrap()
        ));
 
-        let generated_path =
-            staging.path_by_current_time(stream_hash, parsed_timestamp, &custom_partition_values);
+        let generated_path = staging.path_prefix_by_current_time(
+            stream_hash,
+            parsed_timestamp,
+            &custom_partition_values,
+        );
 
        assert_eq!(generated_path, expected_path);
    }
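
Note: the `DiskWriter` these hunks adopt is defined in `staging::writer`, which this diff does not show. The sketch below is a minimal approximation of the surface the hunks rely on (`new(path_prefix, &schema)`, `write(&record)`, and a consuming `finish()`); the `.part.arrows`/`.data.arrows` extensions, the boxed error alias, and the rename-on-finish behavior are assumptions about code outside this diff, not the crate's actual implementation.

// Sketch only: approximates the DiskWriter surface used in the hunks above.
use std::{
    fs::{create_dir_all, rename, File, OpenOptions},
    io::BufWriter,
    path::{Path, PathBuf},
};

use arrow_array::RecordBatch;
use arrow_ipc::writer::FileWriter;
use arrow_schema::Schema;

// Stand-in for the crate's StagingError (assumption).
type Error = Box<dyn std::error::Error>;

/// Appends a suffix without treating the dots already in the
/// filename (e.g. `.minute=10`) as an extension boundary.
fn with_suffix(path: &Path, suffix: &str) -> PathBuf {
    let mut os = path.as_os_str().to_os_string();
    os.push(suffix);
    PathBuf::from(os)
}

pub struct DiskWriter {
    inner: FileWriter<BufWriter<File>>,
    /// Extensionless path, as produced by `path_prefix_by_current_time`
    path_prefix: PathBuf,
}

impl DiskWriter {
    /// Opens a buffered Arrow IPC writer on `<prefix>.part.arrows`
    /// (assumed in-progress extension), creating the staging
    /// directory if it doesn't exist yet.
    pub fn new(path_prefix: PathBuf, schema: &Schema) -> Result<Self, Error> {
        if let Some(dir) = path_prefix.parent() {
            create_dir_all(dir)?;
        }
        let file = OpenOptions::new()
            .create(true)
            .append(true)
            .open(with_suffix(&path_prefix, ".part.arrows"))?;
        let inner = FileWriter::try_new_buffered(file, schema)?;

        Ok(Self { inner, path_prefix })
    }

    /// Appends a record batch to the in-progress file.
    pub fn write(&mut self, batch: &RecordBatch) -> Result<(), Error> {
        Ok(self.inner.write(batch)?)
    }

    /// Writes the IPC footer, then promotes the file to `.data.arrows`
    /// so readers only ever pick up completed files.
    pub fn finish(mut self) -> Result<(), Error> {
        self.inner.finish()?;
        rename(
            with_suffix(&self.path_prefix, ".part.arrows"),
            with_suffix(&self.path_prefix, ".data.arrows"),
        )?;
        Ok(())
    }
}

A consuming `finish(self)` matches how `flush` iterates `for (_, writer) in disk_writers`, and keeping the in-progress extension distinct explains why the two path tests above now expect an extensionless prefix.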