33use std:: sync:: Arc ;
44
55use vortex_array:: arcref:: ArcRef ;
6+ use vortex_array:: nbytes:: NBytes ;
67use vortex_array:: stats:: { PRUNING_STATS , STATS_TO_WRITE } ;
78use vortex_array:: { Array , ArrayContext , ArrayRef } ;
89use vortex_btrblocks:: BtrBlocksCompressor ;
910use vortex_dtype:: DType ;
1011use vortex_error:: VortexResult ;
1112use vortex_layout:: layouts:: chunked:: writer:: { ChunkedLayoutOptions , ChunkedLayoutWriter } ;
1213use vortex_layout:: layouts:: flat:: FlatLayout ;
13- use vortex_layout:: layouts:: flat:: writer:: FlatLayoutOptions ;
1414use vortex_layout:: layouts:: stats:: writer:: { StatsLayoutOptions , StatsLayoutWriter } ;
1515use vortex_layout:: layouts:: struct_:: writer:: StructLayoutWriter ;
1616use vortex_layout:: segments:: SegmentWriter ;
@@ -32,13 +32,24 @@ impl LayoutStrategy for VortexLayoutStrategy {
3232 ) ;
3333 }
3434
35- // Otherwise, we finish with compressing the chunks
35+ // Leaf arrays are written as flat arrays, above which, we buffer up to 16MB to try to keep
36+ // chunks for the same column next to each other, within some reasonable write-time memory
37+ // buffering limit.
38+ let writer: ArcRef < dyn LayoutStrategy > = ArcRef :: new_arc ( Arc :: new ( BufferedStrategy {
39+ child : ArcRef :: new_ref ( & FlatLayout ) ,
40+ // Buffer 4MB of compressed data per column before writing the chunks consecutively.
41+ // TODO(ngates): this should really be amortized by the number of fields? Maybe the
42+ // strategy could keep track of how many writers were created?
43+ buffer_size : 4 << 20 , // 4 MB
44+ } ) as _ ) ;
45+
46+ // Compress each chunk with btrblocks.
3647 let writer = BtrBlocksCompressedWriter {
3748 child : ChunkedLayoutWriter :: new (
3849 ctx. clone ( ) ,
3950 & DType :: Null ,
4051 ChunkedLayoutOptions {
41- chunk_strategy : ArcRef :: new_arc ( Arc :: new ( FlatLayoutOptions :: default ( ) ) as _ ) ,
52+ chunk_strategy : writer ,
4253 } ,
4354 )
4455 . boxed ( ) ,
@@ -117,6 +128,64 @@ impl LayoutWriter for BtrBlocksCompressedWriter {
117128 self . child . push_chunk ( segment_writer, compressed)
118129 }
119130
131+ fn flush ( & mut self , segment_writer : & mut dyn SegmentWriter ) -> VortexResult < ( ) > {
132+ self . child . flush ( segment_writer)
133+ }
134+
135+ fn finish ( & mut self , segment_writer : & mut dyn SegmentWriter ) -> VortexResult < Layout > {
136+ self . child . finish ( segment_writer)
137+ }
138+ }
139+
140+ struct BufferedStrategy {
141+ child : ArcRef < dyn LayoutStrategy > ,
142+ buffer_size : u64 ,
143+ }
144+
145+ impl LayoutStrategy for BufferedStrategy {
146+ fn new_writer ( & self , ctx : & ArrayContext , dtype : & DType ) -> VortexResult < Box < dyn LayoutWriter > > {
147+ let child = self . child . new_writer ( ctx, dtype) ?;
148+ Ok ( BufferedWriter {
149+ chunks : Vec :: new ( ) ,
150+ nbytes : 0 ,
151+ buffer_size : self . buffer_size ,
152+ child,
153+ }
154+ . boxed ( ) )
155+ }
156+ }
157+
158+ struct BufferedWriter {
159+ chunks : Vec < ArrayRef > ,
160+ nbytes : u64 ,
161+ buffer_size : u64 ,
162+ child : Box < dyn LayoutWriter > ,
163+ }
164+
165+ impl LayoutWriter for BufferedWriter {
166+ fn push_chunk (
167+ & mut self ,
168+ segment_writer : & mut dyn SegmentWriter ,
169+ chunk : ArrayRef ,
170+ ) -> VortexResult < ( ) > {
171+ self . nbytes += chunk. nbytes ( ) as u64 ;
172+ self . chunks . push ( chunk) ;
173+ if self . nbytes >= self . buffer_size {
174+ for chunk in self . chunks . drain ( ..) {
175+ self . child . push_chunk ( segment_writer, chunk) ?;
176+ }
177+ self . nbytes = 0 ;
178+ }
179+ Ok ( ( ) )
180+ }
181+
182+ fn flush ( & mut self , segment_writer : & mut dyn SegmentWriter ) -> VortexResult < ( ) > {
183+ for chunk in self . chunks . drain ( ..) {
184+ self . child . push_chunk ( segment_writer, chunk) ?;
185+ }
186+ self . child . flush ( segment_writer)
187+ }
188+
120189 fn finish ( & mut self , segment_writer : & mut dyn SegmentWriter ) -> VortexResult < Layout > {
121190 self . child . finish ( segment_writer)
122191 }
0 commit comments