1 | 1 | //! This module defines the default layout strategy for a Vortex file. |
2 | 2 |
3 | | -use std::collections::VecDeque; |
4 | 3 | use std::sync::Arc; |
5 | 4 |
6 | 5 | use arcref::ArcRef; |
7 | | -use async_stream::try_stream; |
8 | | -use futures::{FutureExt, StreamExt, pin_mut}; |
9 | | -use vortex_array::stats::{PRUNING_STATS, Stat}; |
10 | | -use vortex_array::{Array, ArrayContext}; |
11 | | -use vortex_btrblocks::BtrBlocksCompressor; |
| 6 | +use vortex_array::stats::PRUNING_STATS; |
| 7 | +use vortex_layout::LayoutStrategy; |
| 8 | +use vortex_layout::layouts::buffered::BufferedStrategy; |
12 | 9 | use vortex_layout::layouts::chunked::writer::ChunkedLayoutStrategy; |
| 10 | +use vortex_layout::layouts::compressed::BtrBlocksCompressedStrategy; |
13 | 11 | use vortex_layout::layouts::dict::writer::DictStrategy; |
14 | 12 | use vortex_layout::layouts::flat::writer::FlatLayoutStrategy; |
15 | 13 | use vortex_layout::layouts::repartition::{RepartitionStrategy, RepartitionWriterOptions}; |
16 | 14 | use vortex_layout::layouts::struct_::writer::StructStrategy; |
17 | 15 | use vortex_layout::layouts::zoned::writer::{ZonedLayoutOptions, ZonedStrategy}; |
18 | | -use vortex_layout::scan::{TaskExecutor, TaskExecutorExt}; |
19 | | -use vortex_layout::segments::SequenceWriter; |
20 | | -use vortex_layout::{ |
21 | | - LayoutStrategy, SendableLayoutWriter, SendableSequentialStream, SequentialStreamAdapter, |
22 | | - SequentialStreamExt as _, |
23 | | -}; |
| 16 | +use vortex_layout::scan::TaskExecutor; |
24 | 17 |
25 | 18 | const ROW_BLOCK_SIZE: usize = 8192; |
26 | 19 |
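With the local definitions gone, BufferedStrategy and BtrBlocksCompressedStrategy now come from vortex_layout. As a rough sketch of how the two nest around a child strategy, assuming the moved constructors keep the signatures of the local versions deleted below (the real wiring inside VortexLayoutStrategy is elided from this diff):

// Hypothetical wiring only; `arcref` is the helper kept in this module,
// and the constructor signatures are assumed, not confirmed.
fn example_stack(
    child: ArcRef<dyn LayoutStrategy>,
    executor: Arc<dyn TaskExecutor>,
) -> ArcRef<dyn LayoutStrategy> {
    // Compress each incoming chunk with BtrBlocks before it reaches `child`,
    // keeping up to 8 compression tasks in flight at once.
    let compressed = arcref(BtrBlocksCompressedStrategy::new(child, executor, 8));
    // Hold back roughly 16 MiB of chunks before flushing them downstream.
    arcref(BufferedStrategy::new(compressed, 16 << 20))
}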
@@ -93,121 +86,3 @@ impl VortexLayoutStrategy { |
93 | 86 | fn arcref(item: impl LayoutStrategy) -> ArcRef<dyn LayoutStrategy> { |
94 | 87 | ArcRef::new_arc(Arc::new(item)) |
95 | 88 | } |
96 | | - |
97 | | -/// A layout writer that compresses chunks using a sampling compressor. |
98 | | -struct BtrBlocksCompressedStrategy { |
99 | | - child: ArcRef<dyn LayoutStrategy>, |
100 | | - executor: Arc<dyn TaskExecutor>, |
101 | | - parallelism: usize, |
102 | | -} |
103 | | - |
104 | | -impl BtrBlocksCompressedStrategy { |
105 | | - pub fn new( |
106 | | - child: ArcRef<dyn LayoutStrategy>, |
107 | | - executor: Arc<dyn TaskExecutor>, |
108 | | - parallelism: usize, |
109 | | - ) -> Self { |
110 | | - Self { |
111 | | - child, |
112 | | - executor, |
113 | | - parallelism, |
114 | | - } |
115 | | - } |
116 | | -} |
117 | | - |
118 | | -impl LayoutStrategy for BtrBlocksCompressedStrategy { |
119 | | - fn write_stream( |
120 | | - &self, |
121 | | - ctx: &ArrayContext, |
122 | | - sequence_writer: SequenceWriter, |
123 | | - stream: SendableSequentialStream, |
124 | | - ) -> SendableLayoutWriter { |
125 | | - let executor = self.executor.clone(); |
126 | | - |
127 | | - let dtype = stream.dtype().clone(); |
128 | | - let stream = stream |
129 | | - .map(|chunk| { |
130 | | - async { |
131 | | - let (sequence_id, chunk) = chunk?; |
132 | | - // Compute the stats for the chunk prior to compression |
133 | | - chunk |
134 | | - .statistics() |
135 | | - .compute_all(&Stat::all().collect::<Vec<_>>())?; |
136 | | - Ok((sequence_id, BtrBlocksCompressor.compress(&chunk)?)) |
137 | | - } |
138 | | - .boxed() |
139 | | - }) |
140 | | - .map(move |compress_future| executor.spawn(compress_future)) |
141 | | - .buffered(self.parallelism); |
142 | | - |
143 | | - self.child.write_stream( |
144 | | - ctx, |
145 | | - sequence_writer, |
146 | | - SequentialStreamAdapter::new(dtype, stream).sendable(), |
147 | | - ) |
148 | | - } |
149 | | -} |
150 | | - |
151 | | -struct BufferedStrategy { |
152 | | - child: ArcRef<dyn LayoutStrategy>, |
153 | | - buffer_size: u64, |
154 | | -} |
155 | | - |
156 | | -impl BufferedStrategy { |
157 | | - pub fn new(child: ArcRef<dyn LayoutStrategy>, buffer_size: u64) -> Self { |
158 | | - Self { child, buffer_size } |
159 | | - } |
160 | | -} |
161 | | - |
162 | | -impl LayoutStrategy for BufferedStrategy { |
163 | | - fn write_stream( |
164 | | - &self, |
165 | | - ctx: &ArrayContext, |
166 | | - sequence_writer: SequenceWriter, |
167 | | - stream: SendableSequentialStream, |
168 | | - ) -> SendableLayoutWriter { |
169 | | - let dtype = stream.dtype().clone(); |
170 | | - let buffer_size = self.buffer_size; |
171 | | - let buffered_stream = try_stream! { |
172 | | - let stream = stream.peekable(); |
173 | | - pin_mut!(stream); |
174 | | - |
175 | | - let mut nbytes = 0u64; |
176 | | - let mut chunks = VecDeque::new(); |
177 | | - |
178 | | - while let Some(chunk) = stream.as_mut().next().await { |
179 | | - let (sequence_id, chunk) = chunk?; |
180 | | - nbytes += chunk.nbytes() as u64; |
181 | | - chunks.push_back(chunk); |
182 | | - |
183 | | - // if this is the last element, flush everything |
184 | | - if stream.as_mut().peek().await.is_none() { |
185 | | - let mut sequence_pointer = sequence_id.descend(); |
186 | | - while let Some(chunk) = chunks.pop_front() { |
187 | | - yield (sequence_pointer.advance(), chunk) |
188 | | - } |
189 | | - break; |
190 | | - } |
191 | | - |
192 | | - if nbytes < 2 * buffer_size { |
193 | | - continue; |
194 | | - }; |
195 | | - // Wait until we're at 2x the buffer size before flushing 1x the buffer size |
196 | | - // This avoids small tail stragglers being flushed at the end of the file. |
197 | | - let mut sequence_pointer = sequence_id.descend(); |
198 | | - while nbytes > buffer_size { |
199 | | - let Some(chunk) = chunks.pop_front() else { |
200 | | - break; |
201 | | - }; |
202 | | - nbytes -= chunk.nbytes() as u64; |
203 | | - yield (sequence_pointer.advance(), chunk) |
204 | | - } |
205 | | - } |
206 | | - }; |
207 | | - self.child.write_stream( |
208 | | - ctx, |
209 | | - sequence_writer, |
210 | | - SequentialStreamAdapter::new(dtype, buffered_stream).sendable(), |
211 | | - ) |
212 | | - } |
213 | | -} |
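The deleted BufferedStrategy carries the one piece of real policy in this hunk: it accumulates chunks until twice the target buffer size, then drains back down to the target, so the final flush of a file is never a tiny straggler. (The deleted BtrBlocksCompressedStrategy has the same wrap-the-child shape, spawning one compression task per chunk on the executor and bounding concurrency with buffered(parallelism).) A minimal sketch of just that flush policy, using plain byte counts rather than the vortex_layout types:

use std::collections::VecDeque;

// Given the sizes of incoming chunks, return the batches the buffering
// policy would flush. This is an illustration, not the vortex_layout API.
fn flush_plan(chunk_sizes: &[u64], buffer_size: u64) -> Vec<Vec<u64>> {
    let mut flushes = Vec::new();
    let mut pending = VecDeque::new();
    let mut nbytes = 0u64;

    for (i, &size) in chunk_sizes.iter().enumerate() {
        pending.push_back(size);
        nbytes += size;

        if i + 1 == chunk_sizes.len() {
            // End of stream: flush whatever is left in one final batch.
            flushes.push(pending.drain(..).collect());
            break;
        }
        if nbytes < 2 * buffer_size {
            continue;
        }
        // At 2x the target, drain chunks until we are back under 1x.
        let mut batch = Vec::new();
        while nbytes > buffer_size {
            let Some(size) = pending.pop_front() else { break };
            nbytes -= size;
            batch.push(size);
        }
        flushes.push(batch);
    }
    flushes
}

For example, with buffer_size = 4 and chunk sizes [3, 3, 3, 3, 1], nothing is flushed until 9 bytes have accumulated; the first flush drains [3, 3] and the remaining [3, 3, 1] goes out as the final batch.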