|
use std::collections::BTreeSet;
use std::ops::Range;

use itertools::Itertools;
use vortex_error::{vortex_bail, VortexResult};
use vortex_layout::LayoutData;
| 7 | + |
/// Defines how the Vortex file is split into batches for reading.
///
/// Each variant describes a strategy for turning the rows of a file into a
/// sequence of consecutive, non-overlapping row ranges.
///
/// Note that each split must fit into the platform's maximum usize.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SplitBy {
    /// Splits any time there is a chunk boundary in the file.
    Layout,
    /// Splits every n rows.
    ///
    /// `n` must be non-zero. The final split may contain fewer than `n` rows
    /// when the total row count is not a multiple of `n`.
    RowCount(usize),
    // UncompressedSize(u64),
}
| 19 | + |
| 20 | +impl SplitBy { |
| 21 | + /// Compute the splits for the given layout. |
| 22 | + pub(crate) fn splits(&self, layout: &LayoutData) -> VortexResult<Vec<Range<u64>>> { |
| 23 | + Ok(match *self { |
| 24 | + SplitBy::Layout => { |
| 25 | + let mut row_splits = BTreeSet::<u64>::new(); |
| 26 | + // Make sure we always have the first and last row. |
| 27 | + row_splits.insert(0); |
| 28 | + row_splits.insert(layout.row_count()); |
| 29 | + // Register the splits for all the layouts. |
| 30 | + layout.register_splits(0, &mut row_splits)?; |
| 31 | + row_splits |
| 32 | + .into_iter() |
| 33 | + .tuple_windows() |
| 34 | + .map(|(start, end)| start..end) |
| 35 | + .collect() |
| 36 | + } |
| 37 | + SplitBy::RowCount(n) => { |
| 38 | + let row_count = layout.row_count(); |
| 39 | + let mut splits = |
| 40 | + Vec::with_capacity(usize::try_from((row_count + n as u64) / n as u64)?); |
| 41 | + for start in (0..row_count).step_by(n) { |
| 42 | + let end = (start + n as u64).min(row_count); |
| 43 | + splits.push(start..end); |
| 44 | + } |
| 45 | + splits |
| 46 | + } |
| 47 | + }) |
| 48 | + } |
| 49 | +} |
| 50 | + |
#[cfg(test)]
mod test {
    use vortex_array::IntoArrayData;
    use vortex_buffer::buffer;
    use vortex_dtype::DType;
    use vortex_dtype::Nullability::NonNullable;
    use vortex_layout::layouts::flat::writer::FlatLayoutWriter;
    use vortex_layout::strategies::LayoutWriterExt;

    use super::*;
    use crate::v2::segments::BufferedSegmentWriter;

    // NOTE(review): both tests create the writer with a Bool dtype but feed it
    // an integer buffer — confirm this mismatch is intentional.

    /// Splitting by layout on a single ten-row flat layout gives one range
    /// that covers every row.
    #[test]
    fn test_layout_splits_flat() {
        let mut segment_writer = BufferedSegmentWriter::default();
        let ten_rows = buffer![1; 10].into_array();
        let flat_layout = FlatLayoutWriter::new(DType::Bool(NonNullable))
            .push_one(&mut segment_writer, ten_rows)
            .unwrap();

        let ranges = SplitBy::Layout.splits(&flat_layout).unwrap();

        assert_eq!(ranges, vec![0..10]);
    }

    /// Splitting ten rows three at a time gives three full ranges followed by
    /// a shorter trailing range.
    #[test]
    fn test_row_count_splits() {
        let mut segment_writer = BufferedSegmentWriter::default();
        let ten_rows = buffer![1; 10].into_array();
        let flat_layout = FlatLayoutWriter::new(DType::Bool(NonNullable))
            .push_one(&mut segment_writer, ten_rows)
            .unwrap();

        let ranges = SplitBy::RowCount(3).splits(&flat_layout).unwrap();

        assert_eq!(ranges, vec![0..3, 3..6, 6..9, 9..10]);
    }
}
0 commit comments