Skip to content

Commit 1eea62a

Browse files
Duckdb REE specialise (#2955)
1 parent 7150be2 commit 1eea62a

File tree

6 files changed

+174
-1
lines changed

6 files changed

+174
-1
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

duckdb-vortex/duckdb-rs

encodings/runend/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
pub use array::*;
2+
pub use iter::trimmed_ends_iter;
23

34
mod array;
45
pub mod compress;

vortex-duckdb/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ vortex-dtype = { workspace = true }
3131
vortex-error = { workspace = true }
3232
vortex-fsst = { workspace = true }
3333
vortex-mask = { workspace = true }
34+
vortex-runend = { workspace = true }
3435
vortex-scalar = { workspace = true }
3536

3637
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]

vortex-duckdb/src/convert/array/mod.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
mod data_chunk_adaptor;
2+
mod run_end;
23
mod varbinview;
34

45
use arrow_array::ArrayRef as ArrowArrayRef;
@@ -23,6 +24,7 @@ use vortex_dtype::{NativePType, match_each_integer_ptype};
2324
use vortex_error::{VortexExpect, VortexResult, vortex_err};
2425
use vortex_fsst::{FSSTArray, FSSTEncoding};
2526
use vortex_mask::Mask;
27+
use vortex_runend::{RunEndArray, RunEndEncoding};
2628

2729
use crate::convert::array::data_chunk_adaptor::{DataChunkHandleSlice, SizedFlatVector};
2830
use crate::convert::scalar::ToDuckDBScalar;
@@ -103,6 +105,13 @@ fn try_to_duckdb(
103105
.vortex_expect("dict id checked")
104106
.to_duckdb(chunk, cache)
105107
.map(Some)
108+
} else if array.is_encoding(RunEndEncoding.id()) {
109+
array
110+
.as_any()
111+
.downcast_ref::<RunEndArray>()
112+
.vortex_expect("dict id checked")
113+
.to_duckdb(chunk, cache)
114+
.map(Some)
106115
} else {
107116
Ok(None)
108117
}
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
use duckdb::core::{FlatVector, SelectionVector};
2+
use duckdb::ffi::{idx_t, sel_t};
3+
use duckdb::vtab::arrow::WritableVector;
4+
use num_traits::AsPrimitive;
5+
use vortex_array::arrays::PrimitiveArray;
6+
use vortex_array::compute::scalar_at;
7+
use vortex_array::variants::PrimitiveArrayTrait;
8+
use vortex_array::{Array, ToCanonical};
9+
use vortex_dtype::{NativePType, match_each_integer_ptype};
10+
use vortex_error::VortexResult;
11+
use vortex_runend::{RunEndArray, trimmed_ends_iter};
12+
13+
use crate::convert::array::to_duckdb;
14+
use crate::convert::scalar::ToDuckDBScalar;
15+
use crate::{ConversionCache, DUCKDB_STANDARD_VECTOR_SIZE, ToDuckDB, ToDuckDBType};
16+
17+
/// Builds a DuckDB [`SelectionVector`] from a run-end "ends" array.
///
/// Dispatches on the integer ptype of `ends` and forwards to
/// [`selection_vector_from_ends_slice`] with the matching typed slice view.
///
/// * `offset` — logical offset of the (possibly sliced) run-end array.
/// * `length` — number of logical rows to cover (checked against the DuckDB
///   standard vector size by the callee).
pub fn selection_vector_from_ends_array(
    ends: PrimitiveArray,
    offset: usize,
    length: usize,
) -> VortexResult<SelectionVector> {
    // `match_each_integer_ptype!` expands the body once per integer ptype,
    // binding `$E` to the concrete Rust element type of `ends`.
    match_each_integer_ptype!(ends.ptype(), |$E| {
        selection_vector_from_ends_slice(
            ends.as_slice::<$E>(),
            offset,
            length,
        )
    })
}
30+
31+
pub fn selection_vector_from_ends_slice<E: NativePType + AsPrimitive<usize> + Ord>(
32+
ends: &[E],
33+
offset: usize,
34+
length: usize,
35+
) -> VortexResult<SelectionVector> {
36+
assert!(length <= DUCKDB_STANDARD_VECTOR_SIZE);
37+
38+
let mut selection = SelectionVector::new(length as idx_t);
39+
let data_slice = selection.as_data_slice();
40+
41+
let mut start = 0;
42+
for (value, end) in trimmed_ends_iter(ends, offset, length).enumerate() {
43+
assert!(end <= length, "Runend end must be less than overall length");
44+
45+
// SAFETY:
46+
// We preallocate enough capacity because we know the total length
47+
unsafe {
48+
data_slice
49+
.get_unchecked_mut(start..end)
50+
.fill(sel_t::try_from(value)?);
51+
}
52+
start = end;
53+
}
54+
Ok(selection)
55+
}
56+
57+
// We can convert a run end array into a dictionary like array and pass that to duckdb.
impl ToDuckDB for RunEndArray {
    /// Converts this run-end array into a DuckDB dictionary vector: the run
    /// values become the dictionary, and a selection vector maps each logical
    /// row to its run's value index.
    fn to_duckdb(
        &self,
        chunk: &mut dyn WritableVector,
        cache: &mut ConversionCache,
    ) -> VortexResult<()> {
        // A single run means every row holds the same value — emit a
        // DuckDB constant vector instead of a dictionary.
        if self.values().len() == 1 {
            let constant = scalar_at(self.values(), 0)?;
            let value = constant.try_to_duckdb_scalar()?;
            chunk.flat_vector().assign_to_constant(&value);
            return Ok(());
        }

        let mut vector: FlatVector = if self.values().len() <= DUCKDB_STANDARD_VECTOR_SIZE {
            // Values fit in the chunk's own vector: convert in place.
            to_duckdb(self.values(), chunk, cache)?;
            chunk.flat_vector()
        } else {
            // If the values don't fit, allocate a larger vector and have the
            // data chunk's vector reference this new one.
            let mut value_vector = FlatVector::allocate_new_vector_with_capacity(
                self.values().dtype().to_duckdb_type()?,
                self.values().len(),
            );
            to_duckdb(self.values(), &mut value_vector, cache)?;

            let mut vector = chunk.flat_vector();
            vector.reference(&value_vector);
            vector
        };
        // Build the row -> value-index mapping from the (offset-trimmed)
        // run ends.
        let sel = selection_vector_from_ends_array(
            self.ends().to_primitive()?,
            self.offset(),
            self.len(),
        )?;
        // Turn the flat values vector into a dictionary vector selected
        // through `sel`. NOTE(review): first argument appears to be the
        // dictionary (values) size — confirm against duckdb-rs `slice` docs.
        vector.slice(self.values().len() as u64, sel);
        Ok(())
    }
}
96+
97+
#[cfg(test)]
mod tests {
    use duckdb::core::{DataChunkHandle, LogicalTypeHandle, LogicalTypeId};
    use itertools::Itertools;
    use vortex_array::arrays::StructArray;
    use vortex_array::compute::slice;
    use vortex_array::{Array, IntoArray};
    use vortex_buffer::buffer;
    use vortex_runend::RunEndArray;

    use crate::{ConversionCache, to_duckdb_chunk};

    /// Small run-end array (values fit in one DuckDB vector), sliced to
    /// exercise a non-zero offset; expects a DICTIONARY vector.
    #[test]
    fn test_run_end_array_to_duckdb() {
        let arr = RunEndArray::try_new(
            buffer![2u32, 5, 10, 14].into_array(),
            buffer![1i32, 2, 3, 4].into_array(),
        )
        .unwrap();

        // Slice rows [1, 5): one row of run 1, three rows of run 2.
        let arr = slice(arr.to_array().as_ref(), 1, 5).unwrap();

        let struct_ = StructArray::from_fields(&[("a", arr)]).unwrap();

        let mut chunk = DataChunkHandle::new(&[LogicalTypeHandle::from(LogicalTypeId::Integer)]);
        to_duckdb_chunk(&struct_, &mut chunk, &mut ConversionCache::default()).unwrap();

        chunk.verify();
        assert_eq!(
            format!("{:?}", chunk),
            r#"Chunk - [1 Columns]
- DICTIONARY INTEGER: 4 = [ 1, 2, 2, 2]
"#
        );
    }

    /// Long runs sliced to a full standard vector (2048 rows = 100 + 1000
    /// + 948), checking the selection mapping across run boundaries.
    #[test]
    fn test_run_end_array_large_to_duckdb() {
        let arr = RunEndArray::try_new(
            buffer![1000u32, 2000, 3000].into_array(),
            buffer![1i32, 2, 3].into_array(),
        )
        .unwrap();

        // Rows [900, 2948): tail of run 1, all of run 2, head of run 3.
        let arr = slice(arr.to_array().as_ref(), 900, 2948).unwrap();

        let struct_ = StructArray::from_fields(&[("a", arr)]).unwrap();

        let mut chunk = DataChunkHandle::new(&[LogicalTypeHandle::from(LogicalTypeId::Integer)]);
        to_duckdb_chunk(&struct_, &mut chunk, &mut ConversionCache::default()).unwrap();

        chunk.verify();
        assert_eq!(
            format!("{:?}", chunk),
            format!(
                r#"Chunk - [1 Columns]
- DICTIONARY INTEGER: 2048 = [ {}, {}, {}]
"#,
                (0..100).map(|_| "1").join(", "),
                (0..1000).map(|_| "2").join(", "),
                (0..948).map(|_| "3").join(", "),
            ),
        );
    }
}

0 commit comments

Comments
 (0)