Commit 4dea140

Update the to_duckdb to use dict and const (#2707)
`to_duckdb` now emits constant and dictionary DuckDB vectors when the Vortex array is constant- or dictionary-encoded.
1 parent 41e27e9 commit 4dea140

10 files changed: +173 -26 lines changed

Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default.

duckdb-vortex/duckdb

duckdb-vortex/duckdb-rs

vortex-duckdb/Cargo.toml

Lines changed: 5 additions & 1 deletion
@@ -17,16 +17,20 @@ readme = { workspace = true }
 name = "vortex_duckdb"
 path = "src/lib.rs"
 bench = false
-crate-type = ["staticlib", "cdylib"]
 
 [dependencies]
+arrow-array = { workspace = true }
 itertools = { workspace = true }
 vortex-array = { workspace = true }
+vortex-dict = { workspace = true }
 vortex-dtype = { workspace = true }
 vortex-error = { workspace = true }
+vortex-scalar = { workspace = true }
 
 [target.'cfg(not(target_arch = "wasm32"))'.dependencies]
 duckdb = { workspace = true }
 
+[dev-dependencies]
+
 [lints]
 workspace = true

vortex-duckdb/cbindgen.toml

Whitespace-only changes.

vortex-duckdb/src/convert/array/mod.rs

Lines changed: 71 additions & 19 deletions
@@ -1,40 +1,73 @@
 mod data_chunk_adaptor;
 
-use duckdb::core::DataChunkHandle;
+use arrow_array::ArrayRef as ArrowArrayRef;
+use duckdb::core::{DataChunkHandle, SelectionVector};
 use duckdb::vtab::arrow::{
     WritableVector, flat_vector_to_arrow_array, write_arrow_array_to_vector,
 };
 use vortex_array::arrays::StructArray;
 use vortex_array::arrow::{FromArrowArray, IntoArrowArray};
+use vortex_array::compute::try_cast;
 use vortex_array::validity::Validity;
-use vortex_array::{Array, ArrayRef};
-use vortex_error::{VortexResult, vortex_err};
+use vortex_array::vtable::EncodingVTable;
+use vortex_array::{Array, ArrayRef, ArrayStatistics, ToCanonical};
+use vortex_dict::{DictArray, DictEncoding};
+use vortex_dtype::DType;
+use vortex_dtype::Nullability::NonNullable;
+use vortex_dtype::PType::U32;
+use vortex_error::{VortexExpect, VortexResult, vortex_err};
 
 use crate::convert::array::data_chunk_adaptor::{
     DataChunkHandleSlice, NamedDataChunk, SizedFlatVector,
 };
+use crate::convert::scalar::ToDuckDBScalar;
 
 pub trait ToDuckDB {
     fn to_duckdb(&self, chunk: &mut dyn WritableVector) -> VortexResult<()>;
 }
 
+pub fn to_duckdb(array: ArrayRef, chunk: &mut dyn WritableVector) -> VortexResult<()> {
+    if let Some(constant) = array.as_constant() {
+        let value = constant.to_duckdb_scalar();
+        chunk.flat_vector().assign_to_constant(&value);
+        Ok(())
+    } else if array.is_encoding(DictEncoding.id()) {
+        array
+            .as_any()
+            .downcast_ref::<DictArray>()
+            .vortex_expect("dict id checked")
+            .to_duckdb(chunk)
+    } else {
+        array.into_arrow_preferred()?.to_duckdb(chunk)
+    }
+}
+
+impl ToDuckDB for DictArray {
+    fn to_duckdb(&self, chunk: &mut dyn WritableVector) -> VortexResult<()> {
+        to_duckdb(self.values().clone(), chunk)?;
+        let indices =
+            try_cast(self.codes(), &DType::Primitive(U32, NonNullable))?.to_primitive()?;
+        let indices = indices.as_slice::<u32>();
+        let sel = SelectionVector::new_copy(indices);
+        chunk.flat_vector().slice(sel);
+        Ok(())
+    }
+}
+
 pub fn to_duckdb_chunk(
     struct_array: &StructArray,
     chunk: &mut DataChunkHandle,
-) -> VortexResult<Vec<bool>> {
-    let mut nullable = vec![false; struct_array.len()];
+) -> VortexResult<()> {
+    chunk.set_len(struct_array.len());
     for (idx, field) in struct_array.fields().iter().enumerate() {
-        field.to_duckdb(&mut DataChunkHandleSlice::new(chunk, idx))?;
-        nullable[idx] = field.dtype().is_nullable();
+        to_duckdb(field.clone(), &mut DataChunkHandleSlice::new(chunk, idx))?;
     }
-    chunk.set_len(struct_array.len());
-    Ok(nullable)
+    Ok(())
 }
 
-impl ToDuckDB for ArrayRef {
+impl ToDuckDB for ArrowArrayRef {
     fn to_duckdb(&self, chunk: &mut dyn WritableVector) -> VortexResult<()> {
-        let arrow = &self.clone().into_arrow_preferred()?;
-        write_arrow_array_to_vector(arrow, chunk)
+        write_arrow_array_to_vector(self, chunk)
             .map_err(|e| vortex_err!("Failed to convert vrotex duckdb array: {}", e.to_string()))
     }
 }
@@ -89,9 +122,11 @@ impl FromDuckDB<SizedFlatVector> for ArrayRef {
 
 #[cfg(test)]
 mod tests {
-    use duckdb::core::DataChunkHandle;
-    use itertools::Itertools;
-    use vortex_array::arrays::{BoolArray, PrimitiveArray, StructArray, VarBinArray};
+
+    use duckdb::core::{DataChunkHandle, LogicalTypeHandle, LogicalTypeId};
+    use vortex_array::arrays::{
+        BoolArray, ConstantArray, PrimitiveArray, StructArray, VarBinArray,
+    };
     use vortex_array::validity::Validity;
     use vortex_array::variants::StructArrayTrait;
     use vortex_array::{Array, ArrayRef, ToCanonical};
@@ -122,16 +157,16 @@ mod tests {
     #[test]
     fn test_vortex_to_duckdb() {
         let arr = data();
-        let ddb_type = arr
+        let (nullable, ddb_type): (Vec<_>, Vec<_>) = arr
            .dtype()
            .as_struct()
            .unwrap()
            .fields()
-            .map(|f| f.to_duckdb_type().unwrap())
-            .collect_vec();
+            .map(|f| (f.is_nullable(), f.to_duckdb_type().unwrap()))
+            .unzip();
         let struct_arr = arr.to_struct().unwrap();
         let mut output_chunk = DataChunkHandle::new(ddb_type.as_slice());
-        let nullable = to_duckdb_chunk(&struct_arr, &mut output_chunk).unwrap();
+        to_duckdb_chunk(&struct_arr, &mut output_chunk).unwrap();
 
         let vx_arr = ArrayRef::from_duckdb(&NamedDataChunk::new(
             &output_chunk,
@@ -149,4 +184,21 @@ mod tests {
         assert_eq!(vx_arr.len(), arr.len());
         assert_eq!(vx_arr.dtype(), arr.dtype());
     }
+
+    #[test]
+    fn test_const_vortex_to_duckdb() {
+        let arr = ConstantArray::new::<i64>(23444233, 100).to_array();
+        let arr2 = ConstantArray::new::<i32>(234, 100).to_array();
+        let st = StructArray::from_fields(&[("1", arr.clone()), ("2", arr2.clone())]).unwrap();
+        let mut output_chunk = DataChunkHandle::new(&[
+            LogicalTypeHandle::from(LogicalTypeId::Bigint),
+            LogicalTypeHandle::from(LogicalTypeId::Integer),
+        ]);
+        to_duckdb_chunk(&st, &mut output_chunk).unwrap();
+
+        assert_eq!(
+            format!("{:?}", output_chunk),
+            "Chunk - [2 Columns]\n- CONSTANT BIGINT: 100 = [ 23444233]\n- CONSTANT INTEGER: 100 = [ 234]\n"
+        )
+    }
 }
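The dispatch above means callers of `to_duckdb_chunk` get DuckDB constant vectors for free when a field is a constant Vortex array. Below is a minimal caller-side sketch that reuses only APIs appearing in this diff (`ConstantArray`, `StructArray::from_fields`, `DataChunkHandle::new`, the `LogicalTypeHandle` constructors); the `vortex_duckdb::convert::to_duckdb_chunk` import path and the `main` wrapper are assumptions made to keep the example self-contained, not something this commit defines.

use duckdb::core::{DataChunkHandle, LogicalTypeHandle, LogicalTypeId};
use vortex_array::Array;
use vortex_array::arrays::{ConstantArray, StructArray};
// Hypothetical import path; the in-crate location of `to_duckdb_chunk` may differ.
use vortex_duckdb::convert::to_duckdb_chunk;

fn main() {
    // A single-column batch whose field is a constant i64 array of length 1_000.
    let field = ConstantArray::new::<i64>(7, 1_000).to_array();
    let batch = StructArray::from_fields(&[("x", field)]).unwrap();

    // Allocate a DuckDB chunk with a matching BIGINT column and convert into it.
    let mut chunk = DataChunkHandle::new(&[LogicalTypeHandle::from(LogicalTypeId::Bigint)]);
    to_duckdb_chunk(&batch, &mut chunk).unwrap();

    // With this change the column is emitted as a DuckDB CONSTANT vector
    // rather than 1_000 materialized copies of the value.
    println!("{chunk:?}");
}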

vortex-duckdb/src/convert/mod.rs

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 mod array;
+mod scalar;
 mod types;
 
 pub use array::{FromDuckDB, ToDuckDB, to_duckdb_chunk};
vortex-duckdb/src/convert/scalar.rs (new file)

Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
+use duckdb::core::Value;
+use vortex_dtype::half::f16;
+use vortex_dtype::{DType, PType, match_each_native_simd_ptype};
+use vortex_error::VortexExpect;
+use vortex_scalar::{BoolScalar, PrimitiveScalar, Scalar};
+
+pub trait ToDuckDBScalar {
+    fn to_duckdb_scalar(&self) -> Value;
+}
+
+impl ToDuckDBScalar for Scalar {
+    fn to_duckdb_scalar(&self) -> Value {
+        match self.dtype() {
+            DType::Null => todo!(),
+            DType::Bool(_) => self.as_bool().to_duckdb_scalar(),
+            DType::Primitive(..) => prim_to_duckdb_scalar(self.as_primitive()),
+            DType::Utf8(_)
+            | DType::Binary(_)
+            | DType::Struct(..)
+            | DType::List(..)
+            | DType::Extension(_) => todo!(),
+        }
+    }
+}
+
+fn prim_to_duckdb_scalar(scalar: PrimitiveScalar) -> Value {
+    if scalar.ptype() == PType::F16 {
+        return Value::from(
+            scalar
+                .as_::<f16>()
+                .vortex_expect("check ptyped")
+                .map(|f| f.to_f32()),
+        );
+    }
+    match_each_native_simd_ptype!(scalar.ptype(), |$P| {
+        Value::from(scalar.as_::<$P>().vortex_expect("ptype value mismatch"))
+    })
+}
+
+impl ToDuckDBScalar for BoolScalar<'_> {
+    fn to_duckdb_scalar(&self) -> Value {
+        Value::from(self.value())
+    }
+}
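As a side note, this trait can also be exercised directly on a `Scalar`. A small sketch, assuming `Scalar` has `From` impls for `i32` and `bool` (those impls are not part of this diff) and that `ToDuckDBScalar` is in scope:

use duckdb::core::Value;
use vortex_scalar::Scalar;

// Hypothetical demo helper; `ToDuckDBScalar` must be imported from wherever the crate exposes it.
fn scalar_demo() -> (Value, Value) {
    // Primitive scalars go through prim_to_duckdb_scalar, booleans through the BoolScalar impl.
    let int_value = Scalar::from(234i32).to_duckdb_scalar();
    let bool_value = Scalar::from(true).to_duckdb_scalar();
    (int_value, bool_value)
}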

vortex-duckdb/src/convert/types/from.rs

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ impl FromDuckDBType<LogicalTypeHandle> for DType {
             | LogicalTypeId::Map
             | LogicalTypeId::Uuid
             | LogicalTypeId::Union
-            | LogicalTypeId::TimestampTZ => todo!(),
+            | LogicalTypeId::TimestampTZ => todo!("missing type: {:?}", type_),
         }
     }
 }

vortex-duckdb/src/convert/types/to.rs

Lines changed: 46 additions & 3 deletions
@@ -1,6 +1,7 @@
 use duckdb::core::{LogicalTypeHandle, LogicalTypeId};
-use vortex_dtype::{DType, PType};
-use vortex_error::{VortexResult, vortex_bail};
+use vortex_dtype::datetime::{TemporalMetadata, TimeUnit, is_temporal_ext_type};
+use vortex_dtype::{DType, ExtDType, PType};
+use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_panic};
 
 pub trait ToDuckDBType {
     fn to_duckdb_type(&self) -> VortexResult<LogicalTypeHandle>;
@@ -39,7 +40,49 @@ impl ToDuckDBType for DType {
                 );
                 Ok(duckdb_type)
             }
-            DType::List(..) | DType::Extension(_) => todo!(),
+            DType::Extension(ext_dtype) => {
+                if is_temporal_ext_type(ext_dtype.id()) {
+                    Ok(ext_to_duckdb(ext_dtype))
+                } else {
+                    vortex_bail!("Unsupported extension type \"{}\"", ext_dtype.id())
+                }
+            }
+            DType::List(..) => todo!("type: {self:?}"),
+        }
+    }
+}
+
+/// Convert temporal ExtDType to a corresponding LogicalType
+///
+/// panics if the ext_dtype is not a temporal dtype
+pub fn ext_to_duckdb(ext_dtype: &ExtDType) -> LogicalTypeHandle {
+    match TemporalMetadata::try_from(ext_dtype)
+        .vortex_expect("make_arrow_temporal_dtype must be called with a temporal ExtDType")
+    {
+        TemporalMetadata::Date(time_unit) => match time_unit {
+            TimeUnit::D => LogicalTypeHandle::from(LogicalTypeId::Date),
+            _ => {
+                vortex_panic!(InvalidArgument: "Invalid TimeUnit {} for {}", time_unit, ext_dtype.id())
+            }
+        },
+        TemporalMetadata::Time(time_unit) => match time_unit {
+            TimeUnit::Ms => LogicalTypeHandle::from(LogicalTypeId::Date),
+            _ => {
+                vortex_panic!(InvalidArgument: "Invalid TimeUnit {} for {}", time_unit, ext_dtype.id())
+            }
+        },
+        TemporalMetadata::Timestamp(time_unit, tz) => {
+            if tz.is_some() {
+                vortex_panic!(InvalidArgument: "Timestamp with timezone is not yet supported")
+            }
+            match time_unit {
+                TimeUnit::Ns => LogicalTypeHandle::from(LogicalTypeId::TimestampNs),
+                TimeUnit::Ms => LogicalTypeHandle::from(LogicalTypeId::TimestampMs),
+                TimeUnit::S => LogicalTypeHandle::from(LogicalTypeId::TimestampS),
+                _ => {
+                    vortex_panic!(InvalidArgument: "Invalid TimeUnit {} for {}", time_unit, ext_dtype.id())
+                }
+            }
         }
     }
 }
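To round out the type mapping, here is a quick sketch of the `to_duckdb_type` entry point this file extends. The `i64` to BIGINT arm is pre-existing code rather than part of this hunk, and the sketch assumes `LogicalTypeHandle::id()` and `LogicalTypeId` equality behave as in duckdb-rs:

use duckdb::core::LogicalTypeId;
use vortex_dtype::{DType, Nullability, PType};

// `ToDuckDBType` must be in scope; its export path is internal to vortex-duckdb.
fn type_mapping_demo() {
    // A non-nullable 64-bit integer Vortex dtype...
    let dt = DType::Primitive(PType::I64, Nullability::NonNullable);
    // ...should map to DuckDB's BIGINT logical type (assumes the pre-existing primitive arm).
    let handle = dt.to_duckdb_type().unwrap();
    assert_eq!(handle.id(), LogicalTypeId::Bigint);
}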
