Skip to content

Commit 36b83bc

Browse files
y-f-u and peasee authored
Add support for FixedSizeBinary, Duration, and Interval types in Arrow (#374)
* add support of fixedsizebinary, duration, interval support in arrow * chore: Cargo fmt * fix: clippy::expect-fun-call --------- Co-authored-by: peasee <[email protected]>
1 parent 02a0f3e commit 36b83bc

File tree

1 file changed

+119
-11
lines changed

1 file changed

+119
-11
lines changed

crates/duckdb/src/vtab/arrow.rs

Lines changed: 119 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,14 @@ use super::{BindInfo, DataChunkHandle, Free, FunctionInfo, InitInfo, LogicalType
22
use std::ptr::null_mut;
33

44
use crate::core::{ArrayVector, FlatVector, Inserter, ListVector, StructVector, Vector};
5-
use arrow::array::{
6-
as_boolean_array, as_generic_binary_array, as_large_list_array, as_list_array, as_primitive_array, as_string_array,
7-
as_struct_array, Array, ArrayData, AsArray, BinaryArray, BooleanArray, Decimal128Array, FixedSizeListArray,
8-
GenericListArray, GenericStringArray, LargeStringArray, OffsetSizeTrait, PrimitiveArray, StructArray,
5+
use arrow::{
6+
array::{
7+
as_boolean_array, as_generic_binary_array, as_large_list_array, as_list_array, as_primitive_array,
8+
as_string_array, as_struct_array, Array, ArrayData, AsArray, BinaryArray, BooleanArray, Decimal128Array,
9+
FixedSizeBinaryArray, FixedSizeListArray, GenericListArray, GenericStringArray, IntervalMonthDayNanoArray,
10+
LargeBinaryArray, LargeStringArray, OffsetSizeTrait, PrimitiveArray, StructArray,
11+
},
12+
compute::cast,
913
};
1014

1115
use arrow::{
@@ -194,9 +198,12 @@ pub fn to_duckdb_logical_type(data_type: &DataType) -> Result<LogicalTypeHandle,
194198
// DuckDB does not support negative decimal scales
195199
Ok(LogicalTypeHandle::decimal(*width, (*scale).try_into().unwrap()))
196200
}
197-
DataType::Boolean | DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => {
198-
Ok(LogicalTypeHandle::from(to_duckdb_type_id(data_type)?))
199-
}
201+
DataType::Boolean
202+
| DataType::Utf8
203+
| DataType::LargeUtf8
204+
| DataType::Binary
205+
| DataType::LargeBinary
206+
| DataType::FixedSizeBinary(_) => Ok(LogicalTypeHandle::from(to_duckdb_type_id(data_type)?)),
200207
dtype if dtype.is_primitive() => Ok(LogicalTypeHandle::from(to_duckdb_type_id(data_type)?)),
201208
_ => Err(format!(
202209
"Unsupported data type: {data_type}, please file an issue https://github.com/wangfenjin/duckdb-rs"
@@ -238,6 +245,18 @@ pub fn record_batch_to_duckdb_data_chunk(
238245
DataType::Binary => {
239246
binary_array_to_vector(as_generic_binary_array(col.as_ref()), &mut chunk.flat_vector(i));
240247
}
248+
DataType::FixedSizeBinary(_) => {
249+
fixed_size_binary_array_to_vector(col.as_ref().as_fixed_size_binary(), &mut chunk.flat_vector(i));
250+
}
251+
DataType::LargeBinary => {
252+
large_binary_array_to_vector(
253+
col.as_ref()
254+
.as_any()
255+
.downcast_ref::<LargeBinaryArray>()
256+
.ok_or_else(|| Box::<dyn std::error::Error>::from("Unable to downcast to LargeBinaryArray"))?,
257+
&mut chunk.flat_vector(i),
258+
);
259+
}
241260
DataType::List(_) => {
242261
list_array_to_vector(as_list_array(col.as_ref()), &mut chunk.list_vector(i))?;
243262
}
@@ -276,7 +295,7 @@ fn primitive_array_to_flat_vector_cast<T: ArrowPrimitiveType>(
276295
array: &dyn Array,
277296
out_vector: &mut dyn Vector,
278297
) {
279-
let array = arrow::compute::kernels::cast::cast(array, &data_type).unwrap();
298+
let array = cast(array, &data_type).unwrap_or_else(|_| panic!("array is casted into {data_type}"));
280299
let out_vector: &mut FlatVector = out_vector.as_mut_any().downcast_mut().unwrap();
281300
out_vector.copy::<T::Native>(array.as_primitive::<T>().values());
282301
set_nulls_in_flat_vector(&array, out_vector);
@@ -354,7 +373,21 @@ fn primitive_array_to_vector(array: &dyn Array, out: &mut dyn Vector) -> Result<
354373
*width,
355374
);
356375
}
357-
376+
DataType::Interval(_) | DataType::Duration(_) => {
377+
let array = IntervalMonthDayNanoArray::from(
378+
cast(array, &DataType::Interval(IntervalUnit::MonthDayNano))
379+
.expect("array is casted into IntervalMonthDayNanoArray")
380+
.as_primitive::<IntervalMonthDayNanoType>()
381+
.values()
382+
.iter()
383+
.map(|a| IntervalMonthDayNanoType::make_value(a.months, a.days, a.nanoseconds / 1000))
384+
.collect::<Vec<_>>(),
385+
);
386+
primitive_array_to_flat_vector::<IntervalMonthDayNanoType>(
387+
as_primitive_array(&array),
388+
out.as_mut_any().downcast_mut().unwrap(),
389+
);
390+
}
358391
// DuckDB only supports timestamp_tz in microsecond precision
359392
DataType::Timestamp(_, Some(tz)) => primitive_array_to_flat_vector_cast::<TimestampMicrosecondType>(
360393
DataType::Timestamp(TimeUnit::Microsecond, Some(tz.clone())),
@@ -463,6 +496,28 @@ fn binary_array_to_vector(array: &BinaryArray, out: &mut FlatVector) {
463496
set_nulls_in_flat_vector(array, out);
464497
}
465498

499+
fn fixed_size_binary_array_to_vector(array: &FixedSizeBinaryArray, out: &mut FlatVector) {
500+
assert!(array.len() <= out.capacity());
501+
502+
for i in 0..array.len() {
503+
let s = array.value(i);
504+
out.insert(i, s);
505+
}
506+
// Put this back once the other PR #
507+
// set_nulls_in_flat_vector(array, out);
508+
}
509+
510+
fn large_binary_array_to_vector(array: &LargeBinaryArray, out: &mut FlatVector) {
511+
assert!(array.len() <= out.capacity());
512+
513+
for i in 0..array.len() {
514+
let s = array.value(i);
515+
out.insert(i, s);
516+
}
517+
// Put this back once the other PR #
518+
// set_nulls_in_flat_vector(array, out);
519+
}
520+
466521
fn list_array_to_vector<O: OffsetSizeTrait + AsPrimitive<usize>>(
467522
array: &GenericListArray<O>,
468523
out: &mut ListVector,
@@ -648,12 +703,16 @@ mod test {
648703
use arrow::{
649704
array::{
650705
Array, ArrayRef, AsArray, BinaryArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array,
651-
FixedSizeListArray, GenericByteArray, GenericListArray, Int32Array, LargeStringArray, ListArray,
706+
DurationSecondArray, FixedSizeListArray, GenericByteArray, GenericListArray, Int32Array,
707+
IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeStringArray, ListArray,
652708
OffsetSizeTrait, PrimitiveArray, StringArray, StructArray, Time32SecondArray, Time64MicrosecondArray,
653709
TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray,
654710
},
655711
buffer::{OffsetBuffer, ScalarBuffer},
656-
datatypes::{i256, ArrowPrimitiveType, ByteArrayType, DataType, Field, Fields, Schema},
712+
datatypes::{
713+
i256, ArrowPrimitiveType, ByteArrayType, DataType, DurationSecondType, Field, Fields, IntervalDayTimeType,
714+
IntervalMonthDayNanoType, IntervalYearMonthType, Schema,
715+
},
657716
record_batch::RecordBatch,
658717
};
659718
use std::{error::Error, sync::Arc};
@@ -1088,6 +1147,55 @@ mod test {
10881147
Ok(())
10891148
}
10901149

1150+
#[test]
fn test_interval_roundtrip() -> Result<(), Box<dyn Error>> {
    // MonthDayNano input: (k months, k days, 1000 * k nanoseconds) for k = 1..=3.
    // The same array is used as the expected result.
    let month_day_nano: PrimitiveArray<IntervalMonthDayNanoType> = IntervalMonthDayNanoArray::from(
        (1..=3_i32)
            .map(|k| IntervalMonthDayNanoType::make_value(k, k, 1_000 * i64::from(k)))
            .collect::<Vec<_>>(),
    );
    check_rust_primitive_array_roundtrip(month_day_nano.clone(), month_day_nano)?;

    // YearMonth input: k years plus 10 * k months, expected back as 22 * k months.
    let year_month: PrimitiveArray<IntervalYearMonthType> = IntervalYearMonthArray::from(
        (1..=3_i32)
            .map(|k| IntervalYearMonthType::make_value(k, 10 * k))
            .collect::<Vec<_>>(),
    );
    let year_month_expected: PrimitiveArray<IntervalMonthDayNanoType> = IntervalMonthDayNanoArray::from(
        (1..=3_i32)
            .map(|k| IntervalMonthDayNanoType::make_value(22 * k, 0, 0))
            .collect::<Vec<_>>(),
    );
    check_rust_primitive_array_roundtrip(year_month, year_month_expected)?;

    // DayTime input: k days plus k milliseconds, expected back as k days plus
    // 1_000_000 * k nanoseconds.
    let day_time: PrimitiveArray<IntervalDayTimeType> = IntervalDayTimeArray::from(
        (1..=3_i32)
            .map(|k| IntervalDayTimeType::make_value(k, k))
            .collect::<Vec<_>>(),
    );
    let day_time_expected: PrimitiveArray<IntervalMonthDayNanoType> = IntervalMonthDayNanoArray::from(
        (1..=3_i32)
            .map(|k| IntervalMonthDayNanoType::make_value(0, k, 1_000_000 * i64::from(k)))
            .collect::<Vec<_>>(),
    );
    check_rust_primitive_array_roundtrip(day_time, day_time_expected)?;

    Ok(())
}
1185+
1186+
#[test]
fn test_duration_roundtrip() -> Result<(), Box<dyn Error>> {
    // Durations of 1, 2 and 3 seconds are expected back as MonthDayNano intervals
    // with zero months, zero days and the same span in nanoseconds.
    let seconds: PrimitiveArray<DurationSecondType> = DurationSecondArray::from(vec![1, 2, 3]);
    let as_intervals: PrimitiveArray<IntervalMonthDayNanoType> = IntervalMonthDayNanoArray::from(
        (1..=3_i64)
            .map(|secs| IntervalMonthDayNanoType::make_value(0, 0, secs * 1_000_000_000))
            .collect::<Vec<_>>(),
    );

    check_rust_primitive_array_roundtrip(seconds, as_intervals)?;

    Ok(())
}
1198+
10911199
#[test]
10921200
fn test_timestamp_tz_insert() -> Result<(), Box<dyn Error>> {
10931201
// TODO: This test should be reworked once we support TIMESTAMP_TZ properly

0 commit comments

Comments
 (0)