Skip to content

Commit bbc85d7

Browse files
authored
feat: support "large" arrow data types (#307)
* feat: add large arrow type support
* remove old match entry
1 parent a1aa55a commit bbc85d7

File tree

4 files changed

+65
-36
lines changed

4 files changed

+65
-36
lines changed

src/row.rs

Lines changed: 8 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
use std::{convert, sync::Arc};
22

33
use super::{Error, Result, Statement};
4-
use crate::types::{self, EnumType, FromSql, FromSqlError, ValueRef};
4+
use crate::types::{self, EnumType, FromSql, FromSqlError, ListType, ValueRef};
55

66
use arrow::array::DictionaryArray;
77
use arrow::{
@@ -570,22 +570,6 @@ impl<'stmt> Row<'stmt> {
570570
_ => unimplemented!("{:?}", unit),
571571
},
572572
// TODO: support more data types
573-
// DataType::List(_) => make_string_from_list!(column, row),
574-
// DataType::Dictionary(index_type, _value_type) => match **index_type {
575-
// DataType::Int8 => dict_array_value_to_string::<Int8Type>(column, row),
576-
// DataType::Int16 => dict_array_value_to_string::<Int16Type>(column, row),
577-
// DataType::Int32 => dict_array_value_to_string::<Int32Type>(column, row),
578-
// DataType::Int64 => dict_array_value_to_string::<Int64Type>(column, row),
579-
// DataType::UInt8 => dict_array_value_to_string::<UInt8Type>(column, row),
580-
// DataType::UInt16 => dict_array_value_to_string::<UInt16Type>(column, row),
581-
// DataType::UInt32 => dict_array_value_to_string::<UInt32Type>(column, row),
582-
// DataType::UInt64 => dict_array_value_to_string::<UInt64Type>(column, row),
583-
// _ => Err(ArrowError::InvalidArgumentError(format!(
584-
// "Pretty printing not supported for {:?} due to index type",
585-
// column.data_type()
586-
// ))),
587-
// },
588-
589573
// NOTE: DataTypes not supported by duckdb
590574
// DataType::Date64 => make_string_date!(array::Date64Array, column, row),
591575
// DataType::Time32(unit) if *unit == TimeUnit::Second => {
@@ -597,10 +581,15 @@ impl<'stmt> Row<'stmt> {
597581
// DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => {
598582
// make_string_time!(array::Time64NanosecondArray, column, row)
599583
// }
600-
DataType::List(_data) => {
584+
DataType::LargeList(..) => {
585+
let arr = column.as_any().downcast_ref::<array::LargeListArray>().unwrap();
586+
587+
ValueRef::List(ListType::Large(arr), row)
588+
}
589+
DataType::List(..) => {
601590
let arr = column.as_any().downcast_ref::<ListArray>().unwrap();
602591

603-
ValueRef::List(arr, row)
592+
ValueRef::List(ListType::Regular(arr), row)
604593
}
605594
DataType::Dictionary(key_type, ..) => {
606595
let column = column.as_any();

src/test_all_types.rs

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,18 @@ use crate::{
88

99
#[test]
1010
fn test_all_types() -> crate::Result<()> {
11-
let database = Connection::open_in_memory()?;
11+
test_with_database(&Connection::open_in_memory()?)
12+
}
13+
14+
#[test]
15+
fn test_large_arrow_types() -> crate::Result<()> {
16+
let cfg = crate::Config::default().with("arrow_large_buffer_size", "true")?;
17+
let database = Connection::open_in_memory_with_flags(cfg)?;
18+
19+
test_with_database(&database)
20+
}
1221

22+
fn test_with_database(database: &Connection) -> crate::Result<()> {
1323
let excluded = vec![
1424
// uhugeint, time_tz, and dec38_10 aren't supported in the duckdb arrow layer
1525
"uhugeint",

src/types/mod.rs

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,7 @@ pub use self::{
7171
from_sql::{FromSql, FromSqlError, FromSqlResult},
7272
to_sql::{ToSql, ToSqlOutput},
7373
value::Value,
74-
value_ref::{EnumType, TimeUnit, ValueRef},
74+
value_ref::{EnumType, ListType, TimeUnit, ValueRef},
7575
};
7676

7777
use arrow::datatypes::DataType;
@@ -181,14 +181,12 @@ impl From<&DataType> for Type {
181181
DataType::Binary => Self::Blob,
182182
// DataType::FixedSizeBinary(_) => Self::FixedSizeBinary,
183183
// DataType::LargeBinary => Self::LargeBinary,
184-
DataType::Utf8 => Self::Text,
185-
// DataType::LargeUtf8 => Self::LargeUtf8,
184+
DataType::LargeUtf8 | DataType::Utf8 => Self::Text,
186185
DataType::List(inner) => Self::List(Box::new(Type::from(inner.data_type()))),
187186
// DataType::FixedSizeList(field, size) => Self::Array,
188-
// DataType::LargeList(_) => Self::LargeList,
187+
DataType::LargeList(inner) => Self::List(Box::new(Type::from(inner.data_type()))),
189188
// DataType::Struct(inner) => Self::Struct,
190189
// DataType::Union(_, _) => Self::Union,
191-
// DataType::Dictionary(_, _) => Self::Enum,
192190
DataType::Decimal128(..) => Self::Decimal,
193191
DataType::Decimal256(..) => Self::Decimal,
194192
// DataType::Map(field, ..) => Self::Map,

src/types/value_ref.rs

Lines changed: 43 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ use crate::types::{FromSqlError, FromSqlResult};
44
use crate::Row;
55
use rust_decimal::prelude::*;
66

7-
use arrow::array::{Array, DictionaryArray, ListArray};
7+
use arrow::array::{Array, ArrayRef, DictionaryArray, LargeListArray, ListArray};
88
use arrow::datatypes::{UInt16Type, UInt32Type, UInt8Type};
99

1010
/// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds.
@@ -75,11 +75,20 @@ pub enum ValueRef<'a> {
7575
nanos: i64,
7676
},
7777
/// The value is a list
78-
List(&'a ListArray, usize),
78+
List(ListType<'a>, usize),
7979
/// The value is an enum
8080
Enum(EnumType<'a>, usize),
8181
}
8282

83+
/// Wrapper type for different list sizes
84+
#[derive(Debug, Copy, Clone, PartialEq)]
85+
pub enum ListType<'a> {
86+
/// The underlying list is a `ListArray`
87+
Regular(&'a ListArray),
88+
/// The underlying list is a `LargeListArray`
89+
Large(&'a LargeListArray),
90+
}
91+
8392
/// Wrapper type for different enum sizes
8493
#[derive(Debug, Copy, Clone, PartialEq)]
8594
pub enum EnumType<'a> {
@@ -116,7 +125,10 @@ impl ValueRef<'_> {
116125
ValueRef::Date32(_) => Type::Date32,
117126
ValueRef::Time64(..) => Type::Time64,
118127
ValueRef::Interval { .. } => Type::Interval,
119-
ValueRef::List(arr, _) => arr.data_type().into(),
128+
ValueRef::List(arr, _) => match arr {
129+
ListType::Large(arr) => arr.data_type().into(),
130+
ListType::Regular(arr) => arr.data_type().into(),
131+
},
120132
ValueRef::Enum(..) => Type::Enum,
121133
}
122134
}
@@ -177,14 +189,26 @@ impl From<ValueRef<'_>> for Value {
177189
ValueRef::Date32(d) => Value::Date32(d),
178190
ValueRef::Time64(t, d) => Value::Time64(t, d),
179191
ValueRef::Interval { months, days, nanos } => Value::Interval { months, days, nanos },
180-
ValueRef::List(items, idx) => {
181-
let offsets = items.offsets();
182-
let range = offsets[idx]..offsets[idx + 1];
183-
let map: Vec<Value> = range
184-
.map(|row| Row::value_ref_internal(row.try_into().unwrap(), idx, items.values()).to_owned())
185-
.collect();
186-
Value::List(map)
187-
}
192+
ValueRef::List(items, idx) => match items {
193+
ListType::Regular(items) => {
194+
let offsets = items.offsets();
195+
from_list(
196+
offsets[idx].try_into().unwrap(),
197+
offsets[idx + 1].try_into().unwrap(),
198+
idx,
199+
items.values(),
200+
)
201+
}
202+
ListType::Large(items) => {
203+
let offsets = items.offsets();
204+
from_list(
205+
offsets[idx].try_into().unwrap(),
206+
offsets[idx + 1].try_into().unwrap(),
207+
idx,
208+
items.values(),
209+
)
210+
}
211+
},
188212
ValueRef::Enum(items, idx) => {
189213
let value = Row::value_ref_internal(
190214
idx,
@@ -207,6 +231,14 @@ impl From<ValueRef<'_>> for Value {
207231
}
208232
}
209233

234+
fn from_list(start: usize, end: usize, idx: usize, values: &ArrayRef) -> Value {
235+
Value::List(
236+
(start..end)
237+
.map(|row| Row::value_ref_internal(row, idx, values).to_owned())
238+
.collect(),
239+
)
240+
}
241+
210242
impl<'a> From<&'a str> for ValueRef<'a> {
211243
#[inline]
212244
fn from(s: &str) -> ValueRef<'_> {

0 commit comments

Comments
 (0)