Skip to content

Commit af562ba

Browse files
authored
feat(query): Support Vector data type (#18044)
* feat(query): Support Vector data type * native format support vector * fix * remove impl ArgType
1 parent 1ed0ae8 commit af562ba

File tree

55 files changed

+1811
-45
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+1811
-45
lines changed

src/common/native/src/nested.rs

Lines changed: 90 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ use databend_common_expression::types::AnyType;
1818
use databend_common_expression::types::ArrayColumn;
1919
use databend_common_expression::types::Bitmap;
2020
use databend_common_expression::types::Buffer;
21+
use databend_common_expression::types::NumberColumn;
22+
use databend_common_expression::types::VectorColumn;
2123
use databend_common_expression::Column;
2224
use databend_common_expression::TableDataType;
2325

@@ -30,6 +32,8 @@ pub enum Nested {
3032
Primitive(usize, bool, Option<Bitmap>),
3133
/// a list
3234
LargeList(ListNested),
35+
/// a fixed list
36+
FixedList(FixedListNested),
3337
/// A struct column
3438
Struct(usize, bool, Option<Bitmap>),
3539
}
@@ -51,13 +55,38 @@ impl ListNested {
5155
}
5256
}
5357

58+
#[derive(Debug, Clone, PartialEq)]
59+
pub struct FixedListNested {
60+
pub is_nullable: bool,
61+
pub dimension: usize,
62+
pub length: usize,
63+
pub validity: Option<Bitmap>,
64+
}
65+
66+
impl FixedListNested {
67+
pub fn new(
68+
dimension: usize,
69+
length: usize,
70+
validity: Option<Bitmap>,
71+
is_nullable: bool,
72+
) -> Self {
73+
Self {
74+
is_nullable,
75+
dimension,
76+
length,
77+
validity,
78+
}
79+
}
80+
}
81+
5482
pub type NestedState = Vec<Nested>;
5583

5684
impl Nested {
5785
pub fn length(&self) -> usize {
5886
match self {
5987
Nested::Primitive(len, _, _) => *len,
6088
Nested::LargeList(l) => l.offsets.len(),
89+
Nested::FixedList(l) => l.length,
6190
Nested::Struct(len, _, _) => *len,
6291
}
6392
}
@@ -66,37 +95,34 @@ impl Nested {
6695
match self {
6796
Nested::Primitive(_, b, _) => *b,
6897
Nested::LargeList(l) => l.is_nullable,
98+
Nested::FixedList(l) => l.is_nullable,
6999
Nested::Struct(_, b, _) => *b,
70100
}
71101
}
72102

73-
pub fn inner(&self) -> (Buffer<u64>, &Option<Bitmap>) {
103+
pub fn offsets(&self) -> Option<Buffer<u64>> {
74104
match self {
75-
Nested::Primitive(_, _, v) => (Buffer::new(), v),
76105
Nested::LargeList(l) => {
77106
let start = *l.offsets.first().unwrap();
78107
let buffer = if start == 0 {
79108
l.offsets.clone()
80109
} else {
81110
l.offsets.iter().map(|x| *x - start).collect()
82111
};
83-
(buffer, &l.validity)
112+
Some(buffer)
84113
}
85-
Nested::Struct(_, _, v) => (Buffer::new(), v),
114+
_ => None,
86115
}
87116
}
88117

89118
pub fn validity(&self) -> &Option<Bitmap> {
90119
match self {
91120
Nested::Primitive(_, _, v) => v,
92121
Nested::LargeList(l) => &l.validity,
122+
Nested::FixedList(l) => &l.validity,
93123
Nested::Struct(_, _, v) => v,
94124
}
95125
}
96-
97-
pub fn is_list(&self) -> bool {
98-
matches!(self, Nested::LargeList(_))
99-
}
100126
}
101127

102128
/// Constructs the necessary `Vec<Vec<Nested>>` to write the rep and def levels of `column` to parquet
@@ -110,7 +136,10 @@ pub fn to_nested(column: &Column) -> Result<Vec<Vec<Nested>>> {
110136
pub fn is_nested_type(t: &TableDataType) -> bool {
111137
matches!(
112138
t,
113-
TableDataType::Tuple { .. } | TableDataType::Array(_) | TableDataType::Map(_)
139+
TableDataType::Tuple { .. }
140+
| TableDataType::Array(_)
141+
| TableDataType::Map(_)
142+
| TableDataType::Vector(_)
114143
)
115144
}
116145

@@ -133,6 +162,13 @@ pub fn slice_nest_column(
133162
current_length = r as usize;
134163
current_offset = *l_nested.offsets.first().unwrap() as usize;
135164
}
165+
Nested::FixedList(l_nested) => {
166+
if let Some(validity) = l_nested.validity.as_mut() {
167+
validity.slice(current_offset, current_length)
168+
};
169+
current_offset *= l_nested.dimension;
170+
current_length *= l_nested.dimension;
171+
}
136172
Nested::Struct(length, _, validity) => {
137173
*length = current_length;
138174
if let Some(validity) = validity.as_mut() {
@@ -176,6 +212,15 @@ fn to_nested_recursive(
176212
}));
177213
to_nested_recursive(&inner.underlying_column(), nested, parents)?;
178214
}
215+
Column::Vector(inner) => {
216+
parents.push(Nested::FixedList(FixedListNested {
217+
is_nullable: nullable,
218+
dimension: inner.dimension(),
219+
length: inner.len(),
220+
validity,
221+
}));
222+
to_nested_recursive(&inner.underlying_column(), nested, parents)?;
223+
}
179224
Column::Map(inner) => {
180225
parents.push(Nested::LargeList(ListNested {
181226
is_nullable: nullable,
@@ -208,6 +253,9 @@ fn to_leaves_recursive(column: &Column, leaves: &mut Vec<Column>) {
208253
Column::Array(col) => {
209254
to_leaves_recursive(&col.underlying_column(), leaves);
210255
}
256+
Column::Vector(col) => {
257+
to_leaves_recursive(&col.underlying_column(), leaves);
258+
}
211259
Column::Map(col) => {
212260
to_leaves_recursive(&col.underlying_column(), leaves);
213261
}
@@ -218,14 +266,15 @@ fn to_leaves_recursive(column: &Column, leaves: &mut Vec<Column>) {
218266
}
219267
}
220268

221-
/// The initial info of nested data types.
222269
/// The initial info of nested data types.
223270
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
224271
pub enum InitNested {
225272
/// Primitive data types
226273
Primitive(bool),
227274
/// List data types
228275
List(bool),
276+
/// Fixed List data types
277+
FixedList(bool),
229278
/// Struct data types
230279
Struct(bool),
231280
}
@@ -235,14 +284,16 @@ impl InitNested {
235284
match self {
236285
InitNested::Primitive(b) => *b,
237286
InitNested::List(b) => *b,
287+
InitNested::FixedList(b) => *b,
238288
InitNested::Struct(b) => *b,
239289
}
240290
}
241291
}
242292

243293
pub fn create_list(data_type: TableDataType, nested: &mut NestedState, values: Column) -> Column {
244294
let n = nested.pop().unwrap();
245-
let (offsets, validity) = n.inner();
295+
let offsets = n.offsets().unwrap();
296+
let validity = n.validity();
246297
let col = Column::Array(Box::new(ArrayColumn::<AnyType>::new(values, offsets)));
247298

248299
if data_type.is_nullable() {
@@ -252,10 +303,36 @@ pub fn create_list(data_type: TableDataType, nested: &mut NestedState, values: C
252303
}
253304
}
254305

306+
pub fn create_fixed_list(
307+
data_type: TableDataType,
308+
dimension: usize,
309+
nested: &mut NestedState,
310+
values: Column,
311+
) -> Column {
312+
let n = nested.pop().unwrap();
313+
let validity = n.validity();
314+
let col = match values {
315+
Column::Number(NumberColumn::Int8(vals)) => {
316+
Column::Vector(VectorColumn::Int8((vals.clone(), dimension)))
317+
}
318+
Column::Number(NumberColumn::Float32(vals)) => {
319+
Column::Vector(VectorColumn::Float32((vals.clone(), dimension)))
320+
}
321+
_ => unreachable!(),
322+
};
323+
324+
if data_type.is_nullable() {
325+
col.wrap_nullable(validity.clone())
326+
} else {
327+
col
328+
}
329+
}
330+
255331
/// Creates a new map column ([`Column::Map`]) from `values` and the offsets of the popped nested state.
256332
pub fn create_map(data_type: TableDataType, nested: &mut NestedState, values: Column) -> Column {
257333
let n = nested.pop().unwrap();
258-
let (offsets, validity) = n.inner();
334+
let offsets = n.offsets().unwrap();
335+
let validity = n.validity();
259336
let col = Column::Map(Box::new(ArrayColumn::<AnyType>::new(values, offsets)));
260337
if data_type.is_nullable() {
261338
col.wrap_nullable(validity.clone())
@@ -271,7 +348,7 @@ pub fn create_struct(
271348
) -> (NestedState, Column) {
272349
let mut nest = nested.pop().unwrap();
273350
let n = nest.pop().unwrap();
274-
let (_, validity) = n.inner();
351+
let validity = n.validity();
275352

276353
let col = Column::Tuple(values);
277354
if is_nullable {
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// Copyright 2021 Datafuse Labs
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
use databend_common_expression::Column;
16+
use databend_common_expression::TableDataType;
17+
18+
use crate::error::Result;
19+
use crate::nested::create_fixed_list;
20+
use crate::nested::NestedState;
21+
use crate::read::deserialize::DynIter;
22+
23+
/// An iterator adapter over [`DynIter`] assumed to be encoded as List columns
24+
pub struct FixedListIterator<'a> {
25+
iter: DynIter<'a, Result<(NestedState, Column)>>,
26+
data_type: TableDataType,
27+
dimension: usize,
28+
}
29+
30+
impl<'a> FixedListIterator<'a> {
31+
/// Creates a new [`FixedListIterator`] with `iter`, `data_type` and `dimension`.
32+
pub fn new(
33+
iter: DynIter<'a, Result<(NestedState, Column)>>,
34+
data_type: TableDataType,
35+
dimension: usize,
36+
) -> Self {
37+
Self {
38+
iter,
39+
data_type,
40+
dimension,
41+
}
42+
}
43+
}
44+
45+
impl FixedListIterator<'_> {
46+
fn deserialize(
47+
&mut self,
48+
value: Option<Result<(NestedState, Column)>>,
49+
) -> Option<Result<(NestedState, Column)>> {
50+
let (mut nested, values) = match value {
51+
Some(Ok((nested, values))) => (nested, values),
52+
Some(Err(err)) => return Some(Err(err)),
53+
None => return None,
54+
};
55+
let array = create_fixed_list(self.data_type.clone(), self.dimension, &mut nested, values);
56+
Some(Ok((nested, array)))
57+
}
58+
}
59+
60+
impl Iterator for FixedListIterator<'_> {
61+
type Item = Result<(NestedState, Column)>;
62+
63+
fn nth(&mut self, n: usize) -> Option<Self::Item> {
64+
let value = self.iter.nth(n);
65+
self.deserialize(value)
66+
}
67+
68+
fn next(&mut self) -> Option<Self::Item> {
69+
let value = self.iter.next();
70+
self.deserialize(value)
71+
}
72+
}

src/common/native/src/read/array/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ mod struct_;
3131
pub use struct_::*;
3232
mod list;
3333
pub use list::*;
34+
mod fixed_list;
35+
pub use fixed_list::*;
3436
mod interval;
3537
mod map;
3638
pub use interval::*;

src/common/native/src/read/batch_read.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ use databend_common_expression::TableDataType;
2222
use super::array::*;
2323
use super::NativeReadBuf;
2424
use crate::error::Result;
25+
use crate::nested::create_fixed_list;
2526
use crate::nested::create_list;
2627
use crate::nested::create_map;
2728
use crate::nested::create_struct;
@@ -153,6 +154,18 @@ pub fn read_nested_column<R: NativeReadBuf>(
153154
}
154155
columns
155156
}
157+
Vector(vector_ty) => {
158+
init.push(InitNested::FixedList(is_nullable));
159+
let dimension = vector_ty.dimension() as usize;
160+
let inner_ty = vector_ty.inner_data_type();
161+
let results = read_nested_column(readers, inner_ty, init, page_metas)?;
162+
let mut columns = Vec::with_capacity(results.len());
163+
for (mut nested, values) in results {
164+
let array = create_fixed_list(data_type.clone(), dimension, &mut nested, values);
165+
columns.push((nested, array));
166+
}
167+
columns
168+
}
156169
Map(inner) => {
157170
init.push(InitNested::List(is_nullable));
158171
let results = read_nested_column(readers, inner.as_ref().clone(), init, page_metas)?;

src/common/native/src/read/deserialize.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,13 @@ where
199199
let iter = deserialize_nested(readers, inner.as_ref().clone(), init)?;
200200
DynIter::new(ListIterator::new(iter, data_type.clone()))
201201
}
202+
TableDataType::Vector(vector_ty) => {
203+
init.push(InitNested::FixedList(is_nullable));
204+
let dimension = vector_ty.dimension() as usize;
205+
let inner_ty = vector_ty.inner_data_type();
206+
let iter = deserialize_nested(readers, inner_ty, init)?;
207+
DynIter::new(FixedListIterator::new(iter, data_type.clone(), dimension))
208+
}
202209
TableDataType::Map(inner) => {
203210
init.push(InitNested::List(is_nullable));
204211
let iter = deserialize_nested(readers, inner.as_ref().clone(), init)?;

src/common/native/src/read/read_basic.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ use databend_common_expression::types::Bitmap;
2020
use super::NativeReadBuf;
2121
use crate::compression::Compression;
2222
use crate::error::Result;
23+
use crate::nested::FixedListNested;
2324
use crate::nested::InitNested;
2425
use crate::nested::ListNested;
2526
use crate::nested::Nested;
@@ -80,6 +81,17 @@ pub fn read_nested<R: NativeReadBuf>(
8081
n.is_nullable(),
8182
)))
8283
}
84+
InitNested::FixedList(_) => {
85+
let mut buf = vec![0u8; 4];
86+
let length = read_u32(reader, &mut buf)?;
87+
let dimension = read_u32(reader, &mut buf)?;
88+
results.push(Nested::FixedList(FixedListNested::new(
89+
dimension as usize,
90+
length as usize,
91+
bitmap,
92+
n.is_nullable(),
93+
)))
94+
}
8395
InitNested::Struct(_) => {
8496
results.push(Nested::Struct(leaf_length, n.is_nullable(), bitmap))
8597
}

0 commit comments

Comments
 (0)