Skip to content

Commit 28c5ce5

Browse files
authored
VarBinVector (#5115)
Implements the scaffolding for varbin vectors --------- Signed-off-by: Nicholas Gates <[email protected]>
1 parent 09949b6 commit 28c5ce5

File tree

13 files changed

+705
-15
lines changed

13 files changed

+705
-15
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-compute/src/mask/mod.rs

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ use std::ops::BitAnd;
88
use vortex_dtype::NativePType;
99
use vortex_mask::Mask;
1010
use vortex_vector::{
11-
BoolVector, NullVector, PrimitiveVector, StructVector, Vector, match_each_pvector,
12-
match_each_vector,
11+
BoolVector, NullVector, PVector, PrimitiveVector, StructVector, VarBinType, VarBinVector,
12+
Vector, match_each_pvector, match_each_vector,
1313
};
1414

1515
/// Trait for masking the validity of an array or vector.
@@ -37,7 +37,8 @@ impl MaskValidity for NullVector {
3737
impl MaskValidity for BoolVector {
3838
fn mask_validity(self, mask: &Mask) -> Self {
3939
let (bits, validity) = self.into_parts();
40-
Self::new(bits, validity.bitand(mask))
40+
// SAFETY: we are preserving the original bits buffer and only modifying the validity.
41+
unsafe { Self::new_unchecked(bits, validity.bitand(mask)) }
4142
}
4243
}
4344

@@ -47,16 +48,26 @@ impl MaskValidity for PrimitiveVector {
4748
}
4849
}
4950

50-
impl<T: NativePType> MaskValidity for vortex_vector::PVector<T> {
51+
impl<T: NativePType> MaskValidity for PVector<T> {
5152
fn mask_validity(self, mask: &Mask) -> Self {
5253
let (data, validity) = self.into_parts();
53-
Self::new(data, validity.bitand(mask))
54+
// SAFETY: we are preserving the original data buffer and only modifying the validity.
55+
unsafe { Self::new_unchecked(data, validity.bitand(mask)) }
56+
}
57+
}
58+
59+
impl<T: VarBinType> MaskValidity for VarBinVector<T> {
60+
fn mask_validity(self, mask: &Mask) -> Self {
61+
let (views, buffers, validity) = self.into_parts();
62+
// SAFETY: we are preserving the original views and buffers, only modifying the validity.
63+
unsafe { Self::new_unchecked(views, buffers, validity.bitand(mask)) }
5464
}
5565
}
5666

5767
impl MaskValidity for StructVector {
5868
fn mask_validity(self, mask: &Mask) -> Self {
5969
let (fields, validity) = self.into_parts();
60-
StructVector::new(fields, validity.bitand(mask))
70+
// SAFETY: we are preserving the original fields and only modifying the validity.
71+
unsafe { StructVector::new_unchecked(fields, validity.bitand(mask)) }
6172
}
6273
}

vortex-vector/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,5 @@ vortex-buffer = { workspace = true }
2424
vortex-dtype = { workspace = true }
2525
vortex-error = { workspace = true }
2626
vortex-mask = { workspace = true }
27+
28+
static_assertions = { workspace = true }

vortex-vector/src/lib.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
//! Immutable and mutable decompressed (canonical) vectors for Vortex.
77
88
#![deny(missing_docs)]
9-
#![deny(clippy::missing_docs_in_private_items)]
109
#![deny(clippy::missing_errors_doc)]
1110
#![deny(clippy::missing_panics_doc)]
1211
#![deny(clippy::missing_safety_doc)]
@@ -15,11 +14,13 @@ mod bool;
1514
mod null;
1615
mod primitive;
1716
mod struct_;
17+
mod varbin;
1818

19-
pub use bool::{BoolVector, BoolVectorMut};
20-
pub use null::{NullVector, NullVectorMut};
21-
pub use primitive::{PVector, PVectorMut, PrimitiveVector, PrimitiveVectorMut};
22-
pub use struct_::{StructVector, StructVectorMut};
19+
pub use bool::*;
20+
pub use null::*;
21+
pub use primitive::*;
22+
pub use struct_::*;
23+
pub use varbin::*;
2324

2425
mod ops;
2526
mod vector;

vortex-vector/src/macros.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,14 @@ macro_rules! match_each_vector {
5151
let $vec = v;
5252
$body
5353
}
54+
$crate::Vector::String(v) => {
55+
let $vec = v;
56+
$body
57+
}
58+
$crate::Vector::Binary(v) => {
59+
let $vec = v;
60+
$body
61+
}
5462
$crate::Vector::Struct(v) => {
5563
let $vec = v;
5664
$body
@@ -108,6 +116,14 @@ macro_rules! match_each_vector_mut {
108116
let $vec = v;
109117
$body
110118
}
119+
$crate::VectorMut::String(v) => {
120+
let $vec = v;
121+
$body
122+
}
123+
$crate::VectorMut::Binary(v) => {
124+
let $vec = v;
125+
$body
126+
}
111127
$crate::VectorMut::Struct(v) => {
112128
let $vec = v;
113129
$body

vortex-vector/src/private.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,8 @@ impl Sealed for PrimitiveVectorMut {}
2929
impl<T: NativePType> Sealed for PVector<T> {}
3030
impl<T: NativePType> Sealed for PVectorMut<T> {}
3131

32+
impl<T: VarBinType> Sealed for VarBinVector<T> {}
33+
impl<T: VarBinType> Sealed for VarBinVectorMut<T> {}
34+
3235
impl Sealed for StructVector {}
3336
impl Sealed for StructVectorMut {}

vortex-vector/src/varbin/mod.rs

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
pub use types::*;
5+
pub use vector::*;
6+
pub use vector_mut::*;
7+
use vortex_error::vortex_panic;
8+
9+
use crate::{Vector, VectorMut};
10+
11+
mod types;
12+
mod vector;
13+
mod vector_mut;
14+
mod view;
15+
16+
/// Type alias for non-UTF-8 variable-length binary vectors.
17+
pub type BinaryVector = VarBinVector<BinaryType>;
18+
/// Type alias for mutable non-UTF-8 variable-length binary vectors.
19+
pub type BinaryVectorMut = VarBinVectorMut<BinaryType>;
20+
/// Type alias for UTF-8 variable-length string vectors.
21+
pub type StringVector = VarBinVector<StringType>;
22+
/// Type alias for mutable UTF-8 variable-length string vectors.
23+
pub type StringVectorMut = VarBinVectorMut<StringType>;
24+
25+
impl VarBinTypeDowncast for Vector {
26+
type Output<T: VarBinType> = VarBinVector<T>;
27+
28+
fn into_binary(self) -> Self::Output<BinaryType> {
29+
if let Vector::Binary(v) = self {
30+
return v;
31+
}
32+
vortex_panic!("Expected BinaryVector, got {self:?}");
33+
}
34+
35+
fn into_string(self) -> Self::Output<StringType> {
36+
if let Vector::String(v) = self {
37+
return v;
38+
}
39+
vortex_panic!("Expected StringVector, got {self:?}");
40+
}
41+
}
42+
43+
impl VarBinTypeUpcast for Vector {
44+
type Input<T: VarBinType> = VarBinVector<T>;
45+
46+
fn from_binary(input: Self::Input<BinaryType>) -> Self {
47+
Vector::Binary(input)
48+
}
49+
50+
fn from_string(input: Self::Input<StringType>) -> Self {
51+
Vector::String(input)
52+
}
53+
}
54+
55+
impl VarBinTypeDowncast for VectorMut {
56+
type Output<T: VarBinType> = VarBinVectorMut<T>;
57+
58+
fn into_binary(self) -> Self::Output<BinaryType> {
59+
if let VectorMut::Binary(v) = self {
60+
return v;
61+
}
62+
vortex_panic!("Expected BinaryVector, got {self:?}");
63+
}
64+
65+
fn into_string(self) -> Self::Output<StringType> {
66+
if let VectorMut::String(v) = self {
67+
return v;
68+
}
69+
vortex_panic!("Expected StringVector, got {self:?}");
70+
}
71+
}
72+
73+
impl VarBinTypeUpcast for VectorMut {
74+
type Input<T: VarBinType> = VarBinVectorMut<T>;
75+
76+
fn from_binary(input: Self::Input<BinaryType>) -> Self {
77+
VectorMut::Binary(input)
78+
}
79+
80+
fn from_string(input: Self::Input<StringType>) -> Self {
81+
VectorMut::String(input)
82+
}
83+
}

vortex-vector/src/varbin/types.rs

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
//! Variable-length binary types and related traits.
5+
6+
use std::fmt::Debug;
7+
8+
use crate::{VarBinVector, VarBinVectorMut, Vector, VectorMut};
9+
10+
impl<T: VarBinType> From<VarBinVector<T>> for Vector {
11+
fn from(value: VarBinVector<T>) -> Self {
12+
T::upcast(value)
13+
}
14+
}
15+
16+
impl<T: VarBinType> From<VarBinVectorMut<T>> for VectorMut {
17+
fn from(value: VarBinVectorMut<T>) -> Self {
18+
T::upcast(value)
19+
}
20+
}
21+
22+
/// Trait to mark supported binary view types.
23+
pub trait VarBinType: Debug + Sized + private::Sealed {
24+
/// The slice type for this variable binary type.
25+
type Slice: ?Sized + AsRef<[u8]>;
26+
27+
/// Downcast the provided object to a type-specific instance.
28+
fn downcast<V: VarBinTypeDowncast>(visitor: V) -> V::Output<Self>;
29+
30+
/// Upcast a type-specific instance to a generic instance.
31+
fn upcast<V: VarBinTypeUpcast>(input: V::Input<Self>) -> V;
32+
}
33+
34+
/// [`VarBinType`] marker for UTF-8 strings.
35+
#[derive(Clone, Debug)]
36+
pub struct StringType;
37+
impl VarBinType for StringType {
38+
type Slice = str;
39+
40+
fn downcast<V: VarBinTypeDowncast>(visitor: V) -> V::Output<Self> {
41+
visitor.into_string()
42+
}
43+
44+
fn upcast<V: VarBinTypeUpcast>(input: V::Input<Self>) -> V {
45+
V::from_string(input)
46+
}
47+
}
48+
49+
/// [`VarBinType`] marker for raw binary data.
50+
#[derive(Clone, Debug)]
51+
pub struct BinaryType;
52+
impl VarBinType for BinaryType {
53+
type Slice = [u8];
54+
55+
fn downcast<V: VarBinTypeDowncast>(visitor: V) -> V::Output<Self> {
56+
visitor.into_binary()
57+
}
58+
59+
fn upcast<V: VarBinTypeUpcast>(input: V::Input<Self>) -> V {
60+
V::from_binary(input)
61+
}
62+
}
63+
64+
/// Trait for downcasting generic variable binary types to specific types.
65+
pub trait VarBinTypeDowncast {
66+
/// The output type after downcasting.
67+
type Output<T: VarBinType>;
68+
69+
/// Downcast to a binary type.
70+
fn into_binary(self) -> Self::Output<BinaryType>;
71+
/// Downcast to a string type.
72+
fn into_string(self) -> Self::Output<StringType>;
73+
}
74+
75+
/// Trait for upcasting specific variable binary types to generic types.
76+
pub trait VarBinTypeUpcast {
77+
/// The input type for upcasting.
78+
type Input<T: VarBinType>;
79+
80+
/// Upcast from a binary type.
81+
fn from_binary(input: Self::Input<BinaryType>) -> Self;
82+
/// Upcast from a string type.
83+
fn from_string(input: Self::Input<StringType>) -> Self;
84+
}
85+
86+
/// Private module to seal the [`VarBinType`] trait.
87+
mod private {
88+
/// Sealed trait to prevent external implementations of [`VarBinType`].
89+
pub trait Sealed {}
90+
91+
impl Sealed for super::StringType {}
92+
impl Sealed for super::BinaryType {}
93+
}

vortex-vector/src/varbin/vector.rs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
//! Variable-length binary vector implementation.
5+
6+
use std::sync::Arc;
7+
8+
use vortex_buffer::{Buffer, ByteBuffer};
9+
use vortex_mask::Mask;
10+
11+
use crate::VectorOps;
12+
use crate::varbin::VarBinType;
13+
use crate::varbin::vector_mut::VarBinVectorMut;
14+
use crate::varbin::view::BinaryView;
15+
16+
/// A variable-length binary vector.
17+
#[derive(Debug, Clone)]
18+
pub struct VarBinVector<T: VarBinType> {
19+
/// Views into the binary data.
20+
views: Buffer<BinaryView>,
21+
/// Buffers holding the referenced binary data.
22+
buffers: Arc<Box<[ByteBuffer]>>,
23+
/// Validity mask for the vector.
24+
validity: Mask,
25+
/// Zero-sized marker tying this vector to its [`VarBinType`] parameter.
26+
_marker: std::marker::PhantomData<T>,
27+
}
28+
29+
impl<T: VarBinType> VarBinVector<T> {
30+
/// Creates a new [`VarBinVector`] from the provided components.
31+
///
32+
/// # Safety
33+
///
34+
/// This function is unsafe because it does not validate the consistency of the provided
35+
/// components.
36+
///
37+
/// The caller must ensure that:
38+
/// - The length of the `validity` mask matches the length of the `views` buffer.
39+
/// - The `views` buffer correctly references the data in the `buffers`.
40+
pub unsafe fn new_unchecked(
41+
views: Buffer<BinaryView>,
42+
buffers: Arc<Box<[ByteBuffer]>>,
43+
validity: Mask,
44+
) -> Self {
45+
Self {
46+
views,
47+
buffers,
48+
validity,
49+
_marker: std::marker::PhantomData,
50+
}
51+
}
52+
53+
/// Decomposes the vector into its constituent parts.
54+
pub fn into_parts(self) -> (Buffer<BinaryView>, Arc<Box<[ByteBuffer]>>, Mask) {
55+
(self.views, self.buffers, self.validity)
56+
}
57+
}
58+
59+
impl<T: VarBinType> VectorOps for VarBinVector<T> {
60+
type Mutable = VarBinVectorMut<T>;
61+
62+
fn len(&self) -> usize {
63+
self.views.len()
64+
}
65+
66+
fn validity(&self) -> &Mask {
67+
&self.validity
68+
}
69+
70+
fn try_into_mut(self) -> Result<Self::Mutable, Self>
71+
where
72+
Self: Sized,
73+
{
74+
todo!()
75+
}
76+
}

0 commit comments

Comments
 (0)