Skip to content

Commit dd22351

Browse files
authored
feat: rest of BinaryViewVector{,Mut} (#5133)
Tracking Issue: #5028 --------- Signed-off-by: Andrew Duffy <[email protected]>
1 parent 6277036 commit dd22351

File tree

16 files changed

+848
-271
lines changed

16 files changed

+848
-271
lines changed

Cargo.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-buffer/src/buffer.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,12 @@ impl<T> Buffer<T> {
226226
unsafe { std::slice::from_raw_parts(self.bytes.as_ptr().cast(), self.length) }
227227
}
228228

229+
/// Return a view over the buffer as an opaque byte slice.
230+
#[inline(always)]
231+
pub fn as_bytes(&self) -> &[u8] {
232+
self.bytes.as_ref()
233+
}
234+
229235
/// Returns an iterator over the buffer of elements of type T.
230236
pub fn iter(&self) -> Iter<'_, T> {
231237
Iter {
@@ -319,7 +325,7 @@ impl<T> Buffer<T> {
319325

320326
/// Returns a slice of self that is equivalent to the given subset.
321327
///
322-
/// When processing the buffer you will often end up with &\[T\] that is a subset
328+
/// When processing the buffer you will often end up with `&[T]` that is a subset
323329
/// of the underlying buffer. This function turns the slice into a slice of the buffer
324330
/// it has been taken from.
325331
///
@@ -332,7 +338,7 @@ impl<T> Buffer<T> {
332338

333339
/// Returns a slice of self that is equivalent to the given subset.
334340
///
335-
/// When processing the buffer you will often end up with &\[T\] that is a subset
341+
/// When processing the buffer you will often end up with `&[T]` that is a subset
336342
/// of the underlying buffer. This function turns the slice into a slice of the buffer
337343
/// it has been taken from.
338344
///
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ use std::sync::Arc;
66
use arrow_array::{ArrayRef, GenericByteViewArray};
77
use vortex_buffer::Buffer;
88
use vortex_error::VortexResult;
9-
use vortex_vector::{BinaryType, StringType, VarBinVector};
9+
use vortex_vector::{BinaryType, BinaryViewVector, StringType};
1010

1111
use crate::arrow::IntoArrow;
1212

1313
macro_rules! impl_varbin {
1414
($T:ty, $A:ty) => {
15-
impl IntoArrow<ArrayRef> for VarBinVector<$T> {
15+
impl IntoArrow<ArrayRef> for BinaryViewVector<$T> {
1616
fn into_arrow(self) -> VortexResult<ArrayRef> {
1717
let (views, buffers, validity) = self.into_parts();
1818

vortex-compute/src/arrow/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55
66
use vortex_error::VortexResult;
77

8+
mod binaryview;
89
mod bool;
910
mod decimal;
1011
mod mask;
1112
mod null;
1213
mod primitive;
1314
mod struct_;
14-
mod varbin;
1515
mod vector;
1616

1717
/// Trait for converting Vortex vector types into Arrow types.

vortex-compute/src/mask/mod.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ use std::ops::BitAnd;
88
use vortex_dtype::{NativeDecimalType, NativePType};
99
use vortex_mask::Mask;
1010
use vortex_vector::{
11-
BoolVector, DVector, DecimalVector, NullVector, PVector, PrimitiveVector, StructVector,
12-
VarBinType, VarBinVector, Vector, match_each_dvector, match_each_pvector, match_each_vector,
11+
BinaryViewType, BinaryViewVector, BoolVector, DVector, DecimalVector, NullVector, PVector,
12+
PrimitiveVector, StructVector, Vector, match_each_dvector, match_each_pvector,
13+
match_each_vector,
1314
};
1415

1516
/// Trait for masking the validity of an array or vector.
@@ -70,7 +71,7 @@ impl<T: NativePType> MaskValidity for PVector<T> {
7071
}
7172
}
7273

73-
impl<T: VarBinType> MaskValidity for VarBinVector<T> {
74+
impl<T: BinaryViewType> MaskValidity for BinaryViewVector<T> {
7475
fn mask_validity(self, mask: &Mask) -> Self {
7576
let (views, buffers, validity) = self.into_parts();
7677
// SAFETY: we are preserving the original views and buffers, only modifying the validity.
Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4+
//! Definition and implementation of variable-length binary types.
5+
//!
6+
//! All types are specializations of the [`BinaryViewVector`] type, which is represented internally
7+
//! by `BinaryView`s. `BinaryView`s are identical to the `BinaryView` type defined by the Arrow
8+
//! [specification](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout),
9+
//! which are inspired by "German" strings.
10+
411
pub use types::*;
512
pub use vector::*;
613
pub use vector_mut::*;
@@ -14,16 +21,16 @@ mod vector_mut;
1421
mod view;
1522

1623
/// Type alias for non-utf8 variable-length binary vectors.
17-
pub type BinaryVector = VarBinVector<BinaryType>;
24+
pub type BinaryVector = BinaryViewVector<BinaryType>;
1825
/// Type alias for mutable non-utf8 variable-length binary vectors.
19-
pub type BinaryVectorMut = VarBinVectorMut<BinaryType>;
26+
pub type BinaryVectorMut = BinaryViewVectorMut<BinaryType>;
2027
/// Type alias for UTF-8 variable-length string vectors.
21-
pub type StringVector = VarBinVector<StringType>;
28+
pub type StringVector = BinaryViewVector<StringType>;
2229
/// Type alias for mutable UTF-8 variable-length string vectors.
23-
pub type StringVectorMut = VarBinVectorMut<StringType>;
30+
pub type StringVectorMut = BinaryViewVectorMut<StringType>;
2431

25-
impl VarBinTypeDowncast for Vector {
26-
type Output<T: VarBinType> = VarBinVector<T>;
32+
impl BinaryViewDowncast for Vector {
33+
type Output<T: BinaryViewType> = BinaryViewVector<T>;
2734

2835
fn into_binary(self) -> Self::Output<BinaryType> {
2936
if let Vector::Binary(v) = self {
@@ -40,8 +47,8 @@ impl VarBinTypeDowncast for Vector {
4047
}
4148
}
4249

43-
impl VarBinTypeUpcast for Vector {
44-
type Input<T: VarBinType> = VarBinVector<T>;
50+
impl BinaryViewTypeUpcast for Vector {
51+
type Input<T: BinaryViewType> = BinaryViewVector<T>;
4552

4653
fn from_binary(input: Self::Input<BinaryType>) -> Self {
4754
Vector::Binary(input)
@@ -52,8 +59,8 @@ impl VarBinTypeUpcast for Vector {
5259
}
5360
}
5461

55-
impl VarBinTypeDowncast for VectorMut {
56-
type Output<T: VarBinType> = VarBinVectorMut<T>;
62+
impl BinaryViewDowncast for VectorMut {
63+
type Output<T: BinaryViewType> = BinaryViewVectorMut<T>;
5764

5865
fn into_binary(self) -> Self::Output<BinaryType> {
5966
if let VectorMut::Binary(v) = self {
@@ -70,8 +77,8 @@ impl VarBinTypeDowncast for VectorMut {
7077
}
7178
}
7279

73-
impl VarBinTypeUpcast for VectorMut {
74-
type Input<T: VarBinType> = VarBinVectorMut<T>;
80+
impl BinaryViewTypeUpcast for VectorMut {
81+
type Input<T: BinaryViewType> = BinaryViewVectorMut<T>;
7582

7683
fn from_binary(input: Self::Input<BinaryType>) -> Self {
7784
VectorMut::Binary(input)
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
//! Variable-length binary types and related traits.
5+
6+
use std::fmt::Debug;
7+
8+
use crate::{BinaryViewVector, BinaryViewVectorMut, Vector, VectorMut};
9+
10+
impl<T: BinaryViewType> From<BinaryViewVector<T>> for Vector {
11+
fn from(value: BinaryViewVector<T>) -> Self {
12+
T::upcast(value)
13+
}
14+
}
15+
16+
impl<T: BinaryViewType> From<BinaryViewVectorMut<T>> for VectorMut {
17+
fn from(value: BinaryViewVectorMut<T>) -> Self {
18+
T::upcast(value)
19+
}
20+
}
21+
22+
/// Trait to mark supported binary view types.
23+
pub trait BinaryViewType: Debug + Sized + private::Sealed {
24+
/// The slice type for this variable binary type.
25+
type Slice: ?Sized + AsRef<[u8]>;
26+
27+
/// Validate if a set of bytes conforms to the logical type constraints of the native `Slice`.
28+
fn validate(bytes: &[u8]) -> bool;
29+
30+
/// Returns the bytes as the native `Slice` type
31+
/// for this binary view vector.
32+
///
33+
/// # Safety
34+
///
35+
/// The caller must check beforehand that the bytes returned from the vector conform to the type
36+
/// requirements of this binary type.
37+
///
38+
/// Failure to do so can result in undefined behavior or incorrect results in downstream
39+
/// vector operations.
40+
unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self::Slice;
41+
42+
/// Downcast the provided object to a type-specific instance.
43+
fn downcast<V: BinaryViewDowncast>(visitor: V) -> V::Output<Self>;
44+
45+
/// Upcast a type-specific instance to a generic instance.
46+
fn upcast<V: BinaryViewTypeUpcast>(input: V::Input<Self>) -> V;
47+
}
48+
49+
/// [`BinaryViewType`] for UTF-8 strings.
50+
#[derive(Clone, Debug)]
51+
pub struct StringType;
52+
impl BinaryViewType for StringType {
53+
type Slice = str;
54+
55+
#[inline(always)]
56+
fn validate(bytes: &[u8]) -> bool {
57+
std::str::from_utf8(bytes).is_ok()
58+
}
59+
60+
unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self::Slice {
61+
// SAFETY: vectors should be checked at the boundary for upholding the UTF-8 invariant,
62+
// or only be built from vectors that are known to satisfy the invariant.
63+
unsafe { std::str::from_utf8_unchecked(bytes) }
64+
}
65+
66+
fn downcast<V: BinaryViewDowncast>(visitor: V) -> V::Output<Self> {
67+
visitor.into_string()
68+
}
69+
70+
fn upcast<V: BinaryViewTypeUpcast>(input: V::Input<Self>) -> V {
71+
V::from_string(input)
72+
}
73+
}
74+
75+
/// [`BinaryViewType`] for raw binary data.
76+
#[derive(Clone, Debug)]
77+
pub struct BinaryType;
78+
impl BinaryViewType for BinaryType {
79+
type Slice = [u8];
80+
81+
#[inline(always)]
82+
fn validate(_bytes: &[u8]) -> bool {
83+
true
84+
}
85+
86+
unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self::Slice {
87+
bytes
88+
}
89+
90+
fn downcast<V: BinaryViewDowncast>(visitor: V) -> V::Output<Self> {
91+
visitor.into_binary()
92+
}
93+
94+
fn upcast<V: BinaryViewTypeUpcast>(input: V::Input<Self>) -> V {
95+
V::from_binary(input)
96+
}
97+
}
98+
99+
/// Trait for downcasting generic variable binary types to specific types.
100+
pub trait BinaryViewDowncast {
101+
/// The output type after downcasting.
102+
type Output<T: BinaryViewType>;
103+
104+
/// Downcast to a binary type.
105+
fn into_binary(self) -> Self::Output<BinaryType>;
106+
/// Downcast to a string type.
107+
fn into_string(self) -> Self::Output<StringType>;
108+
}
109+
110+
/// Trait for upcasting specific variable binary types to generic types.
111+
pub trait BinaryViewTypeUpcast {
112+
/// The input type for upcasting.
113+
type Input<T: BinaryViewType>;
114+
115+
/// Upcast from a binary type.
116+
fn from_binary(input: Self::Input<BinaryType>) -> Self;
117+
/// Upcast from a string type.
118+
fn from_string(input: Self::Input<StringType>) -> Self;
119+
}
120+
121+
/// Private module to seal the `BinaryViewType` trait.
122+
mod private {
123+
/// Sealed trait to prevent external implementations of [`BinaryViewType`].
124+
pub trait Sealed {}
125+
126+
impl Sealed for super::StringType {}
127+
impl Sealed for super::BinaryType {}
128+
}

0 commit comments

Comments
 (0)