Skip to content

Commit 710dfcf

Browse files
committed
Vector Scalars
Signed-off-by: Nicholas Gates <[email protected]>
1 parent 81add46 commit 710dfcf

File tree

38 files changed

+887
-163
lines changed

38 files changed

+887
-163
lines changed

vortex-buffer/src/bit/buf.rs

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
use std::ops::{BitAnd, BitOr, BitXor, Not, Range};
4+
use std::ops::{BitAnd, BitOr, BitXor, Not, RangeBounds};
55

66
use crate::bit::ops::{bitwise_binary_op, bitwise_unary_op};
77
use crate::bit::{
@@ -194,16 +194,24 @@ impl BitBuffer {
194194
/// for `len` bits.
195195
///
196196
/// Panics if the slice would extend beyond the end of the buffer.
197-
pub fn slice(&self, range: Range<usize>) -> Self {
198-
assert!(
199-
range.len() <= self.len,
200-
"slice from {} to {} exceeds len {}",
201-
range.start,
202-
range.end,
203-
range.len()
204-
);
205-
206-
Self::new_with_offset(self.buffer.clone(), range.len(), self.offset + range.start)
197+
pub fn slice(&self, range: impl RangeBounds<usize>) -> Self {
198+
let start = match range.start_bound() {
199+
std::ops::Bound::Included(&s) => s,
200+
std::ops::Bound::Excluded(&s) => s + 1,
201+
std::ops::Bound::Unbounded => 0,
202+
};
203+
let end = match range.end_bound() {
204+
std::ops::Bound::Included(&e) => e + 1,
205+
std::ops::Bound::Excluded(&e) => e,
206+
std::ops::Bound::Unbounded => self.len,
207+
};
208+
209+
assert!(start <= end);
210+
assert!(start <= self.len);
211+
assert!(end <= self.len);
212+
let len = end - start;
213+
214+
Self::new_with_offset(self.buffer.clone(), len, self.offset + start)
207215
}
208216

209217
/// Slice any full bytes from the buffer, leaving the offset < 8.

vortex-buffer/src/buffer.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ impl<T> Buffer<T> {
246246
/// Requires that `begin <= end` and `end <= self.len()`.
247247
/// Also requires that both `begin` and `end` are aligned to the buffer's required alignment.
248248
#[inline(always)]
249-
pub fn slice(&self, range: impl RangeBounds<usize>) -> Self {
249+
pub fn slice(&self, range: impl RangeBounds<usize> + Clone + Debug) -> Self {
250250
self.slice_with_alignment(range, self.alignment)
251251
}
252252

@@ -257,7 +257,7 @@ impl<T> Buffer<T> {
257257
///
258258
/// Requires that `begin <= end` and `end <= self.len()`.
259259
#[inline(always)]
260-
pub fn slice_unaligned(&self, range: impl RangeBounds<usize>) -> Self {
260+
pub fn slice_unaligned(&self, range: impl RangeBounds<usize> + Clone + Debug) -> Self {
261261
self.slice_with_alignment(range, Alignment::of::<u8>())
262262
}
263263

@@ -270,7 +270,7 @@ impl<T> Buffer<T> {
270270
/// Also requires that both `begin` and `end` are aligned to the given alignment.
271271
pub fn slice_with_alignment(
272272
&self,
273-
range: impl RangeBounds<usize>,
273+
range: impl RangeBounds<usize> + Clone,
274274
alignment: Alignment,
275275
) -> Self {
276276
let len = self.len();

vortex-buffer/src/string.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use crate::ByteBuffer;
1010

1111
/// A wrapper around a [`ByteBuffer`] that guarantees that the buffer contains valid UTF-8.
1212
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
13+
#[repr(transparent)]
1314
pub struct BufferString(ByteBuffer);
1415

1516
impl BufferString {

vortex-mask/src/lib.rs

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ mod tests;
1515

1616
use std::cmp::Ordering;
1717
use std::fmt::{Debug, Formatter};
18-
use std::ops::Range;
18+
use std::ops::RangeBounds;
1919
use std::sync::{Arc, OnceLock};
2020

2121
use itertools::Itertools;
@@ -393,11 +393,26 @@ impl Mask {
393393

394394
/// Slice the mask.
395395
#[inline]
396-
pub fn slice(&self, range: Range<usize>) -> Self {
397-
assert!(range.end <= self.len());
396+
pub fn slice(&self, range: impl RangeBounds<usize>) -> Self {
397+
let start = match range.start_bound() {
398+
std::ops::Bound::Included(&s) => s,
399+
std::ops::Bound::Excluded(&s) => s + 1,
400+
std::ops::Bound::Unbounded => 0,
401+
};
402+
let end = match range.end_bound() {
403+
std::ops::Bound::Included(&e) => e + 1,
404+
std::ops::Bound::Excluded(&e) => e,
405+
std::ops::Bound::Unbounded => self.len(),
406+
};
407+
408+
assert!(start <= end);
409+
assert!(start <= self.len());
410+
assert!(end <= self.len());
411+
let len = end - start;
412+
398413
match &self {
399-
Self::AllTrue(_) => Self::new_true(range.len()),
400-
Self::AllFalse(_) => Self::new_false(range.len()),
414+
Self::AllTrue(_) => Self::new_true(len),
415+
Self::AllFalse(_) => Self::new_false(len),
401416
Self::Values(values) => Self::from_buffer(values.buffer.slice(range)),
402417
}
403418
}

vortex-vector/src/binaryview/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
//! [specification](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout),
99
//! which are inspired by "German" strings.
1010
11+
pub use scalar::*;
1112
pub use types::*;
1213
pub use vector::*;
1314
pub use vector_mut::*;
1415
use vortex_error::vortex_panic;
1516

1617
use crate::{Vector, VectorMut};
1718

19+
mod scalar;
1820
mod types;
1921
mod vector;
2022
mod vector_mut;
@@ -28,6 +30,10 @@ pub type BinaryVectorMut = BinaryViewVectorMut<BinaryType>;
2830
pub type StringVector = BinaryViewVector<StringType>;
2931
/// Type alias for mutable UTF-8 variable-length string vectors.
3032
pub type StringVectorMut = BinaryViewVectorMut<StringType>;
33+
/// Type alias for non-utf8 variable-length binary scalars.
34+
pub type BinaryScalar = BinaryViewScalar<BinaryType>;
35+
/// Type alias for UTF-8 variable-length string scalars.
36+
pub type StringScalar = BinaryViewScalar<StringType>;
3137

3238
impl BinaryViewDowncast for Vector {
3339
type Output<T: BinaryViewType> = BinaryViewVector<T>;
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use crate::binaryview::{
5+
BinaryType, BinaryViewType, BinaryViewTypeUpcast, BinaryViewVectorMut, StringType,
6+
};
7+
use crate::{Scalar, ScalarOps, VectorMutOps};
8+
9+
/// A scalar value for types that implement [`BinaryViewType`].
10+
pub struct BinaryViewScalar<T: BinaryViewType>(Option<T::Scalar>);
11+
12+
impl<T: BinaryViewType> From<Option<T::Scalar>> for BinaryViewScalar<T> {
13+
fn from(value: Option<T::Scalar>) -> Self {
14+
Self(value)
15+
}
16+
}
17+
18+
impl<T: BinaryViewType> BinaryViewScalar<T> {
19+
/// Returns the scalar value as [`T::Scalar`], or `None` if the scalar is null.
20+
pub fn value(&self) -> Option<&T::Scalar> {
21+
self.0.as_ref()
22+
}
23+
}
24+
25+
impl<T: BinaryViewType> ScalarOps for BinaryViewScalar<T> {
26+
fn is_valid(&self) -> bool {
27+
self.0.is_some()
28+
}
29+
30+
fn repeat(&self, n: usize) -> crate::VectorMut {
31+
let mut vec = BinaryViewVectorMut::<T>::with_capacity(n);
32+
match self.value() {
33+
None => vec.append_nulls(n),
34+
Some(buf) => vec.append_owned_values(buf.clone(), n),
35+
}
36+
vec.into()
37+
}
38+
}
39+
40+
impl BinaryViewTypeUpcast for Scalar {
41+
type Input<T: BinaryViewType> = BinaryViewScalar<T>;
42+
43+
fn from_binary(input: Self::Input<BinaryType>) -> Self {
44+
Scalar::Binary(input)
45+
}
46+
47+
fn from_string(input: Self::Input<StringType>) -> Self {
48+
Scalar::String(input)
49+
}
50+
}
51+
52+
impl<T: BinaryViewType> Into<Scalar> for BinaryViewScalar<T> {
53+
fn into(self) -> Scalar {
54+
T::upcast(self)
55+
}
56+
}

vortex-vector/src/binaryview/types.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
66
use std::fmt::Debug;
77

8+
use vortex_buffer::{BufferString, ByteBuffer};
9+
810
use crate::binaryview::{BinaryViewVector, BinaryViewVectorMut};
911
use crate::{Vector, VectorMut};
1012

@@ -24,6 +26,8 @@ impl<T: BinaryViewType> From<BinaryViewVectorMut<T>> for VectorMut {
2426
pub trait BinaryViewType: Debug + Sized + private::Sealed {
2527
/// The slice type for this variable binary type.
2628
type Slice: ?Sized + AsRef<[u8]>;
29+
/// The scalar type for this variable binary type.
30+
type Scalar: Sized + Clone + AsRef<Self::Slice> + Into<ByteBuffer>;
2731

2832
/// Validate if a set of bytes conforms to the logical type constraints of the native `Slice`.
2933
fn validate(bytes: &[u8]) -> bool;
@@ -40,6 +44,14 @@ pub trait BinaryViewType: Debug + Sized + private::Sealed {
4044
/// vector operations.
4145
unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self::Slice;
4246

47+
/// Returns the scalar value from a byte buffer.
48+
///
49+
/// # Safety
50+
///
51+
/// The caller must ensure that the buffer conforms to the type requirements of this binary
52+
/// type.
53+
unsafe fn scalar_from_buffer_unchecked(buffer: ByteBuffer) -> Self::Scalar;
54+
4355
/// Downcast the provided object to a type-specific instance.
4456
fn downcast<V: BinaryViewDowncast>(visitor: V) -> V::Output<Self>;
4557

@@ -52,18 +64,25 @@ pub trait BinaryViewType: Debug + Sized + private::Sealed {
5264
pub struct StringType;
5365
impl BinaryViewType for StringType {
5466
type Slice = str;
67+
type Scalar = BufferString;
5568

5669
#[inline(always)]
5770
fn validate(bytes: &[u8]) -> bool {
5871
std::str::from_utf8(bytes).is_ok()
5972
}
6073

74+
#[inline(always)]
6175
unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self::Slice {
6276
// SAFETY: vectors should be checked at the boundary for upholding the UTF-8 invariant,
6377
// or only be built from vectors that are known to satisfy the invariant.
6478
unsafe { std::str::from_utf8_unchecked(bytes) }
6579
}
6680

81+
#[inline(always)]
82+
unsafe fn scalar_from_buffer_unchecked(buffer: ByteBuffer) -> Self::Scalar {
83+
unsafe { BufferString::new_unchecked(buffer) }
84+
}
85+
6786
fn downcast<V: BinaryViewDowncast>(visitor: V) -> V::Output<Self> {
6887
visitor.into_string()
6988
}
@@ -78,16 +97,23 @@ impl BinaryViewType for StringType {
7897
pub struct BinaryType;
7998
impl BinaryViewType for BinaryType {
8099
type Slice = [u8];
100+
type Scalar = ByteBuffer;
81101

82102
#[inline(always)]
83103
fn validate(_bytes: &[u8]) -> bool {
84104
true
85105
}
86106

107+
#[inline(always)]
87108
unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self::Slice {
88109
bytes
89110
}
90111

112+
#[inline(always)]
113+
unsafe fn scalar_from_buffer_unchecked(buffer: ByteBuffer) -> Self::Scalar {
114+
buffer
115+
}
116+
91117
fn downcast<V: BinaryViewDowncast>(visitor: V) -> V::Output<Self> {
92118
visitor.into_binary()
93119
}

vortex-vector/src/binaryview/vector.rs

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,18 @@
33

44
//! Variable-length binary vector implementation.
55
6+
use std::fmt::Debug;
7+
use std::ops::RangeBounds;
68
use std::sync::Arc;
79

8-
use vortex_buffer::{Buffer, ByteBuffer};
10+
use vortex_buffer::{Alignment, Buffer, ByteBuffer};
911
use vortex_error::{VortexExpect, VortexResult, vortex_ensure};
1012
use vortex_mask::Mask;
1113

12-
use crate::VectorOps;
13-
use crate::binaryview::BinaryViewType;
1414
use crate::binaryview::vector_mut::BinaryViewVectorMut;
1515
use crate::binaryview::view::{BinaryView, validate_views};
16+
use crate::binaryview::{BinaryViewScalar, BinaryViewType};
17+
use crate::{Scalar, VectorOps};
1618

1719
/// A variable-length binary vector.
1820
///
@@ -108,10 +110,50 @@ impl<T: BinaryViewType> BinaryViewVector<T> {
108110
(self.views, self.buffers, self.validity)
109111
}
110112

113+
/// Get the `index` item from the vector as an owned `Scalar` type with zero-copy.
114+
///
115+
/// This function will panic if `index` is out of range for the vector's length.
116+
pub fn get(&self, index: usize) -> Option<T::Scalar> {
117+
if !self.validity.value(index) {
118+
return None;
119+
}
120+
121+
let view = &self.views[index];
122+
if view.is_inlined() {
123+
let view = view.as_inlined();
124+
125+
// We find the occurrence of the inlined data in the views buffer.
126+
let buffer = self
127+
.views
128+
.clone()
129+
.into_byte_buffer()
130+
.aligned(Alignment::none())
131+
.slice_ref(&view.data[..view.size as usize]);
132+
133+
// SAFETY: validation that the string data contained in this vector is performed
134+
// at construction time, either in the constructor for safe construction, or by
135+
// the caller (when using the unchecked constructor).
136+
Some(unsafe { T::scalar_from_buffer_unchecked(buffer) })
137+
} else {
138+
// Get a pointer into the buffer range
139+
let view_ref = view.as_view();
140+
let buffer = &self.buffers[view_ref.buffer_index as usize];
141+
142+
let start = view_ref.offset as usize;
143+
let length = view_ref.size as usize;
144+
let buffer_slice = buffer.slice(start..start + length);
145+
146+
// SAFETY: validation that the string data contained in this vector is performed
147+
// at construction time, either in the constructor for safe construction, or by
148+
// the caller (when using the unchecked constructor).
149+
Some(unsafe { T::scalar_from_buffer_unchecked(buffer_slice) })
150+
}
151+
}
152+
111153
/// Get the `index` item from the vector as a native `Slice` type.
112154
///
113155
/// This function will panic if `index` is out of range for the vector's length.
114-
pub fn get(&self, index: usize) -> Option<&T::Slice> {
156+
pub fn get_ref(&self, index: usize) -> Option<&T::Slice> {
115157
if !self.validity.value(index) {
116158
return None;
117159
}
@@ -160,6 +202,15 @@ impl<T: BinaryViewType> VectorOps for BinaryViewVector<T> {
160202
&self.validity
161203
}
162204

205+
fn scalar_at(&self, index: usize) -> Scalar {
206+
assert!(index < self.len());
207+
BinaryViewScalar::<T>::from(self.get(index)).into()
208+
}
209+
210+
fn slice(&self, _range: impl RangeBounds<usize> + Clone + Debug) -> Self {
211+
todo!()
212+
}
213+
163214
fn try_into_mut(self) -> Result<BinaryViewVectorMut<T>, Self> {
164215
let views_mut = match self.views.try_into_mut() {
165216
Ok(views_mut) => views_mut,

0 commit comments

Comments
 (0)