Skip to content

Commit 6317892

Browse files
connortsui20 and gatesn authored
Feature: Add vortex-vector (#5010)
Tracking Issue: #4492 Continuation of #4954 This is mostly boilerplate code with a bit more documentation than normal. There are still a few bugs in the implementation (documented by TODOs), but I would like this PR to just focus on the structure of the crate rather than the logic. Also add logic in other crates that was missing before. --- The general structure of the crate is as such: ``` vortex-vector ├── Cargo.toml └── src ├── <type> │ ├── mod.rs │ ├── vector.rs │ └── vector_mut.rs ├── lib.rs ├── macros.rs ├── ops.rs ├── private.rs ├── vector.rs └── vector_mut.rs ``` The exceptions to this structure will be `primitive`, which needs a generic type as well (so there will be extra files), a `varbin` module that holds both `binary` and `string` modules, and probably the 2 list types might share a lot of code (we'll see once I get there). Again, a lot of the logic here is not super clean and some of it is wrong, but I'd like to get this merged first before I fix those. --------- Signed-off-by: Nicholas Gates <[email protected]> Signed-off-by: Connor Tsui <[email protected]> Co-authored-by: Nicholas Gates <[email protected]>
1 parent c698a77 commit 6317892

File tree

30 files changed

+2400
-86
lines changed

30 files changed

+2400
-86
lines changed

Cargo.lock

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ members = [
2929
"vortex-scalar",
3030
"vortex-tui",
3131
"vortex-utils",
32+
"vortex-vector",
3233
"xtask",
3334
"vortex-gpu",
3435
]

vortex-buffer/src/bit/buf.rs

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ use crate::bit::{
1111
use crate::{Alignment, BitBufferMut, Buffer, BufferMut, ByteBuffer, buffer};
1212

1313
/// An immutable bitset stored as a packed byte buffer.
14-
#[derive(Clone, Debug, Eq)]
14+
#[derive(Debug, Clone, Eq)]
1515
pub struct BitBuffer {
1616
buffer: ByteBuffer,
17-
len: usize,
1817
offset: usize,
18+
len: usize,
1919
}
2020

2121
impl PartialEq for BitBuffer {
@@ -25,8 +25,8 @@ impl PartialEq for BitBuffer {
2525
}
2626

2727
self.chunks()
28-
.iter()
29-
.zip(other.chunks())
28+
.iter_padded()
29+
.zip(other.chunks().iter_padded())
3030
.all(|(a, b)| a == b)
3131
}
3232
}
@@ -48,10 +48,10 @@ impl BitBuffer {
4848
}
4949
}
5050

51-
/// Create a new `BoolBuffer` backed by a [`ByteBuffer`] with `len` bits in view, starting at the
52-
/// given `offset` (in bits).
51+
/// Create a new `BitBuffer` backed by a [`ByteBuffer`] with `len` bits in view, starting at
52+
/// the given `offset` (in bits).
5353
///
54-
/// Panics if the buffer is not large enough to hold `len` bits or if the offset is greater than
54+
/// Panics if the buffer is not large enough to hold `len` bits after the offset.
5555
pub fn new_with_offset(buffer: ByteBuffer, len: usize, offset: usize) -> Self {
5656
assert!(
5757
len.saturating_add(offset) <= buffer.len().saturating_mul(8),
@@ -61,8 +61,8 @@ impl BitBuffer {
6161

6262
Self {
6363
buffer,
64-
len,
6564
offset,
65+
len,
6666
}
6767
}
6868

@@ -277,6 +277,14 @@ impl BitBuffer {
277277
self.buffer.slice(word_start..word_end)
278278
}
279279

280+
/// Attempt to convert this `BitBuffer` into a mutable version.
281+
pub fn try_into_mut(self) -> Result<BitBufferMut, Self> {
282+
match self.buffer.try_into_mut() {
283+
Ok(buffer) => Ok(BitBufferMut::from_buffer(buffer, self.offset, self.len)),
284+
Err(buffer) => Err(BitBuffer::new_with_offset(buffer, self.len, self.offset)),
285+
}
286+
}
287+
280288
/// Get a mutable version of this `BitBuffer` along with bit offset in the first byte.
281289
///
282290
/// If the caller does not hold the only reference to the underlying buffer, a copy is created.
@@ -442,4 +450,29 @@ mod tests {
442450
}
443451
}
444452
}
453+
454+
#[test]
fn test_padded_equality() {
    // Regression test for `PartialEq`: slices of the two buffers must compare by the bits in
    // view only, which is why the comparison goes through `iter_padded()`.
    let buf1 = BitBuffer::new_set(64); // All bits set.
    let buf2 = BitBuffer::collect_bool(64, |x| x < 32); // First half set, other half unset.

    for i in 0..32 {
        assert_eq!(buf1.value(i), buf2.value(i), "Bit {} should be the same", i);
    }

    for i in 32..64 {
        assert_ne!(buf1.value(i), buf2.value(i), "Bit {} should differ", i);
    }

    assert_eq!(
        buf1.slice(0..32),
        buf2.slice(0..32),
        "Buffer slices with same bits should be equal (`PartialEq` needs `iter_padded()`)"
    );
    assert_ne!(
        buf1.slice(32..64),
        buf2.slice(32..64),
        "Buffer slices with different bits should not be equal (`PartialEq` needs `iter_padded()`)"
    );
}
445478
}

vortex-buffer/src/bit/buf_mut.rs

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4+
// TODO(connor): The API of `BitBufferMut` should probably share more methods with `BitBuffer`.
5+
6+
use arrow_buffer::bit_chunk_iterator::BitChunks;
47
use bitvec::view::BitView;
58

69
use crate::bit::{get_bit_unchecked, set_bit_unchecked, unset_bit_unchecked};
@@ -25,12 +28,26 @@ use crate::{BitBuffer, BufferMut, ByteBufferMut, buffer_mut};
2528
/// ```
2629
///
2730
/// See also: [`BitBuffer`].
31+
#[derive(Debug, Clone, Eq)]
2832
pub struct BitBufferMut {
2933
buffer: ByteBufferMut,
3034
offset: usize,
3135
len: usize,
3236
}
3337

38+
impl PartialEq for BitBufferMut {
39+
fn eq(&self, other: &Self) -> bool {
40+
if self.len != other.len {
41+
return false;
42+
}
43+
44+
self.chunks()
45+
.iter_padded()
46+
.zip(other.chunks().iter_padded())
47+
.all(|(a, b)| a == b)
48+
}
49+
}
50+
3451
impl BitBufferMut {
3552
/// Create new bit buffer from given byte buffer and logical bit length
3653
pub fn from_buffer(buffer: ByteBufferMut, offset: usize, len: usize) -> Self {
@@ -118,6 +135,13 @@ impl BitBufferMut {
118135
unsafe { get_bit_unchecked(self.buffer.as_ptr(), self.offset + index) }
119136
}
120137

138+
/// Access chunks of the underlying buffer as 8 byte chunks with a final trailer
139+
///
140+
/// If you're performing operations on a single buffer, prefer [BitBuffer::unaligned_chunks]
141+
pub fn chunks(&self) -> BitChunks<'_> {
142+
BitChunks::new(self.buffer.as_slice(), self.offset, self.len)
143+
}
144+
121145
/// Get the bit capacity of the buffer.
122146
#[inline(always)]
123147
pub fn capacity(&self) -> usize {
@@ -362,6 +386,63 @@ impl BitBufferMut {
362386
self.len += bit_len;
363387
}
364388

389+
/// Splits the bit buffer into two at the given index.
390+
///
391+
/// Afterward, self contains elements `[0, at)`, and the returned buffer contains elements
392+
/// `[at, capacity)`.
393+
///
394+
/// Unlike bytes, if the split position is not on a byte-boundary this operation will copy
395+
/// data into the result type, and mutate self.
396+
pub fn split_off(&mut self, at: usize) -> Self {
397+
assert!(at <= self.len, "index {at} exceeds len {}", self.len);
398+
399+
let new_offset = self.offset;
400+
let new_len = self.len - at;
401+
402+
// If we are splitting on a byte boundary, we can just slice the buffer
403+
if (self.offset + at) % 8 == 0 {
404+
let byte_pos = (self.offset + at) / 8;
405+
let new_buffer = self.buffer.split_off(byte_pos);
406+
self.len = at;
407+
return Self {
408+
buffer: new_buffer,
409+
offset: new_offset,
410+
len: new_len,
411+
};
412+
}
413+
414+
// Otherwise, we need to copy bits into a new buffer
415+
let mut new_buffer = BitBufferMut::with_capacity(new_len);
416+
for i in 0..new_len {
417+
let value = self.value(at + i);
418+
new_buffer.append(value);
419+
}
420+
421+
// Truncate self to the split position
422+
self.truncate(at);
423+
424+
new_buffer
425+
}
426+
427+
/// Absorbs a mutable buffer that was previously split off.
428+
///
429+
/// If the two buffers were previously contiguous and not mutated in a way that causes
430+
/// re-allocation i.e., if other was created by calling split_off on this buffer, then this is
431+
/// an O(1) operation that just decreases a reference count and sets a few indices.
432+
///
433+
/// Otherwise, this method degenerates to self.append_buffer(&other).
434+
pub fn unsplit(&mut self, other: Self) {
435+
if (self.offset + self.len) % 8 == 0 && other.offset == 0 {
436+
// We are aligned and can just append the buffers
437+
self.buffer.unsplit(other.buffer);
438+
self.len += other.len;
439+
return;
440+
}
441+
442+
// Otherwise, we need to append the bits one by one
443+
self.append_buffer(&other.freeze())
444+
}
445+
365446
/// Freeze the buffer in its current state into an immutable `BitBuffer`.
366447
pub fn freeze(self) -> BitBuffer {
367448
BitBuffer::new_with_offset(self.buffer.freeze(), self.len, self.offset)

vortex-buffer/src/buffer.rs

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -447,66 +447,6 @@ impl<T> Buffer<T> {
447447
vortex_panic!("Buffer is not aligned to requested alignment {}", alignment)
448448
}
449449
}
450-
451-
/// Align the buffer to alignment of U
452-
pub fn align_to<U>(mut self) -> (Buffer<T>, Buffer<U>, Buffer<T>) {
453-
let offset = self.as_ptr().align_offset(align_of::<U>());
454-
if offset > self.len() {
455-
(
456-
self,
457-
Buffer::empty_aligned(Alignment::of::<U>()),
458-
Buffer::empty_aligned(Alignment::of::<T>()),
459-
)
460-
} else {
461-
let left = self.bytes.split_to(offset);
462-
self.length -= offset;
463-
let (us_len, _) = self.align_to_offsets::<U>();
464-
let trailer = self.bytes.split_off(us_len * size_of::<U>());
465-
(
466-
Buffer::from_bytes_aligned(left, Alignment::of::<T>()),
467-
Buffer::from_bytes_aligned(self.bytes, Alignment::of::<U>()),
468-
Buffer::from_bytes_aligned(trailer, Alignment::of::<T>()),
469-
)
470-
}
471-
}
472-
473-
/// Adapted from standard library slice::align_to_offsets
474-
/// Function to calculate lengths of the middle and trailing slice for `align_to`.
475-
fn align_to_offsets<U>(&self) -> (usize, usize) {
476-
// What we're going to do about `rest` is figure out what multiple of `U`s we can put in the
477-
// lowest number of `T`s. And how many `T`s we need for each such "multiple".
478-
//
479-
// Consider for example T=u8 U=u16. Then we can put 1 U in 2 Ts. Simple. Now, consider
480-
// for example a case where size_of::<T> = 16, size_of::<U> = 24. We can put 2 Us in
481-
// place of every 3 Ts in the `rest` slice. A bit more complicated.
482-
//
483-
// Formula to calculate this is:
484-
//
485-
// Us = lcm(size_of::<T>, size_of::<U>) / size_of::<U>
486-
// Ts = lcm(size_of::<T>, size_of::<U>) / size_of::<T>
487-
//
488-
// Expanded and simplified:
489-
//
490-
// Us = size_of::<T> / gcd(size_of::<T>, size_of::<U>)
491-
// Ts = size_of::<U> / gcd(size_of::<T>, size_of::<U>)
492-
//
493-
// Luckily since all this is constant-evaluated... performance here matters not!
494-
const fn gcd(a: usize, b: usize) -> usize {
495-
if b == 0 { a } else { gcd(b, a % b) }
496-
}
497-
498-
// Explicitly wrap the function call in a const block so it gets
499-
// constant-evaluated even in debug mode.
500-
let gcd: usize = const { gcd(size_of::<T>(), size_of::<U>()) };
501-
let ts: usize = size_of::<U>() / gcd;
502-
let us: usize = size_of::<T>() / gcd;
503-
504-
// Armed with this knowledge, we can find how many `U`s we can fit!
505-
let us_len = self.len() / ts * us;
506-
// And how many `T`s will be in the trailing slice!
507-
let ts_len = self.len() % ts;
508-
(us_len, ts_len)
509-
}
510450
}
511451

512452
/// An iterator over Buffer elements.

vortex-buffer/src/buffer_mut.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,60 @@ impl<T> BufferMut<T> {
328328
self.length += slice.len();
329329
}
330330

331+
/// Splits the buffer into two at the given index.
332+
///
333+
/// Afterward, self contains elements `[0, at)`, and the returned buffer contains elements
334+
/// `[at, capacity)`. It’s guaranteed that the memory does not move, that is, the address of
335+
/// self does not change, and the address of the returned slice is at bytes after that.
336+
///
337+
/// This is an O(1) operation that just increases the reference count and sets a few indices.
338+
///
339+
/// Panics if either half would have a length that is not a multiple of the alignment.
340+
pub fn split_off(&mut self, at: usize) -> Self {
341+
if at > self.len() {
342+
vortex_panic!("Cannot split buffer of length {} at {}", self.len(), at);
343+
}
344+
345+
let bytes_at = at * size_of::<T>();
346+
if !bytes_at.is_multiple_of(*self.alignment) {
347+
vortex_panic!(
348+
"Cannot split buffer at {}, resulting alignment is not {}",
349+
at,
350+
self.alignment
351+
);
352+
}
353+
354+
let new_bytes = self.bytes.split_off(bytes_at);
355+
let new_length = self.length - at;
356+
self.length = at;
357+
358+
BufferMut {
359+
bytes: new_bytes,
360+
length: new_length,
361+
alignment: self.alignment,
362+
_marker: Default::default(),
363+
}
364+
}
365+
366+
/// Absorbs a mutable buffer that was previously split off.
367+
///
368+
/// If the two buffers were previously contiguous and not mutated in a way that causes
369+
/// re-allocation i.e., if other was created by calling split_off on this buffer, then this is
370+
/// an O(1) operation that just decreases a reference count and sets a few indices.
371+
///
372+
/// Otherwise, this method degenerates to self.extend_from_slice(other.as_ref()).
373+
pub fn unsplit(&mut self, other: Self) {
374+
if self.alignment != other.alignment {
375+
vortex_panic!(
376+
"Cannot unsplit buffers with different alignments: {} and {}",
377+
self.alignment,
378+
other.alignment
379+
);
380+
}
381+
self.bytes.unsplit(other.bytes);
382+
self.length += other.length;
383+
}
384+
331385
/// Freeze the `BufferMut` into a `Buffer`.
332386
pub fn freeze(self) -> Buffer<T> {
333387
Buffer {

vortex-dtype/src/nullability.rs

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,21 @@ pub enum Nullability {
1515
}
1616

1717
impl Nullability {
18-
/// A self-describing displayed form.
18+
/// Returns `true` if the nullability is [`Nullable`](Self::Nullable), otherwise returns
19+
/// `false`.
1920
///
20-
/// The usual Display renders [Nullability::NonNullable] as the empty string.
21-
pub fn verbose_display(&self) -> impl Display {
21+
/// # Examples
22+
///
23+
/// ```
24+
/// use vortex_dtype::Nullability::*;
25+
///
26+
/// assert!(!NonNullable.is_nullable());
27+
/// assert!(Nullable.is_nullable());
28+
/// ```
29+
pub fn is_nullable(&self) -> bool {
2230
match self {
23-
Nullability::NonNullable => "NonNullable",
24-
Nullability::Nullable => "Nullable",
31+
Nullability::NonNullable => false,
32+
Nullability::Nullable => true,
2533
}
2634
}
2735
}

0 commit comments

Comments
 (0)