
Commit 5155c0d

fix compressor bug
Signed-off-by: Connor Tsui <[email protected]>
1 parent ad0163a commit 5155c0d

File tree: 5 files changed (+72 -39 lines)

vortex-array/src/arrays/list/array.rs

Lines changed: 6 additions & 3 deletions

@@ -4,7 +4,7 @@
 use std::sync::Arc;

 use num_traits::AsPrimitive;
-use vortex_dtype::{DType, match_each_integer_ptype, match_each_native_ptype};
+use vortex_dtype::{DType, NativePType, match_each_integer_ptype, match_each_native_ptype};
 use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_ensure};

 use crate::arrays::{ListVTable, PrimitiveVTable};
@@ -191,8 +191,11 @@ impl ListArray {

         vortex_ensure!(
             max_offset
-                <= P::try_from(elements.len())
-                    .vortex_expect("Offsets type must be able to fit elements length"),
+                <= P::try_from(elements.len()).vortex_expect(&format!(
+                    "Offsets type {} must be able to fit elements length {}",
+                    <P as NativePType>::PTYPE,
+                    elements.len()
+                )),
             "Max offset {max_offset} is beyond the length of the elements array {}",
             elements.len()
         );
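As a rough illustration of the first change: the failure message now names both the offset primitive type and the elements length, so an overflowing `P::try_from` points directly at the culprit. Below is a simplified, self-contained sketch of that check using only the standard library; `check_max_offset` and its `Offset` parameter are hypothetical stand-ins for the crate's generic `P`, and plain `Result`s take the place of `vortex_ensure!` and `vortex_expect`.

    use std::any::type_name;

    /// Checks that `max_offset` is within `elements_len`, producing errors that name
    /// both the offset type and the length, in the spirit of the diff above.
    fn check_max_offset<Offset>(max_offset: Offset, elements_len: usize) -> Result<(), String>
    where
        Offset: TryFrom<usize> + PartialOrd + std::fmt::Display + Copy,
    {
        // The elements length must itself be representable in the offset type,
        // otherwise the comparison below is meaningless.
        let len = Offset::try_from(elements_len).map_err(|_| {
            format!(
                "Offsets type {} must be able to fit elements length {}",
                type_name::<Offset>(),
                elements_len
            )
        })?;

        if max_offset <= len {
            Ok(())
        } else {
            Err(format!(
                "Max offset {max_offset} is beyond the length of the elements array {elements_len}"
            ))
        }
    }

    fn main() {
        // u8 cannot represent a length of 300, so the error names the type and the length.
        assert!(check_max_offset(10u8, 300).is_err());
        assert!(check_max_offset(5u16, 300).is_ok());
    }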

vortex-array/src/arrays/listview/array.rs

Lines changed: 7 additions & 0 deletions

@@ -377,6 +377,13 @@ where
             vortex_err!("offset[{i}] ({offset_u64}) + size[{i}] ({size_u64}) would overflow u64")
         })?;

+        if offset_u64 == elements_len {
+            vortex_ensure!(
+                size_u64 == 0,
+                "views to the end of the elements array (length {elements_len}) must have size 0"
+            );
+        }
+
         vortex_ensure!(
             end <= elements_len,
             "offset[{i}] + size[{i}] = {offset_u64} + {size_u64} = {end} \

vortex-array/src/arrays/listview/conversion.rs

Lines changed: 8 additions & 3 deletions

@@ -21,20 +21,22 @@ pub fn list_view_from_list(list: ListArray) -> ListViewArray {
         return Canonical::empty(list.dtype()).into_listview();
     }

+    // TODO(connor): Should we reset offsets?
+
     let len = list.len();

     // Get the `offsets` array directly from the `ListArray` (preserving its type).
     let list_offsets = list.offsets().clone();

-    // We need to slice the `offsets` to remove the last element (`ListArray` has n+1 offsets).
-    let adjusted_offsets = list_offsets.slice(0..len);
-
     // Create sizes array by computing differences between consecutive offsets.
     // Use the same dtype as the offsets array to ensure compatibility.
     let sizes = match_each_integer_ptype!(list_offsets.dtype().as_ptype(), |O| {
         build_sizes_from_offsets::<O>(&list)
     });

+    // We need to slice the `offsets` to remove the last element (`ListArray` has `n + 1` offsets).
+    let adjusted_offsets = list_offsets.slice(0..len);
+
     // Since the data came from a valid `ListArray`, we know it is zero-copyable to a `ListArray`.
     let shape = ListViewShape::as_zero_copy_to_list();

@@ -58,8 +60,11 @@ fn build_sizes_from_offsets<O: IntegerPType>(list: &ListArray) -> ArrayRef {

     // Create `UninitRange` for direct memory access.
     let mut sizes_range = sizes_builder.uninit_range(len);
+
     let offsets = list.offsets().to_primitive();
     let offsets_slice = offsets.as_slice::<O>();
+    debug_assert_eq!(len + 1, offsets_slice.len());
+    debug_assert!(offsets_slice.is_sorted());

     // Compute sizes as the difference between consecutive offsets.
     for i in 0..len {
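The conversion relies on a `ListArray` carrying `n + 1` monotone offsets, while a list view carries `n` offsets plus `n` sizes with `sizes[i] = offsets[i + 1] - offsets[i]`. A rough model of that transformation on plain slices; the `offsets_to_views` helper below is hypothetical and not the vortex API, which works through builders and an `UninitRange`:

    // A `ListArray` stores `n + 1` offsets; a list view stores `n` starting offsets
    // plus `n` sizes, where each size is the gap between consecutive offsets.
    fn offsets_to_views(list_offsets: &[u32]) -> (Vec<u32>, Vec<u32>) {
        // A `ListArray` always has at least one offset, even when it holds zero lists.
        assert!(!list_offsets.is_empty());
        debug_assert!(list_offsets.is_sorted());

        let len = list_offsets.len() - 1; // number of lists

        // sizes[i] is the difference between consecutive offsets.
        let sizes: Vec<u32> = (0..len)
            .map(|i| list_offsets[i + 1] - list_offsets[i])
            .collect();

        // Drop the trailing offset; the list view only needs each list's starting offset.
        let offsets = list_offsets[..len].to_vec();

        (offsets, sizes)
    }

    fn main() {
        // Three lists covering elements [0, 3), [3, 3), and [3, 7).
        let (offsets, sizes) = offsets_to_views(&[0, 3, 3, 7]);
        assert_eq!(offsets, vec![0, 3, 3]);
        assert_eq!(sizes, vec![3, 0, 4]);
    }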

vortex-array/src/builders/listview.rs

Lines changed: 44 additions & 31 deletions

@@ -261,12 +261,12 @@ impl<O: IntegerPType, S: IntegerPType> ArrayBuilder for ListViewBuilder<O, S> {

         self.nulls.append_validity_mask(array.validity_mask());

-        let curr_elements_len = self.elements_builder.len();
-
         // Bulk append the new elements (which should have no gaps or overlaps).
+        let old_elements_len = self.elements_builder.len();
         self.elements_builder
             .reserve_exact(listview.elements().len());
         self.elements_builder.extend_from_array(listview.elements());
+        let new_elements_len = self.elements_builder.len();

         // Reserve enough space for the new views.
         let extend_length = listview.len();
@@ -287,36 +287,13 @@
         // This should be cheap because we didn't compress after rebuilding.
         let new_offsets = listview.offsets().to_primitive();

-        fn adjust_offsets<'a, O: IntegerPType, A: IntegerPType>(
-            mut uninit_range: UninitRange<'a, O>,
-            new_offsets: PrimitiveArray,
-            curr_elements_len: usize,
-        ) {
-            let new_offsets_slice = new_offsets.as_slice::<A>();
-            let curr_elements_len = O::from_usize(curr_elements_len).vortex_expect(
-                "the current elements length did not fit into the offset type (impossible)",
-            );
-
-            for i in 0..uninit_range.len() {
-                let new_offset = new_offsets_slice[i];
-                let new_offset_correct_type = O::from_usize(
-                    new_offset
-                        .to_usize()
-                        .vortex_expect("Offsets must always fit in usize"),
-                )
-                .vortex_expect("New offset somehow did not fit into the builder's offset type");
-
-                let adjusted_new_offset = new_offset_correct_type + curr_elements_len;
-                uninit_range.set_value(i, adjusted_new_offset);
-            }
-
-            // SAFETY: We have set all the values in the range, and since `offsets` are
-            // non-nullable, we are done.
-            unsafe { uninit_range.finish() };
-        }
-
         match_each_integer_ptype!(new_offsets.ptype(), |A| {
-            adjust_offsets::<O, A>(uninit_range, new_offsets, curr_elements_len);
+            adjust_offsets::<O, A>(
+                uninit_range,
+                new_offsets,
+                old_elements_len,
+                new_elements_len,
+            );
         })
     }

@@ -341,6 +318,42 @@ impl<O: IntegerPType, S: IntegerPType> ArrayBuilder for ListViewBuilder<O, S> {
     }
 }

+/// Given new offsets, adds them to the `UninitRange` after adding the `old_elements_len` to each
+/// offset.
+fn adjust_offsets<'a, O: IntegerPType, A: IntegerPType>(
+    mut uninit_range: UninitRange<'a, O>,
+    new_offsets: PrimitiveArray,
+    old_elements_len: usize,
+    new_elements_len: usize,
+) {
+    let new_offsets_slice = new_offsets.as_slice::<A>();
+    let old_elements_len = O::from_usize(old_elements_len)
+        .vortex_expect("the old elements length did not fit into the offset type (impossible)");
+    let new_elements_len = O::from_usize(new_elements_len)
+        .vortex_expect("the current elements length did not fit into the offset type (impossible)");
+
+    for i in 0..uninit_range.len() {
+        let new_offset = O::from_usize(
+            new_offsets_slice[i]
+                .to_usize()
+                .vortex_expect("Offsets must always fit in usize"),
+        )
+        .vortex_expect("New offset somehow did not fit into the builder's offset type");
+
+        let adjusted_new_offset = new_offset + old_elements_len;
+        debug_assert!(
+            adjusted_new_offset <= new_elements_len,
+            "[{i}/{}]: {new_offset} + {old_elements_len} = {adjusted_new_offset} <= {new_elements_len} failed",
+            uninit_range.len()
+        );
+        uninit_range.set_value(i, adjusted_new_offset);
+    }
+
+    // SAFETY: We have set all the values in the range, and since `offsets` are
+    // non-nullable, we are done.
+    unsafe { uninit_range.finish() };
+}
+
 #[cfg(test)]
 mod tests {
     use std::sync::Arc;
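The builder change splits element-length bookkeeping into `old_elements_len` (before the bulk append) and `new_elements_len` (after), and `adjust_offsets` rebases every incoming offset by the former while asserting it never exceeds the latter. A pared-down sketch of that rebasing, assuming a hypothetical `rebase_offsets` over plain `u64` slices rather than the generic `UninitRange` writer:

    // When a list view's elements are appended after `old_elements_len` existing
    // elements, every incoming view offset must be shifted by that amount, and the
    // shifted offset can never exceed the new total element count.
    fn rebase_offsets(new_offsets: &[u64], old_elements_len: u64, new_elements_len: u64) -> Vec<u64> {
        new_offsets
            .iter()
            .map(|&offset| {
                let adjusted = offset + old_elements_len;
                debug_assert!(
                    adjusted <= new_elements_len,
                    "{offset} + {old_elements_len} = {adjusted} <= {new_elements_len} failed"
                );
                adjusted
            })
            .collect()
    }

    fn main() {
        // The builder already holds 10 elements; the appended list view holds 4 more,
        // and its views start at offsets 0 and 2 within its own elements array.
        let rebased = rebase_offsets(&[0, 2], 10, 14);
        assert_eq!(rebased, vec![10, 12]);
    }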

vortex-btrblocks/src/lib.rs

Lines changed: 7 additions & 2 deletions

@@ -403,10 +403,15 @@ impl BtrBlocksCompressor {
         // list and list view.
         let list_array = list_from_list_view(list_view_array);

+        // Reset the offsets to remove garbage data that might prevent us from narrowing our
+        // offsets.
+        let list_array = list_array.reset_offsets(true)?;
+
         let compressed_elems = self.compress(list_array.elements())?;

-        // Note that since the type of our offsets are not encoded in our `DType`,
-        // we may narrow the widths.
+        // Note that since the type of our offsets are not encoded in our `DType`, and since
+        // we guarantee above that all elements are referenced by offsets, we may narrow the
+        // widths.

         let compressed_offsets = IntCompressor::compress_no_dict(
             &list_array.offsets().to_primitive().narrow()?,
