From 7e28007c5bab1e66642786ebdac5fe49b4e215ab Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sat, 14 Mar 2026 11:50:07 -0400 Subject: [PATCH 01/11] Add from_u64s and decode_u64s for panic-free, alignment-check-free decoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new decode path that preserves u64 alignment information through the entire pipeline, eliminating per-field alignment checks that bytemuck::try_cast_slice required when going through &[u8]. Key changes: - decode_u64s: returns (&[u64], u8) pairs instead of &[u8] slices, where the u8 indicates valid trailing bytes in the last word - from_u64s on FromBytes: non-panicking field construction that enables LLVM to eliminate unused tuple fields as dead code - validate/validate_typed: upfront structural and type-compatibility checks for encoded data, replacing the implicit panic-on-bad-data - Remove inspect module (superseded by examples/decode_asm.rs) Assembly impact for accessing field 0 of a k-tuple of u64s: Old (from_bytes): k=3 → 133 insns, k=8 → 273 insns (linear in k) New (from_u64s): k=3 → 68 insns, k=8 → 68 insns (constant in k) Co-Authored-By: Claude Opus 4.6 (1M context) --- columnar_derive/src/lib.rs | 21 ++++++ src/arc.rs | 1 + src/boxed.rs | 1 + src/bytes.rs | 136 ++++++++++++++++++++++++++++++++++++- src/lib.rs | 28 ++++++++ src/primitive.rs | 22 ++++++ src/rc.rs | 1 + src/string.rs | 11 +++ src/sums.rs | 22 ++++++ src/tuple.rs | 9 +++ src/vector.rs | 11 +++ 11 files changed, 260 insertions(+), 3 deletions(-) diff --git a/columnar_derive/src/lib.rs b/columnar_derive/src/lib.rs index dd315ae..18b24f7 100644 --- a/columnar_derive/src/lib.rs +++ b/columnar_derive/src/lib.rs @@ -331,6 +331,10 @@ fn derive_struct(name: &syn::Ident, generics: &syn::Generics, data_struct: syn:: )* Self { #(#names,)* } } + #[inline(always)] + fn from_u64s(words: &mut impl Iterator) -> Self { + Self { #(#names: ::columnar::FromBytes::from_u64s(words),)* } + } } } 
}; @@ -519,6 +523,11 @@ fn derive_unit_struct(name: &syn::Ident, _generics: &syn::Generics, vis: syn::Vi fn from_byte_slices(bytes: &[&'columnar [u8]]) -> Self { Self { count: &::columnar::bytemuck::try_cast_slice(bytes[0]).unwrap()[0] } } + #[inline(always)] + fn from_u64s(words: &mut impl Iterator) -> Self { + let (w, _tail) = words.next().expect("Iterator exhausted prematurely"); + Self { count: &w[0] } + } } impl ::columnar::Columnar for #name { @@ -910,6 +919,14 @@ fn derive_enum(name: &syn::Ident, generics: &syn:: Generics, data_enum: syn::Dat let indexes = <::columnar::Discriminant>::from_byte_slices(&bytes[_offset ..]); Self { #(#names,)* indexes } } + #[inline(always)] + fn from_u64s(words: &mut impl Iterator) -> Self { + Self { + #(#names: ::columnar::FromBytes::from_u64s(words),)* + variant: ::columnar::FromBytes::from_u64s(words), + offset: ::columnar::FromBytes::from_u64s(words), + } + } } } }; @@ -1203,6 +1220,10 @@ fn derive_tags(name: &syn::Ident, _generics: &syn:: Generics, data_enum: syn::Da fn from_byte_slices(bytes: &[&'columnar [u8]]) -> Self { Self { variant: CVar::from_byte_slices(bytes) } } + #[inline(always)] + fn from_u64s(words: &mut impl Iterator) -> Self { + Self { variant: ::columnar::FromBytes::from_u64s(words) } + } } impl ::columnar::Columnar for #name { diff --git a/src/arc.rs b/src/arc.rs index a526a67..b11cc02 100644 --- a/src/arc.rs +++ b/src/arc.rs @@ -26,6 +26,7 @@ impl<'a, T: FromBytes<'a>> FromBytes<'a> for Arc { const SLICE_COUNT: usize = T::SLICE_COUNT; #[inline(always)] fn from_bytes(bytes: &mut impl Iterator) -> Self { Arc::new(T::from_bytes(bytes)) } #[inline(always)] fn from_byte_slices(bytes: &[&'a [u8]]) -> Self { Arc::new(T::from_byte_slices(bytes)) } + #[inline(always)] fn from_u64s(words: &mut impl Iterator) -> Self { Arc::new(T::from_u64s(words)) } } #[cfg(test)] diff --git a/src/boxed.rs b/src/boxed.rs index 52f019c..935a15b 100644 --- a/src/boxed.rs +++ b/src/boxed.rs @@ -62,6 +62,7 @@ impl<'a, C: 
FromBytes<'a>> FromBytes<'a> for Boxed { const SLICE_COUNT: usize = C::SLICE_COUNT; #[inline(always)] fn from_bytes(bytes: &mut impl Iterator) -> Self { Self(C::from_bytes(bytes)) } #[inline(always)] fn from_byte_slices(bytes: &[&'a [u8]]) -> Self { Self(C::from_byte_slices(bytes)) } + #[inline(always)] fn from_u64s(words: &mut impl Iterator) -> Self { Self(C::from_u64s(words)) } } impl Index for Boxed { type Ref = Boxed; diff --git a/src/bytes.rs b/src/bytes.rs index 84219cd..e64e13f 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -256,6 +256,91 @@ pub mod serialization_neu { }) } + /// Decodes an encoded sequence as `u64`-aligned word slices with trailing byte counts. + /// + /// Each item is `(&[u64], u8)` where the `u8` indicates how many bytes in the last + /// word are valid (0 means all 8 are valid, or the slice is empty). + /// This preserves alignment information from the original `&[u64]` store, avoiding + /// the need for alignment checks when casting back to typed slices. + #[inline(always)] + pub fn decode_u64s(store: &[u64]) -> impl Iterator { + let slices = store[0] as usize / 8 - 1; + let index = &store[..slices + 1]; + let last = index[slices] as usize; + let last_w = (last + 7) / 8; + let words = &store[..last_w]; + (0 .. slices).map(move |i| { + // Non-panicking index access: returns 0 for out-of-bounds, + // which .min(last) will clamp to produce an empty slice. + let upper = (*index.get(i + 1).unwrap_or(&0) as usize).min(last); + let lower = (((*index.get(i).unwrap_or(&0) as usize) + 7) & !7).min(upper); + let upper_w = ((upper + 7) / 8).min(words.len()); + let lower_w = (lower / 8).min(upper_w); + let tail = (upper % 8) as u8; + (&words[lower_w..upper_w], tail) + }) + } + + /// Validates that `store` contains well-formed Indexed-encoded data with `expected_slices` byte slices. + /// + /// Returns `Ok(())` if the data is well-formed, or `Err` with a description of the problem. 
+ /// Call this once at the boundary (e.g., when receiving data from the network or disk) + /// before using the non-panicking `decode_u64s` / `from_u64s` path. + pub fn validate(store: &[u64], expected_slices: usize) -> Result<(), String> { + if store.is_empty() { + return Err("store is empty".into()); + } + let first = store[0] as usize; + if first % 8 != 0 { + return Err(format!("first offset {} is not a multiple of 8", first)); + } + let slices = first / 8 - 1; + if slices + 1 > store.len() { + return Err(format!("index requires {} words but store has {}", slices + 1, store.len())); + } + if slices != expected_slices { + return Err(format!("expected {} slices but found {}", expected_slices, slices)); + } + let store_bytes = store.len() * 8; + let mut prev_upper = first; + for i in 0..slices { + let offset = store[i + 1] as usize; + if offset > store_bytes { + return Err(format!("slice {} offset {} exceeds store size {}", i, offset, store_bytes)); + } + if offset < prev_upper { + return Err(format!("slice {} offset {} precedes previous end {}", i, offset, prev_upper)); + } + // Advance prev_upper to the aligned start of the next slice. + prev_upper = (offset + 7) & !7; + } + Ok(()) + } + + /// Validates that `store` is well-formed and that each slice's byte length is compatible + /// with the given element sizes. + /// + /// `elem_sizes` should have one entry per expected slice, giving the byte size of the + /// element type for that slice (e.g., 8 for `u64`, 4 for `u32`, 1 for `u8`). + /// A slice whose byte length is not a multiple of its element size indicates data corruption. 
+ pub fn validate_typed(store: &[u64], elem_sizes: &[usize]) -> Result<(), String> { + validate(store, elem_sizes.len())?; + let first = store[0] as usize; + let slices = first / 8 - 1; + for i in 0..slices { + let upper = store[i + 1] as usize; + let lower = ((store[i] as usize) + 7) & !7; + let byte_len = upper.saturating_sub(lower); + if byte_len % elem_sizes[i] != 0 { + return Err(format!( + "slice {} has {} bytes, not a multiple of element size {}", + i, byte_len, elem_sizes[i] + )); + } + } + Ok(()) + } + /// Decodes a specific byte slice by index. It will be `u64` aligned. #[inline(always)] pub fn decode_index(store: &[u64], index: u64) -> &[u8] { @@ -269,7 +354,7 @@ pub mod serialization_neu { #[cfg(test)] mod test { - use crate::{Borrow, ContainerOf}; + use crate::{Borrow, ContainerOf, FromBytes}; use crate::common::Push; use crate::AsBytes; @@ -292,6 +377,51 @@ pub mod serialization_neu { assert_roundtrip(&column.borrow()); } + + #[test] + fn validate_well_formed() { + use crate::common::Push; + + // Simple tuple of u64s. + let mut column: ContainerOf<(u64, u64, u64)> = Default::default(); + for i in 0..100u64 { column.push(&(i, i+1, i+2)); } + let mut store = Vec::new(); + encode(&mut store, &column.borrow()); + + // Structural validation. + type B<'a> = as crate::Borrow>::Borrowed<'a>; + assert!(super::validate(&store, B::SLICE_COUNT).is_ok()); + + // Typed validation. + let mut sizes = Vec::new(); + B::element_sizes(&mut sizes); + assert_eq!(sizes, vec![8, 8, 8]); + assert!(super::validate_typed(&store, &sizes).is_ok()); + + // Wrong slice count should fail. 
+ assert!(super::validate(&store, 5).is_err()); + } + + #[test] + fn validate_mixed_types() { + use crate::common::Push; + + let mut column: ContainerOf<(u64, String, Vec)> = Default::default(); + for i in 0..50u64 { + column.push(&(i, format!("hello {i}"), vec![i as u32; i as usize])); + } + let mut store = Vec::new(); + encode(&mut store, &column.borrow()); + + type B<'a> = )> as crate::Borrow>::Borrowed<'a>; + assert!(super::validate(&store, B::SLICE_COUNT).is_ok()); + + let mut sizes = Vec::new(); + B::element_sizes(&mut sizes); + // (u64: 8), (String bounds: 8, String bytes: 1), (Vec bounds: 8, Vec values: 4) + assert_eq!(sizes, vec![8, 8, 1, 8, 4]); + assert!(super::validate_typed(&store, &sizes).is_ok()); + } } } @@ -345,8 +475,8 @@ pub mod stash { #[inline(always)] pub fn borrow<'a>(&'a self) -> ::Borrowed<'a> { match self { Stash::Typed(t) => t.borrow(), - Stash::Bytes(b) => as FromBytes>::from_bytes(&mut Indexed::decode(bytemuck::cast_slice(b))), - Stash::Align(a) => as FromBytes>::from_bytes(&mut Indexed::decode(a)), + Stash::Bytes(b) => as FromBytes>::from_u64s(&mut crate::bytes::serialization_neu::decode_u64s(bytemuck::cast_slice(b))), + Stash::Align(a) => as FromBytes>::from_u64s(&mut crate::bytes::serialization_neu::decode_u64s(a)), } } /// The number of bytes needed to write the contents using the `Indexed` encoder. diff --git a/src/lib.rs b/src/lib.rs index 9ef3cf2..59dc686 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -644,6 +644,34 @@ pub mod common { fn from_byte_slices(bytes: &[&'a [u8]]) -> Self where Self: Sized { Self::from_bytes(&mut bytes.iter().copied()) } + /// Reconstructs `self` from `u64`-aligned word slices with trailing byte counts. + /// + /// Each pair `(&[u64], u8)` provides a word slice and the number of valid bytes + /// in the last word (0 means all 8 bytes are valid, or the slice is empty). 
+ /// Since all columnar data originates from `&[u64]` storage, this avoids the + /// alignment checks that `from_bytes` must perform when casting `&[u8]` back to + /// typed slices. + #[inline(always)] + fn from_u64s(words: &mut impl Iterator) -> Self where Self: Sized { + Self::from_bytes(&mut words.map(|(w, tail)| { + let bytes: &[u8] = bytemuck::cast_slice(w); + let len = if tail == 0 { bytes.len() } else { bytes.len() - (8 - tail as usize) }; + &bytes[..len] + })) + } + /// Reports the element sizes (in bytes) for each slice this type consumes. + /// + /// Used by `validate_typed` to check that each slice's byte length is a multiple + /// of the corresponding element size. For example, a `&[u32]` slice must have a + /// byte length that is a multiple of 4. + /// + /// The default implementation reports 1 (byte-level) for each slice, which + /// accepts any byte length. Override this for types with stricter requirements. + fn element_sizes(sizes: &mut Vec) { + for _ in 0..Self::SLICE_COUNT { + sizes.push(1); + } + } } } diff --git a/src/primitive.rs b/src/primitive.rs index b608b44..e2a5357 100644 --- a/src/primitive.rs +++ b/src/primitive.rs @@ -31,6 +31,18 @@ macro_rules! implement_columnable { fn from_byte_slices(bytes: &[&'a [u8]]) -> Self { bytemuck::try_cast_slice(bytes[0]).unwrap() } + #[inline(always)] + fn from_u64s(words: &mut impl Iterator) -> Self { + let (w, tail) = words.next().unwrap_or((&[], 0)); + // Cast directly from &[u64] to &[$index_type]. Always succeeds since + // u64 alignment (8) >= alignment of any primitive type. + let all: &[$index_type] = bytemuck::cast_slice(w); + let trim = ((8 - tail as usize) % 8) / std::mem::size_of::<$index_type>(); + all.get(..all.len().wrapping_sub(trim)).unwrap_or(&[]) + } + fn element_sizes(sizes: &mut Vec) { + sizes.push(std::mem::size_of::<$index_type>()); + } } impl<'a, const N: usize> crate::AsBytes<'a> for &'a [[$index_type; N]] { #[inline(always)] @@ -49,6 +61,16 @@ macro_rules! 
implement_columnable { fn from_byte_slices(bytes: &[&'a [u8]]) -> Self { bytemuck::try_cast_slice(bytes[0]).unwrap() } + #[inline(always)] + fn from_u64s(words: &mut impl Iterator) -> Self { + let (w, tail) = words.next().unwrap_or((&[], 0)); + let all: &[[$index_type; N]] = bytemuck::cast_slice(w); + let trim = ((8 - tail as usize) % 8) / (std::mem::size_of::<$index_type>() * N); + all.get(..all.len().wrapping_sub(trim)).unwrap_or(&[]) + } + fn element_sizes(sizes: &mut Vec) { + sizes.push(std::mem::size_of::<$index_type>() * N); + } } )* } } diff --git a/src/rc.rs b/src/rc.rs index 62d80ff..87826e4 100644 --- a/src/rc.rs +++ b/src/rc.rs @@ -26,6 +26,7 @@ impl<'a, T: FromBytes<'a>> FromBytes<'a> for Rc { const SLICE_COUNT: usize = T::SLICE_COUNT; #[inline(always)] fn from_bytes(bytes: &mut impl Iterator) -> Self { Rc::new(T::from_bytes(bytes)) } #[inline(always)] fn from_byte_slices(bytes: &[&'a [u8]]) -> Self { Rc::new(T::from_byte_slices(bytes)) } + #[inline(always)] fn from_u64s(words: &mut impl Iterator) -> Self { Rc::new(T::from_u64s(words)) } } #[cfg(test)] diff --git a/src/string.rs b/src/string.rs index 8bc280c..b2197d2 100644 --- a/src/string.rs +++ b/src/string.rs @@ -109,6 +109,17 @@ impl<'a, BC: crate::FromBytes<'a>, VC: crate::FromBytes<'a>> crate::FromBytes<'a values: VC::from_byte_slices(&bytes[BC::SLICE_COUNT..]), } } + #[inline(always)] + fn from_u64s(words: &mut impl Iterator) -> Self { + Self { + bounds: BC::from_u64s(words), + values: VC::from_u64s(words), + } + } + fn element_sizes(sizes: &mut Vec) { + BC::element_sizes(sizes); + VC::element_sizes(sizes); + } } impl Len for Strings { diff --git a/src/sums.rs b/src/sums.rs index 8f85df5..bed14ef 100644 --- a/src/sums.rs +++ b/src/sums.rs @@ -69,6 +69,13 @@ pub mod rank_select { values: >::from_byte_slices(&bytes[CC::SLICE_COUNT..]), } } + #[inline(always)] + fn from_u64s(words: &mut impl Iterator) -> Self { + Self { + counts: CC::from_u64s(words), + values: >::from_u64s(words), + } + } } @@ 
-281,6 +288,14 @@ pub mod result { errs: TC::from_byte_slices(&bytes[ix_count + SC::SLICE_COUNT ..]), } } + #[inline(always)] + fn from_u64s(words: &mut impl Iterator) -> Self { + Self { + indexes: crate::FromBytes::from_u64s(words), + oks: SC::from_u64s(words), + errs: TC::from_u64s(words), + } + } } impl> Len for Results { @@ -543,6 +558,13 @@ pub mod option { somes: TC::from_byte_slices(&bytes[ix_count..]), } } + #[inline(always)] + fn from_u64s(words: &mut impl Iterator) -> Self { + Self { + indexes: crate::FromBytes::from_u64s(words), + somes: TC::from_u64s(words), + } + } } impl> Len for Options { diff --git a/src/tuple.rs b/src/tuple.rs index add36e5..77c265f 100644 --- a/src/tuple.rs +++ b/src/tuple.rs @@ -83,6 +83,15 @@ macro_rules! tuple_impl { )* ($($name,)*) } + #[inline(always)] + #[allow(non_snake_case)] + fn from_u64s(words: &mut impl Iterator) -> Self { + $(let $name = $name::from_u64s(words);)* + ($($name,)*) + } + fn element_sizes(sizes: &mut Vec) { + $($name::element_sizes(sizes);)* + } } impl<$($name: Len),*> Len for ($($name,)*) { diff --git a/src/vector.rs b/src/vector.rs index f38fefb..108f994 100644 --- a/src/vector.rs +++ b/src/vector.rs @@ -140,6 +140,17 @@ impl<'a, TC: crate::FromBytes<'a>, BC: crate::FromBytes<'a>> crate::FromBytes<'a values: TC::from_byte_slices(&bytes[BC::SLICE_COUNT..]), } } + #[inline(always)] + fn from_u64s(words: &mut impl Iterator) -> Self { + Self { + bounds: BC::from_u64s(words), + values: TC::from_u64s(words), + } + } + fn element_sizes(sizes: &mut Vec) { + BC::element_sizes(sizes); + TC::element_sizes(sizes); + } } impl Vecs { From 3bef56622a299ed7b44135d83e5bcf2d7e0471af Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sat, 14 Mar 2026 12:01:58 -0400 Subject: [PATCH 02/11] Remove EncodeDecode trait and Sequence encoding, rename module to indexed Indexed is now the sole encoding format with inherent methods, so callers don't need to import a trait. 
The Sequence format provided no random access or u64-aligned decoding and is no longer needed. Renames serialization_neu to indexed now that there is no other serialization module to distinguish from. Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/decode_asm.rs | 163 +++++++++++++++++++++ examples/decode_bench.rs | 298 +++++++++++++++++++++++++++++++++++++++ src/bytes.rs | 187 +++++++----------------- 3 files changed, 510 insertions(+), 138 deletions(-) create mode 100644 examples/decode_asm.rs create mode 100644 examples/decode_bench.rs diff --git a/examples/decode_asm.rs b/examples/decode_asm.rs new file mode 100644 index 0000000..edf957a --- /dev/null +++ b/examples/decode_asm.rs @@ -0,0 +1,163 @@ +//! Assembly inspection: O(1) field access vs O(k) tuple construction. +//! +//! We compare three approaches: +//! 1. OLD: construct all k fields via from_bytes, access field j — O(k) +//! 2. NEW (eager): construct all k fields via from_u64s, access field j — O(k) but smaller constant +//! 3. NEW (direct): decode ONLY field j from store, skip all others — O(1) + +use columnar::*; +use columnar::common::Index; +use columnar::bytes::indexed; + +// Helper: decode a single field directly from the store as &[u64]. 
+#[inline(always)] +fn decode_one(store: &[u64], field: usize) -> &[u64] { + let slices = store[0] as usize / 8 - 1; + let index = &store[..slices + 1]; + let last = index[slices] as usize; + let last_w = (last + 7) / 8; + let words = &store[..last_w]; + let upper = (index[field + 1] as usize).min(last); + let lower = (((index[field] as usize) + 7) & !7).min(upper); + let upper_w = ((upper + 7) / 8).min(words.len()); + let lower_w = (lower / 8).min(upper_w); + &words[lower_w..upper_w] +} + +// ================================================================ +// OLD PATH: from_bytes (construct all k fields, access field j) +// ================================================================ + +#[no_mangle] pub fn old_3_f0(store: &[u64], i: usize) -> u64 { + type T<'a> = (&'a [u64], &'a [u64], &'a [u64]); + T::from_bytes(&mut indexed::decode(store)).0[i] +} +#[no_mangle] pub fn old_3_flast(store: &[u64], i: usize) -> u64 { + type T<'a> = (&'a [u64], &'a [u64], &'a [u64]); + T::from_bytes(&mut indexed::decode(store)).2[i] +} +#[no_mangle] pub fn old_5_f0(store: &[u64], i: usize) -> u64 { + type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64]); + T::from_bytes(&mut indexed::decode(store)).0[i] +} +#[no_mangle] pub fn old_5_flast(store: &[u64], i: usize) -> u64 { + type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64]); + T::from_bytes(&mut indexed::decode(store)).4[i] +} +#[no_mangle] pub fn old_8_f0(store: &[u64], i: usize) -> u64 { + type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], + &'a [u64], &'a [u64], &'a [u64], &'a [u64]); + T::from_bytes(&mut indexed::decode(store)).0[i] +} +#[no_mangle] pub fn old_8_flast(store: &[u64], i: usize) -> u64 { + type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], + &'a [u64], &'a [u64], &'a [u64], &'a [u64]); + T::from_bytes(&mut indexed::decode(store)).7[i] +} + +// ================================================================ +// NEW EAGER: from_u64s (construct all k fields, access 
field j) +// ================================================================ + +#[no_mangle] pub fn eager_3_f0(store: &[u64], i: usize) -> u64 { + type T<'a> = (&'a [u64], &'a [u64], &'a [u64]); + T::from_u64s(&mut indexed::decode_u64s(store)).0[i] +} +#[no_mangle] pub fn eager_3_flast(store: &[u64], i: usize) -> u64 { + type T<'a> = (&'a [u64], &'a [u64], &'a [u64]); + T::from_u64s(&mut indexed::decode_u64s(store)).2[i] +} +#[no_mangle] pub fn eager_5_f0(store: &[u64], i: usize) -> u64 { + type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64]); + T::from_u64s(&mut indexed::decode_u64s(store)).0[i] +} +#[no_mangle] pub fn eager_5_flast(store: &[u64], i: usize) -> u64 { + type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64]); + T::from_u64s(&mut indexed::decode_u64s(store)).4[i] +} +#[no_mangle] pub fn eager_8_f0(store: &[u64], i: usize) -> u64 { + type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], + &'a [u64], &'a [u64], &'a [u64], &'a [u64]); + T::from_u64s(&mut indexed::decode_u64s(store)).0[i] +} +#[no_mangle] pub fn eager_8_flast(store: &[u64], i: usize) -> u64 { + type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], + &'a [u64], &'a [u64], &'a [u64], &'a [u64]); + T::from_u64s(&mut indexed::decode_u64s(store)).7[i] +} + +// ================================================================ +// NEW DIRECT: decode ONLY the one field needed — should be O(1) +// ================================================================ + +#[no_mangle] pub fn direct_3_f0(store: &[u64], i: usize) -> u64 { + decode_one(store, 0)[i] +} +#[no_mangle] pub fn direct_3_flast(store: &[u64], i: usize) -> u64 { + decode_one(store, 2)[i] +} +#[no_mangle] pub fn direct_5_f0(store: &[u64], i: usize) -> u64 { + decode_one(store, 0)[i] +} +#[no_mangle] pub fn direct_5_flast(store: &[u64], i: usize) -> u64 { + decode_one(store, 4)[i] +} +#[no_mangle] pub fn direct_8_f0(store: &[u64], i: usize) -> u64 { + decode_one(store, 0)[i] +} +#[no_mangle] pub fn 
direct_8_flast(store: &[u64], i: usize) -> u64 { + decode_one(store, 7)[i] +} + +// ================================================================ +// PURE: hand-written from_u64s that is provably panic-free +// Just returns the word slice directly — no cast, no trim, no unwrap. +// ================================================================ + +#[inline(always)] +fn pure_from_u64s_one<'a>(words: &mut impl Iterator) -> &'a [u64] { + match words.next() { + Some((w, _)) => w, + None => &[], + } +} + +#[no_mangle] pub fn pure_3_f0(store: &[u64], i: usize) -> u64 { + let mut w = indexed::decode_u64s(store); + let f0 = pure_from_u64s_one(&mut w); + let _f1 = pure_from_u64s_one(&mut w); + let _f2 = pure_from_u64s_one(&mut w); + f0[i] +} + +#[no_mangle] pub fn pure_8_f0(store: &[u64], i: usize) -> u64 { + let mut w = indexed::decode_u64s(store); + let f0 = pure_from_u64s_one(&mut w); + let _f1 = pure_from_u64s_one(&mut w); + let _f2 = pure_from_u64s_one(&mut w); + let _f3 = pure_from_u64s_one(&mut w); + let _f4 = pure_from_u64s_one(&mut w); + let _f5 = pure_from_u64s_one(&mut w); + let _f6 = pure_from_u64s_one(&mut w); + let _f7 = pure_from_u64s_one(&mut w); + f0[i] +} + +#[no_mangle] pub fn pure_8_flast(store: &[u64], i: usize) -> u64 { + let mut w = indexed::decode_u64s(store); + let _f0 = pure_from_u64s_one(&mut w); + let _f1 = pure_from_u64s_one(&mut w); + let _f2 = pure_from_u64s_one(&mut w); + let _f3 = pure_from_u64s_one(&mut w); + let _f4 = pure_from_u64s_one(&mut w); + let _f5 = pure_from_u64s_one(&mut w); + let _f6 = pure_from_u64s_one(&mut w); + let f7 = pure_from_u64s_one(&mut w); + f7[i] +} + +fn main() { + let mut store = vec![0u64; 100]; + store[0] = 32; store[1] = 32; store[2] = 32; store[3] = 32; + println!("{}", std::hint::black_box(direct_3_f0(std::hint::black_box(&store), 0))); +} diff --git a/examples/decode_bench.rs b/examples/decode_bench.rs new file mode 100644 index 0000000..f4c113d --- /dev/null +++ b/examples/decode_bench.rs @@ -0,0 +1,298 
@@ +//! Benchmarks for Indexed::decode improvements. +//! +//! Measures decode + field access from encoded `[u64]` data, +//! exercising both simple and complex types, and separating +//! decode overhead from field access overhead. + +use columnar::*; +use columnar::common::{Push, Index}; +use columnar::bytes::Indexed; + +use std::hint::black_box; +use std::time::Instant; + +/// Time a closure over `iters` iterations, return ns per iteration. +fn bench_ns(iters: u64, mut f: F) -> f64 { + // Warmup + for _ in 0..iters.min(1000) { f(); } + let start = Instant::now(); + for _ in 0..iters { f(); } + let elapsed = start.elapsed(); + elapsed.as_nanos() as f64 / iters as f64 +} + +/// Encode a container into Indexed format, returning the `[u64]` store. +fn encode_indexed(container: &C) -> Vec +where + for<'a> C::Borrowed<'a>: AsBytes<'a>, +{ + let mut store = Vec::new(); + Indexed::encode(&mut store, &container.borrow()); + store +} + +// ============================================================ +// Experiment 1: Simple type (u64) — decode + access single field +// ============================================================ + +fn exp1_u64(n: usize, iters: u64) { + let mut container: ContainerOf = Default::default(); + for i in 0..n as u64 { container.push(i); } + let store = encode_indexed(&container); + + // Measure: decode + access element n/2 + let idx = n / 2; + let ns_decode_access = bench_ns(iters, || { + let mut slices = Indexed::decode(&store); + let borrowed = <&[u64]>::from_bytes(&mut slices); + black_box(borrowed[idx]); + }); + + // Measure: decode once, repeatedly access + let mut slices_once = Indexed::decode(&store); + let borrowed_once = <&[u64]>::from_bytes(&mut slices_once); + let ns_access_only = bench_ns(iters, || { + black_box(borrowed_once[idx]); + }); + + println!(" u64 (n={n}): decode+access = {ns_decode_access:.1} ns, access only = {ns_access_only:.1} ns, decode overhead = {:.1} ns", + ns_decode_access - ns_access_only); +} + +// 
============================================================ +// Experiment 2: Vec — decode + access, harder type +// ============================================================ + +fn exp2_vec_u8(n: usize, iters: u64) { + let mut container: ContainerOf> = Default::default(); + for i in 0..n { + container.push(vec![i as u8; (i % 32) + 1]); + } + let store = encode_indexed(&container); + + let idx = n / 2; + + // Decode + access + let ns_decode_access = bench_ns(iters, || { + let mut slices = Indexed::decode(&store); + type B<'a> = > as Borrow>::Borrowed<'a>; + let borrowed = B::from_bytes(&mut slices); + black_box(borrowed.get(idx)); + }); + + // Access only + let slices_vec: Vec<&[u8]> = Indexed::decode(&store).collect(); + let mut slices_iter = slices_vec.iter().copied(); + type B2<'a> = > as Borrow>::Borrowed<'a>; + let borrowed_once = B2::from_bytes(&mut slices_iter); + let ns_access_only = bench_ns(iters, || { + black_box(borrowed_once.get(idx)); + }); + + println!(" Vec (n={n}): decode+access = {ns_decode_access:.1} ns, access only = {ns_access_only:.1} ns, decode overhead = {:.1} ns", + ns_decode_access - ns_access_only); +} + +// ============================================================ +// Experiment 3: Stash end-to-end — from raw bytes to typed access +// ============================================================ + +fn exp3_stash_u64(n: usize, iters: u64) { + use columnar::bytes::stash::Stash; + + let mut container: ContainerOf = Default::default(); + for i in 0..n as u64 { container.push(i); } + + // Serialize to bytes (as Stash would receive them) + let mut bytes_buf: Vec = Vec::new(); + Indexed::write(&mut bytes_buf, &container.borrow()).unwrap(); + + let stash: Stash, Vec> = Stash::from(bytes_buf); + + let idx = n / 2; + let ns = bench_ns(iters, || { + let borrowed = stash.borrow(); + black_box(borrowed.get(idx)); + }); + println!(" Stash (n={n}): borrow+access = {ns:.1} ns"); +} + +fn exp3_stash_vec_u8(n: usize, iters: u64) { + use 
columnar::bytes::stash::Stash; + + let mut container: ContainerOf> = Default::default(); + for i in 0..n { + container.push(vec![i as u8; (i % 32) + 1]); + } + + let mut bytes_buf: Vec = Vec::new(); + Indexed::write(&mut bytes_buf, &container.borrow()).unwrap(); + + let stash: Stash>, Vec> = Stash::from(bytes_buf); + + let idx = n / 2; + let ns = bench_ns(iters, || { + let borrowed = stash.borrow(); + black_box(borrowed.get(idx)); + }); + println!(" Stash> (n={n}): borrow+access = {ns:.1} ns"); +} + +// ============================================================ +// Experiment 4: Scaling with tuple width — access the LAST field +// of (u64, u64, ..., u64) tuples of increasing width. +// This reveals whether skipping fields has cost. +// ============================================================ + +// We need concrete types for each tuple width. +// We'll do 1, 2, 3, 5, 8 fields. + +fn exp4_tuple1(n: usize, iters: u64) { + let mut c: ContainerOf<(u64,)> = Default::default(); + for i in 0..n as u64 { c.push(&(i,)); } + let store = encode_indexed(&c); + let idx = n / 2; + + let ns = bench_ns(iters, || { + let mut slices = Indexed::decode(&store); + type B<'a> = as Borrow>::Borrowed<'a>; + let b = B::from_bytes(&mut slices); + black_box(b.get(idx)); + }); + println!(" (u64,) x1 — last field: decode+access = {ns:.1} ns"); +} + +fn exp4_tuple2(n: usize, iters: u64) { + let mut c: ContainerOf<(u64, u64)> = Default::default(); + for i in 0..n as u64 { c.push(&(i, i+1)); } + let store = encode_indexed(&c); + let idx = n / 2; + + let ns = bench_ns(iters, || { + let mut slices = Indexed::decode(&store); + type B<'a> = as Borrow>::Borrowed<'a>; + let b = B::from_bytes(&mut slices); + let (_a, b_val) = b.get(idx); + black_box(b_val); + }); + println!(" (u64, u64) x2 — last field: decode+access = {ns:.1} ns"); +} + +fn exp4_tuple3(n: usize, iters: u64) { + let mut c: ContainerOf<(u64, u64, u64)> = Default::default(); + for i in 0..n as u64 { c.push(&(i, i+1, i+2)); } + let 
store = encode_indexed(&c); + let idx = n / 2; + + let ns = bench_ns(iters, || { + let mut slices = Indexed::decode(&store); + type B<'a> = as Borrow>::Borrowed<'a>; + let b = B::from_bytes(&mut slices); + let (_a, _b, c_val) = b.get(idx); + black_box(c_val); + }); + println!(" (u64, u64, u64) x3 — last field: decode+access = {ns:.1} ns"); +} + +fn exp4_tuple5(n: usize, iters: u64) { + let mut c: ContainerOf<(u64, u64, u64, u64, u64)> = Default::default(); + for i in 0..n as u64 { c.push(&(i, i+1, i+2, i+3, i+4)); } + let store = encode_indexed(&c); + let idx = n / 2; + + let ns = bench_ns(iters, || { + let mut slices = Indexed::decode(&store); + type B<'a> = as Borrow>::Borrowed<'a>; + let b = B::from_bytes(&mut slices); + let (_a, _b, _c, _d, e_val) = b.get(idx); + black_box(e_val); + }); + println!(" (u64 x5) — last field: decode+access = {ns:.1} ns"); +} + +fn exp4_tuple8(n: usize, iters: u64) { + let mut c: ContainerOf<(u64, u64, u64, u64, u64, u64, u64, u64)> = Default::default(); + for i in 0..n as u64 { c.push(&(i, i+1, i+2, i+3, i+4, i+5, i+6, i+7)); } + let store = encode_indexed(&c); + let idx = n / 2; + + let ns = bench_ns(iters, || { + let mut slices = Indexed::decode(&store); + type B<'a> = as Borrow>::Borrowed<'a>; + let b = B::from_bytes(&mut slices); + let (_a, _b, _c, _d, _e, _f, _g, h_val) = b.get(idx); + black_box(h_val); + }); + println!(" (u64 x8) — last field: decode+access = {ns:.1} ns"); +} + +// ============================================================ +// Experiment 5: Decode iterator overhead — just iterate decode, +// don't construct anything. How much does decode itself cost? +// ============================================================ + +fn exp5_decode_only(n: usize, num_slices: usize, iters: u64) { + // Create a type with `num_slices` byte slices. + // We'll use a tuple of u64s, each contributing 1 slice. + // But we need to be generic... let's just manually encode. 
+ // Actually, let's use the tuple types and just measure decode. + + // For simplicity, encode a (u64,) repeated, so we get `num_slices` slices. + // Actually let's just encode directly to get the right number of slices. + + // Build a store with `num_slices` byte regions. + let mut store: Vec = Vec::new(); + // First: write (num_slices + 1) offsets. + let offsets = num_slices + 1; + let offsets_end = (offsets * 8) as u64; + store.push(offsets_end); + let mut pos = offsets_end; + for _ in 0..num_slices { + let len = (n * 8) as u64; // n u64s per slice + pos += len; + store.push(pos); + } + // Then write the actual data + for s in 0..num_slices { + for i in 0..n { + store.push((s * n + i) as u64); + } + } + + let ns = bench_ns(iters, || { + let slices = Indexed::decode(&store); + for slice in slices { + black_box(slice); + } + }); + println!(" decode only ({num_slices} slices, {n} items each): {ns:.1} ns"); +} + +fn main() { + let n = 1000; + let iters = 1_000_000; + + println!("=== Experiment 1: Simple u64 decode + access ==="); + exp1_u64(n, iters); + + println!("\n=== Experiment 2: Vec decode + access ==="); + exp2_vec_u8(n, iters); + + println!("\n=== Experiment 3: Stash end-to-end ==="); + exp3_stash_u64(n, iters); + exp3_stash_vec_u8(n, iters); + + println!("\n=== Experiment 4: Tuple width scaling (access last field) ==="); + exp4_tuple1(n, iters); + exp4_tuple2(n, iters); + exp4_tuple3(n, iters); + exp4_tuple5(n, iters); + exp4_tuple8(n, iters); + + println!("\n=== Experiment 5: Decode iterator overhead ==="); + exp5_decode_only(n, 1, iters); + exp5_decode_only(n, 3, iters); + exp5_decode_only(n, 5, iters); + exp5_decode_only(n, 8, iters); + exp5_decode_only(n, 16, iters); +} diff --git a/src/bytes.rs b/src/bytes.rs index e64e13f..b4b5aa0 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -1,158 +1,69 @@ //! Logic related to the transformation to and from bytes. //! //! The methods here line up with the `AsBytes` and `FromBytes` traits. 
- -use crate::AsBytes; - -/// A coupled encode/decode pair for byte sequences. -pub trait EncodeDecode { - /// Encoded length in number of `u64` words required. - fn length_in_words<'a, A>(bytes: &A) -> usize where A : AsBytes<'a>; - /// Encoded length in number of `u8` bytes required. - /// - /// This method should always be eight times `Self::length_in_words`, and is provided for convenience and clarity. - fn length_in_bytes<'a, A>(bytes: &A) -> usize where A : AsBytes<'a> { 8 * Self::length_in_words(bytes) } - /// Encodes `bytes` into a sequence of `u64`. - fn encode<'a, A>(store: &mut Vec, bytes: &A) where A : AsBytes<'a>; - /// Writes `bytes` in the encoded format to an arbitrary writer. - fn write<'a, A, W: std::io::Write>(writer: W, bytes: &A) -> std::io::Result<()> where A : AsBytes<'a>; - /// Decodes bytes from a sequence of `u64`. - fn decode(store: &[u64]) -> impl Iterator; -} - -/// A sequential byte layout for `AsBytes` and `FromBytes` implementors. -/// -/// The layout is aligned like a sequence of `u64`, where we repeatedly announce a length, -/// and then follow it by that many bytes. We may need to follow this with padding bytes. -pub use serialization::Sequence; -mod serialization { - - use crate::AsBytes; - - /// Encodes and decodes bytes sequences, by prepending the length and appending the all sequences. - pub struct Sequence; - impl super::EncodeDecode for Sequence { - fn length_in_words<'a, A>(bytes: &A) -> usize where A : AsBytes<'a> { - // Each byte slice has one `u64` for the length, and then as many `u64`s as needed to hold all bytes. 
- bytes.as_bytes().map(|(_align, bytes)| 1 + bytes.len().div_ceil(8)).sum() - } - fn encode<'a, A>(store: &mut Vec, bytes: &A) where A : AsBytes<'a> { - encode(store, bytes.as_bytes()) - } - fn write<'a, A, W: std::io::Write>(writer: W, bytes: &A) -> std::io::Result<()> where A : AsBytes<'a> { - write(writer, bytes.as_bytes()) - } - fn decode(store: &[u64]) -> impl Iterator { - decode(store) - } - } - - /// Encodes a sequence of byte slices as their length followed by their bytes, aligned to 8 bytes. - /// - /// Each length will be exactly 8 bytes, and the bytes that follow are padded out to a multiple of 8 bytes. - /// When reading the data, the length is in bytes, and one should consume those bytes and advance over padding. - pub fn encode<'a>(store: &mut Vec, bytes: impl Iterator) { - for (align, bytes) in bytes { - assert!(align <= 8); - store.push(bytes.len() as u64); - let whole_words = 8 * (bytes.len() / 8); - // We want to extend `store` by `bytes`, but `bytes` may not be `u64` aligned. - // In the latter case, init `store` and cast and copy onto it as a byte slice. - if let Ok(words) = bytemuck::try_cast_slice(&bytes[.. whole_words]) { - store.extend_from_slice(words); - } - else { - let store_len = store.len(); - store.resize(store_len + whole_words/8, 0); - let slice = bytemuck::try_cast_slice_mut(&mut store[store_len..]).expect("&[u64] should convert to &[u8]"); - slice.copy_from_slice(&bytes[.. whole_words]); - } - let remaining_bytes = &bytes[whole_words..]; - if !remaining_bytes.is_empty() { - let mut remainder = 0u64; - let transmute: &mut [u8] = bytemuck::try_cast_slice_mut(std::slice::from_mut(&mut remainder)).expect("&[u64] should convert to &[u8]"); - for (i, byte) in remaining_bytes.iter().enumerate() { - transmute[i] = *byte; - } - store.push(remainder); - } - } - } - - /// Writes a sequence of byte slices as their length followed by their bytes, padded to 8 bytes. 
- /// - /// Each length will be exactly 8 bytes, and the bytes that follow are padded out to a multiple of 8 bytes. - /// When reading the data, the length is in bytes, and one should consume those bytes and advance over padding. - pub fn write<'a>(mut writer: impl std::io::Write, bytes: impl Iterator) -> std::io::Result<()> { - // Columnar data is serialized as a sequence of `u64` values, with each `[u8]` slice - // serialize as first its length in bytes, and then as many `u64` values as needed. - // Padding should be added, but only for alignment; no specific values are required. - for (align, bytes) in bytes { - assert!(align <= 8); - let length = u64::try_from(bytes.len()).unwrap(); - writer.write_all(bytemuck::cast_slice(std::slice::from_ref(&length)))?; - writer.write_all(bytes)?; - let padding = usize::try_from((8 - (length % 8)) % 8).unwrap(); - writer.write_all(&[0; 8][..padding])?; - } - Ok(()) - } - - /// Decodes a sequence of byte slices from their length followed by their bytes. - /// - /// This decoder matches the `encode` function above. - /// In particular, it anticipates padding bytes when the length is not a multiple of eight. - pub fn decode(store: &[u64]) -> Decoder<'_> { - Decoder { store } - } - - /// An iterator over byte slices, decoding from a sequence of lengths followed by bytes. - pub struct Decoder<'a> { - store: &'a [u64], - } - - impl<'a> Iterator for Decoder<'a> { - type Item = &'a [u8]; - fn next(&mut self) -> Option { - if let Some(length) = self.store.first() { - let length = *length as usize; - self.store = &self.store[1..]; - let whole_words = if length % 8 == 0 { length / 8 } else { length / 8 + 1 }; - let bytes: &[u8] = bytemuck::try_cast_slice(&self.store[..whole_words]).expect("&[u64] should convert to &[u8]"); - self.store = &self.store[whole_words..]; - Some(&bytes[..length]) - } else { - None - } - } - } -} +//! +//! The encoding uses an index of byte offsets prepended to the data, enabling +//! 
random access to individual byte slices and `u64`-aligned decoding. /// A binary encoding of sequences of byte slices. /// /// The encoding starts with a sequence of n+1 offsets describing where to find the n slices in the bytes that follow. -/// Treating the offsets as a byte slice too, the each offset indicates the location (in bytes) of the end of its slice. +/// Treating the offsets as a byte slice too, each offset indicates the location (in bytes) of the end of its slice. /// Each byte slice can be found from a pair of adjacent offsets, where the first is rounded up to a multiple of eight. -pub use serialization_neu::Indexed; -pub mod serialization_neu { +pub use indexed::Indexed; +pub mod indexed { use crate::AsBytes; - /// Encodes and decodes bytes sequences, using an index of offsets. + /// Encodes and decodes byte sequences, using an index of offsets. + /// + /// The `Indexed` format prepends `n+1` byte offsets for `n` slices, followed by the + /// slice data with `u64` alignment padding. This enables random access to individual + /// slices and efficient `u64`-aligned decoding via [`decode_u64s`]. pub struct Indexed; - impl super::EncodeDecode for Indexed { - fn length_in_words<'a, A>(bytes: &A) -> usize where A : AsBytes<'a> { + impl Indexed { + /// Encoded length in number of `u64` words required. + pub fn length_in_words<'a, A>(bytes: &A) -> usize where A : AsBytes<'a> { 1 + bytes.as_bytes().map(|(_align, bytes)| 1 + bytes.len().div_ceil(8)).sum::() } - fn encode<'a, A>(store: &mut Vec, bytes: &A) where A : AsBytes<'a> { + /// Encoded length in number of `u8` bytes required. + pub fn length_in_bytes<'a, A>(bytes: &A) -> usize where A : AsBytes<'a> { 8 * Self::length_in_words(bytes) } + /// Encodes `bytes` into a sequence of `u64`. 
+ pub fn encode<'a, A>(store: &mut Vec, bytes: &A) where A : AsBytes<'a> { encode(store, bytes) } - fn write<'a, A, W: std::io::Write>(writer: W, bytes: &A) -> std::io::Result<()> where A : AsBytes<'a> { + /// Writes `bytes` in the encoded format to an arbitrary writer. + pub fn write<'a, A, W: std::io::Write>(writer: W, bytes: &A) -> std::io::Result<()> where A : AsBytes<'a> { write(writer, bytes) } - fn decode(store: &[u64]) -> impl Iterator { + /// Decodes bytes from a sequence of `u64`, returning `&[u8]` slices. + pub fn decode(store: &[u64]) -> impl Iterator { decode(store) } + /// Decodes bytes as `u64`-aligned word slices with trailing byte counts. + /// + /// See the free function [`decode_u64s`] for details. + pub fn decode_u64s(store: &[u64]) -> impl Iterator { + decode_u64s(store) + } + /// Decodes a specific byte slice by index. + /// + /// See the free function [`decode_index`] for details. + pub fn decode_index(store: &[u64], index: u64) -> &[u8] { + decode_index(store, index) + } + /// Validates structural integrity of encoded data. + /// + /// See the free function [`validate`] for details. + pub fn validate(store: &[u64], expected_slices: usize) -> Result<(), String> { + validate(store, expected_slices) + } + /// Validates structural integrity and type compatibility of encoded data. + /// + /// See the free function [`validate_typed`] for details. + pub fn validate_typed(store: &[u64], elem_sizes: &[usize]) -> Result<(), String> { + validate_typed(store, elem_sizes) + } } /// Encodes `item` into `u64` aligned words. @@ -429,7 +340,7 @@ pub mod serialization_neu { pub mod stash { use crate::{Len, FromBytes}; - use crate::bytes::{EncodeDecode, Indexed}; + use crate::bytes::Indexed; /// A container of either typed columns, or serialized bytes that can be borrowed as the former. 
/// @@ -475,8 +386,8 @@ pub mod stash { #[inline(always)] pub fn borrow<'a>(&'a self) -> ::Borrowed<'a> { match self { Stash::Typed(t) => t.borrow(), - Stash::Bytes(b) => as FromBytes>::from_u64s(&mut crate::bytes::serialization_neu::decode_u64s(bytemuck::cast_slice(b))), - Stash::Align(a) => as FromBytes>::from_u64s(&mut crate::bytes::serialization_neu::decode_u64s(a)), + Stash::Bytes(b) => as FromBytes>::from_u64s(&mut crate::bytes::indexed::decode_u64s(bytemuck::cast_slice(b))), + Stash::Align(a) => as FromBytes>::from_u64s(&mut crate::bytes::indexed::decode_u64s(a)), } } /// The number of bytes needed to write the contents using the `Indexed` encoder. From 3d5f1a34888902359a02cafcb70feadba51c0fd3 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sat, 14 Mar 2026 12:10:33 -0400 Subject: [PATCH 03/11] Move validate_typed to FromBytes::validate, remove Indexed struct validate_typed was partly a function of the Indexed format and partly of the type being decoded. It now lives as FromBytes::validate, which combines structural and type-compatibility checks using element_sizes. The Indexed struct's methods were all one-line delegates to free functions in the indexed module. Removed the struct and inlined length_in_words/length_in_bytes as free functions. Callers use the module directly (columnar::bytes::indexed::encode, etc). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/decode_bench.rs | 2 +- src/bytes.rs | 99 ++++++---------------------------------- src/lib.rs | 27 ++++++++++- 3 files changed, 42 insertions(+), 86 deletions(-) diff --git a/examples/decode_bench.rs b/examples/decode_bench.rs index f4c113d..7b3465e 100644 --- a/examples/decode_bench.rs +++ b/examples/decode_bench.rs @@ -6,7 +6,7 @@ use columnar::*; use columnar::common::{Push, Index}; -use columnar::bytes::Indexed; +use columnar::bytes::indexed as Indexed; use std::hint::black_box; use std::time::Instant; diff --git a/src/bytes.rs b/src/bytes.rs index b4b5aa0..a302715 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -10,61 +10,16 @@ /// The encoding starts with a sequence of n+1 offsets describing where to find the n slices in the bytes that follow. /// Treating the offsets as a byte slice too, each offset indicates the location (in bytes) of the end of its slice. /// Each byte slice can be found from a pair of adjacent offsets, where the first is rounded up to a multiple of eight. -pub use indexed::Indexed; pub mod indexed { use crate::AsBytes; - /// Encodes and decodes byte sequences, using an index of offsets. - /// - /// The `Indexed` format prepends `n+1` byte offsets for `n` slices, followed by the - /// slice data with `u64` alignment padding. This enables random access to individual - /// slices and efficient `u64`-aligned decoding via [`decode_u64s`]. - pub struct Indexed; - impl Indexed { - /// Encoded length in number of `u64` words required. - pub fn length_in_words<'a, A>(bytes: &A) -> usize where A : AsBytes<'a> { - 1 + bytes.as_bytes().map(|(_align, bytes)| 1 + bytes.len().div_ceil(8)).sum::() - } - /// Encoded length in number of `u8` bytes required. - pub fn length_in_bytes<'a, A>(bytes: &A) -> usize where A : AsBytes<'a> { 8 * Self::length_in_words(bytes) } - /// Encodes `bytes` into a sequence of `u64`. 
- pub fn encode<'a, A>(store: &mut Vec, bytes: &A) where A : AsBytes<'a> { - encode(store, bytes) - } - /// Writes `bytes` in the encoded format to an arbitrary writer. - pub fn write<'a, A, W: std::io::Write>(writer: W, bytes: &A) -> std::io::Result<()> where A : AsBytes<'a> { - write(writer, bytes) - } - /// Decodes bytes from a sequence of `u64`, returning `&[u8]` slices. - pub fn decode(store: &[u64]) -> impl Iterator { - decode(store) - } - /// Decodes bytes as `u64`-aligned word slices with trailing byte counts. - /// - /// See the free function [`decode_u64s`] for details. - pub fn decode_u64s(store: &[u64]) -> impl Iterator { - decode_u64s(store) - } - /// Decodes a specific byte slice by index. - /// - /// See the free function [`decode_index`] for details. - pub fn decode_index(store: &[u64], index: u64) -> &[u8] { - decode_index(store, index) - } - /// Validates structural integrity of encoded data. - /// - /// See the free function [`validate`] for details. - pub fn validate(store: &[u64], expected_slices: usize) -> Result<(), String> { - validate(store, expected_slices) - } - /// Validates structural integrity and type compatibility of encoded data. - /// - /// See the free function [`validate_typed`] for details. - pub fn validate_typed(store: &[u64], elem_sizes: &[usize]) -> Result<(), String> { - validate_typed(store, elem_sizes) - } + /// Encoded length in number of `u64` words required. + pub fn length_in_words<'a, A>(bytes: &A) -> usize where A : AsBytes<'a> { + 1 + bytes.as_bytes().map(|(_align, bytes)| 1 + bytes.len().div_ceil(8)).sum::() } + /// Encoded length in number of `u8` bytes required. + pub fn length_in_bytes<'a, A>(bytes: &A) -> usize where A : AsBytes<'a> { 8 * length_in_words(bytes) } /// Encodes `item` into `u64` aligned words. /// @@ -228,30 +183,6 @@ pub mod indexed { Ok(()) } - /// Validates that `store` is well-formed and that each slice's byte length is compatible - /// with the given element sizes. 
- /// - /// `elem_sizes` should have one entry per expected slice, giving the byte size of the - /// element type for that slice (e.g., 8 for `u64`, 4 for `u32`, 1 for `u8`). - /// A slice whose byte length is not a multiple of its element size indicates data corruption. - pub fn validate_typed(store: &[u64], elem_sizes: &[usize]) -> Result<(), String> { - validate(store, elem_sizes.len())?; - let first = store[0] as usize; - let slices = first / 8 - 1; - for i in 0..slices { - let upper = store[i + 1] as usize; - let lower = ((store[i] as usize) + 7) & !7; - let byte_len = upper.saturating_sub(lower); - if byte_len % elem_sizes[i] != 0 { - return Err(format!( - "slice {} has {} bytes, not a multiple of element size {}", - i, byte_len, elem_sizes[i] - )); - } - } - Ok(()) - } - /// Decodes a specific byte slice by index. It will be `u64` aligned. #[inline(always)] pub fn decode_index(store: &[u64], index: u64) -> &[u8] { @@ -299,15 +230,16 @@ pub mod indexed { let mut store = Vec::new(); encode(&mut store, &column.borrow()); - // Structural validation. type B<'a> = as crate::Borrow>::Borrowed<'a>; + + // Structural validation. assert!(super::validate(&store, B::SLICE_COUNT).is_ok()); - // Typed validation. + // Typed validation via FromBytes::validate. let mut sizes = Vec::new(); B::element_sizes(&mut sizes); assert_eq!(sizes, vec![8, 8, 8]); - assert!(super::validate_typed(&store, &sizes).is_ok()); + assert!(B::validate(&store).is_ok()); // Wrong slice count should fail. 
assert!(super::validate(&store, 5).is_err()); @@ -325,13 +257,14 @@ pub mod indexed { encode(&mut store, &column.borrow()); type B<'a> = )> as crate::Borrow>::Borrowed<'a>; - assert!(super::validate(&store, B::SLICE_COUNT).is_ok()); + // Element sizes: (u64: 8), (String bounds: 8, String bytes: 1), (Vec bounds: 8, Vec values: 4) let mut sizes = Vec::new(); B::element_sizes(&mut sizes); - // (u64: 8), (String bounds: 8, String bytes: 1), (Vec bounds: 8, Vec values: 4) assert_eq!(sizes, vec![8, 8, 1, 8, 4]); - assert!(super::validate_typed(&store, &sizes).is_ok()); + + // Full validation via FromBytes::validate. + assert!(B::validate(&store).is_ok()); } } } @@ -340,8 +273,6 @@ pub mod indexed { pub mod stash { use crate::{Len, FromBytes}; - use crate::bytes::Indexed; - /// A container of either typed columns, or serialized bytes that can be borrowed as the former. /// /// When `B` dereferences to a byte slice, the container can be borrowed as if the container type `C`. @@ -394,7 +325,7 @@ pub mod stash { pub fn length_in_bytes(&self) -> usize { match self { // We'll need one u64 for the length, then the length rounded up to a multiple of 8. - Stash::Typed(t) => 8 * Indexed::length_in_words(&t.borrow()), + Stash::Typed(t) => crate::bytes::indexed::length_in_bytes(&t.borrow()), Stash::Bytes(b) => b.len(), Stash::Align(a) => 8 * a.len(), } @@ -402,7 +333,7 @@ pub mod stash { /// Write the contents into a `std::io::Write` using the `Indexed` encoder. 
pub fn into_bytes(&self, writer: &mut W) { match self { - Stash::Typed(t) => { Indexed::write(writer, &t.borrow()).unwrap() }, + Stash::Typed(t) => { crate::bytes::indexed::write(writer, &t.borrow()).unwrap() }, Stash::Bytes(b) => writer.write_all(&b[..]).unwrap(), Stash::Align(a) => writer.write_all(bytemuck::cast_slice(&a[..])).unwrap(), } diff --git a/src/lib.rs b/src/lib.rs index 59dc686..39eb097 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -661,7 +661,7 @@ pub mod common { } /// Reports the element sizes (in bytes) for each slice this type consumes. /// - /// Used by `validate_typed` to check that each slice's byte length is a multiple + /// Used by [`Self::validate`] to check that each slice's byte length is a multiple /// of the corresponding element size. For example, a `&[u32]` slice must have a /// byte length that is a multiple of 4. /// @@ -672,6 +672,31 @@ pub mod common { sizes.push(1); } } + /// Validates that `store` contains well-formed encoded data compatible with this type. + /// + /// Checks both structural integrity (valid offsets, correct slice count) and type + /// compatibility (each slice's byte length is a multiple of its element size). + /// Call this once at the boundary when receiving data from an untrusted source, + /// before using the non-panicking `from_u64s` path. 
+ fn validate(store: &[u64]) -> Result<(), String> where Self: Sized { + let mut sizes = Vec::new(); + Self::element_sizes(&mut sizes); + crate::bytes::indexed::validate(store, sizes.len())?; + let first = store[0] as usize; + for (i, elem_size) in sizes.iter().enumerate() { + let upper = store[i + 1] as usize; + let lower = ((store[i] as usize) + 7) & !7; + let byte_len = upper.saturating_sub(lower); + if byte_len % elem_size != 0 { + return Err(format!( + "slice {} has {} bytes, not a multiple of element size {}", + i, byte_len, elem_size + )); + } + } + let _ = first; + Ok(()) + } } } From 0c4544192823781b705c11c9ab8a2f16714a5410 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sat, 14 Mar 2026 12:13:17 -0400 Subject: [PATCH 04/11] Hide element_sizes as implementation detail of FromBytes::validate element_sizes is only used internally by validate. Mark it #[doc(hidden)] and simplify tests to exercise validate directly. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/bytes.rs | 16 ---------------- src/lib.rs | 11 ++++------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/src/bytes.rs b/src/bytes.rs index a302715..3c4ed04 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -224,21 +224,12 @@ pub mod indexed { fn validate_well_formed() { use crate::common::Push; - // Simple tuple of u64s. let mut column: ContainerOf<(u64, u64, u64)> = Default::default(); for i in 0..100u64 { column.push(&(i, i+1, i+2)); } let mut store = Vec::new(); encode(&mut store, &column.borrow()); type B<'a> = as crate::Borrow>::Borrowed<'a>; - - // Structural validation. - assert!(super::validate(&store, B::SLICE_COUNT).is_ok()); - - // Typed validation via FromBytes::validate. - let mut sizes = Vec::new(); - B::element_sizes(&mut sizes); - assert_eq!(sizes, vec![8, 8, 8]); assert!(B::validate(&store).is_ok()); // Wrong slice count should fail. 
@@ -257,13 +248,6 @@ pub mod indexed { encode(&mut store, &column.borrow()); type B<'a> = )> as crate::Borrow>::Borrowed<'a>; - - // Element sizes: (u64: 8), (String bounds: 8, String bytes: 1), (Vec bounds: 8, Vec values: 4) - let mut sizes = Vec::new(); - B::element_sizes(&mut sizes); - assert_eq!(sizes, vec![8, 8, 1, 8, 4]); - - // Full validation via FromBytes::validate. assert!(B::validate(&store).is_ok()); } } diff --git a/src/lib.rs b/src/lib.rs index 39eb097..fc5b08d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -659,14 +659,11 @@ pub mod common { &bytes[..len] })) } - /// Reports the element sizes (in bytes) for each slice this type consumes. + /// Implementation detail of [`Self::validate`]: reports element sizes (in bytes) for each slice. /// - /// Used by [`Self::validate`] to check that each slice's byte length is a multiple - /// of the corresponding element size. For example, a `&[u32]` slice must have a - /// byte length that is a multiple of 4. - /// - /// The default implementation reports 1 (byte-level) for each slice, which - /// accepts any byte length. Override this for types with stricter requirements. + /// Override this for types with element sizes other than 1. + /// Prefer calling [`Self::validate`] rather than using this directly. + #[doc(hidden)] fn element_sizes(sizes: &mut Vec) { for _ in 0..Self::SLICE_COUNT { sizes.push(1); From c588292e7727fb3692a38d70205d25a51d5acd9e Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sat, 14 Mar 2026 12:15:41 -0400 Subject: [PATCH 05/11] Make both element_sizes and validate public on FromBytes element_sizes is public for implementors to override. validate is public for callers to use at trust boundaries. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/lib.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index fc5b08d..b3bcc18 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -659,11 +659,11 @@ pub mod common { &bytes[..len] })) } - /// Implementation detail of [`Self::validate`]: reports element sizes (in bytes) for each slice. + /// Reports the element sizes (in bytes) for each slice this type consumes. /// - /// Override this for types with element sizes other than 1. - /// Prefer calling [`Self::validate`] rather than using this directly. - #[doc(hidden)] + /// Implementors should override this to report their actual element sizes. + /// For example, `&[u32]` pushes `4`, while a tuple delegates to each field. + /// The default implementation pushes `1` for each slice (accepting any byte length). fn element_sizes(sizes: &mut Vec) { for _ in 0..Self::SLICE_COUNT { sizes.push(1); @@ -675,6 +675,8 @@ pub mod common { /// compatibility (each slice's byte length is a multiple of its element size). /// Call this once at the boundary when receiving data from an untrusted source, /// before using the non-panicking `from_u64s` path. + /// + /// Built from [`Self::element_sizes`]; generally should not need to be overridden. 
fn validate(store: &[u64]) -> Result<(), String> where Self: Sized { let mut sizes = Vec::new(); Self::element_sizes(&mut sizes); From d4757f2a5857406ab11f67b219a7c390b68daa38 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sat, 14 Mar 2026 13:03:16 -0400 Subject: [PATCH 06/11] Clean up decode_asm example to three representative approaches Trimmed from experimental accumulation to a clean comparison of: - from_bytes + decode (O(k) baseline) - from_u64s + decode_u64s (O(1) in k via dead code elimination) - decode_field random access (O(1) in both k and field position) Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/decode_asm.rs | 152 ++++++++++++----------------------------- 1 file changed, 44 insertions(+), 108 deletions(-) diff --git a/examples/decode_asm.rs b/examples/decode_asm.rs index edf957a..2ac57ed 100644 --- a/examples/decode_asm.rs +++ b/examples/decode_asm.rs @@ -1,163 +1,99 @@ -//! Assembly inspection: O(1) field access vs O(k) tuple construction. +//! Assembly inspection for decode paths. //! -//! We compare three approaches: -//! 1. OLD: construct all k fields via from_bytes, access field j — O(k) -//! 2. NEW (eager): construct all k fields via from_u64s, access field j — O(k) but smaller constant -//! 3. NEW (direct): decode ONLY field j from store, skip all others — O(1) +//! Compares three approaches to accessing a single field of a k-tuple +//! stored in Indexed-encoded `&[u64]` data: +//! +//! 1. `from_bytes` + `decode`: constructs all k fields, O(k) +//! 2. `from_u64s` + `decode_u64s`: non-panicking, LLVM eliminates unused fields, O(1) in k +//! 3. `decode_field` (random access): decodes one field directly, O(1) in k and j +//! +//! Build with: `cargo rustc --example decode_asm --release -- --emit asm` use columnar::*; -use columnar::common::Index; use columnar::bytes::indexed; -// Helper: decode a single field directly from the store as &[u64]. 
-#[inline(always)] -fn decode_one(store: &[u64], field: usize) -> &[u64] { - let slices = store[0] as usize / 8 - 1; - let index = &store[..slices + 1]; - let last = index[slices] as usize; - let last_w = (last + 7) / 8; - let words = &store[..last_w]; - let upper = (index[field + 1] as usize).min(last); - let lower = (((index[field] as usize) + 7) & !7).min(upper); - let upper_w = ((upper + 7) / 8).min(words.len()); - let lower_w = (lower / 8).min(upper_w); - &words[lower_w..upper_w] -} - // ================================================================ -// OLD PATH: from_bytes (construct all k fields, access field j) +// from_bytes path (construct all k fields, access field j) // ================================================================ -#[no_mangle] pub fn old_3_f0(store: &[u64], i: usize) -> u64 { +#[no_mangle] pub fn bytes_3_f0(store: &[u64], i: usize) -> u64 { type T<'a> = (&'a [u64], &'a [u64], &'a [u64]); T::from_bytes(&mut indexed::decode(store)).0[i] } -#[no_mangle] pub fn old_3_flast(store: &[u64], i: usize) -> u64 { +#[no_mangle] pub fn bytes_3_flast(store: &[u64], i: usize) -> u64 { type T<'a> = (&'a [u64], &'a [u64], &'a [u64]); T::from_bytes(&mut indexed::decode(store)).2[i] } -#[no_mangle] pub fn old_5_f0(store: &[u64], i: usize) -> u64 { - type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64]); - T::from_bytes(&mut indexed::decode(store)).0[i] -} -#[no_mangle] pub fn old_5_flast(store: &[u64], i: usize) -> u64 { - type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64]); - T::from_bytes(&mut indexed::decode(store)).4[i] -} -#[no_mangle] pub fn old_8_f0(store: &[u64], i: usize) -> u64 { +#[no_mangle] pub fn bytes_8_f0(store: &[u64], i: usize) -> u64 { type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64]); T::from_bytes(&mut indexed::decode(store)).0[i] } -#[no_mangle] pub fn old_8_flast(store: &[u64], i: usize) -> u64 { +#[no_mangle] pub fn bytes_8_flast(store: 
&[u64], i: usize) -> u64 { type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64]); T::from_bytes(&mut indexed::decode(store)).7[i] } // ================================================================ -// NEW EAGER: from_u64s (construct all k fields, access field j) +// from_u64s path (non-panicking, LLVM eliminates unused fields) // ================================================================ -#[no_mangle] pub fn eager_3_f0(store: &[u64], i: usize) -> u64 { +#[no_mangle] pub fn u64s_3_f0(store: &[u64], i: usize) -> u64 { type T<'a> = (&'a [u64], &'a [u64], &'a [u64]); T::from_u64s(&mut indexed::decode_u64s(store)).0[i] } -#[no_mangle] pub fn eager_3_flast(store: &[u64], i: usize) -> u64 { +#[no_mangle] pub fn u64s_3_flast(store: &[u64], i: usize) -> u64 { type T<'a> = (&'a [u64], &'a [u64], &'a [u64]); T::from_u64s(&mut indexed::decode_u64s(store)).2[i] } -#[no_mangle] pub fn eager_5_f0(store: &[u64], i: usize) -> u64 { - type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64]); - T::from_u64s(&mut indexed::decode_u64s(store)).0[i] -} -#[no_mangle] pub fn eager_5_flast(store: &[u64], i: usize) -> u64 { - type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64]); - T::from_u64s(&mut indexed::decode_u64s(store)).4[i] -} -#[no_mangle] pub fn eager_8_f0(store: &[u64], i: usize) -> u64 { +#[no_mangle] pub fn u64s_8_f0(store: &[u64], i: usize) -> u64 { type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64]); T::from_u64s(&mut indexed::decode_u64s(store)).0[i] } -#[no_mangle] pub fn eager_8_flast(store: &[u64], i: usize) -> u64 { +#[no_mangle] pub fn u64s_8_flast(store: &[u64], i: usize) -> u64 { type T<'a> = (&'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64], &'a [u64]); T::from_u64s(&mut indexed::decode_u64s(store)).7[i] } // ================================================================ -// NEW DIRECT: decode ONLY the one 
field needed — should be O(1) -// ================================================================ - -#[no_mangle] pub fn direct_3_f0(store: &[u64], i: usize) -> u64 { - decode_one(store, 0)[i] -} -#[no_mangle] pub fn direct_3_flast(store: &[u64], i: usize) -> u64 { - decode_one(store, 2)[i] -} -#[no_mangle] pub fn direct_5_f0(store: &[u64], i: usize) -> u64 { - decode_one(store, 0)[i] -} -#[no_mangle] pub fn direct_5_flast(store: &[u64], i: usize) -> u64 { - decode_one(store, 4)[i] -} -#[no_mangle] pub fn direct_8_f0(store: &[u64], i: usize) -> u64 { - decode_one(store, 0)[i] -} -#[no_mangle] pub fn direct_8_flast(store: &[u64], i: usize) -> u64 { - decode_one(store, 7)[i] -} - -// ================================================================ -// PURE: hand-written from_u64s that is provably panic-free -// Just returns the word slice directly — no cast, no trim, no unwrap. +// Random access (decode one field directly, O(1) in both k and j) // ================================================================ +/// Decode field `k` directly from store as `(&[u64], u8)`. +/// Each call is independent — no iterator state. 
#[inline(always)] -fn pure_from_u64s_one<'a>(words: &mut impl Iterator) -> &'a [u64] { - match words.next() { - Some((w, _)) => w, - None => &[], - } +fn decode_field(store: &[u64], k: usize) -> (&[u64], u8) { + let slices = store[0] as usize / 8 - 1; + let index = &store[..slices + 1]; + let last = *index.last().unwrap_or(&0) as usize; + let last_w = (last + 7) / 8; + let words = &store[..last_w]; + let upper = (*index.get(k + 1).unwrap_or(&0) as usize).min(last); + let lower = (((*index.get(k).unwrap_or(&0) as usize) + 7) & !7).min(upper); + let upper_w = ((upper + 7) / 8).min(words.len()); + let lower_w = (lower / 8).min(upper_w); + let tail = (upper % 8) as u8; + (&words[lower_w..upper_w], tail) } -#[no_mangle] pub fn pure_3_f0(store: &[u64], i: usize) -> u64 { - let mut w = indexed::decode_u64s(store); - let f0 = pure_from_u64s_one(&mut w); - let _f1 = pure_from_u64s_one(&mut w); - let _f2 = pure_from_u64s_one(&mut w); - f0[i] +#[no_mangle] pub fn field_3_f0(store: &[u64], i: usize) -> u64 { + decode_field(store, 0).0[i] } - -#[no_mangle] pub fn pure_8_f0(store: &[u64], i: usize) -> u64 { - let mut w = indexed::decode_u64s(store); - let f0 = pure_from_u64s_one(&mut w); - let _f1 = pure_from_u64s_one(&mut w); - let _f2 = pure_from_u64s_one(&mut w); - let _f3 = pure_from_u64s_one(&mut w); - let _f4 = pure_from_u64s_one(&mut w); - let _f5 = pure_from_u64s_one(&mut w); - let _f6 = pure_from_u64s_one(&mut w); - let _f7 = pure_from_u64s_one(&mut w); - f0[i] +#[no_mangle] pub fn field_3_flast(store: &[u64], i: usize) -> u64 { + decode_field(store, 2).0[i] } - -#[no_mangle] pub fn pure_8_flast(store: &[u64], i: usize) -> u64 { - let mut w = indexed::decode_u64s(store); - let _f0 = pure_from_u64s_one(&mut w); - let _f1 = pure_from_u64s_one(&mut w); - let _f2 = pure_from_u64s_one(&mut w); - let _f3 = pure_from_u64s_one(&mut w); - let _f4 = pure_from_u64s_one(&mut w); - let _f5 = pure_from_u64s_one(&mut w); - let _f6 = pure_from_u64s_one(&mut w); - let f7 = 
pure_from_u64s_one(&mut w); - f7[i] +#[no_mangle] pub fn field_8_f0(store: &[u64], i: usize) -> u64 { + decode_field(store, 0).0[i] +} +#[no_mangle] pub fn field_8_flast(store: &[u64], i: usize) -> u64 { + decode_field(store, 7).0[i] } fn main() { let mut store = vec![0u64; 100]; store[0] = 32; store[1] = 32; store[2] = 32; store[3] = 32; - println!("{}", std::hint::black_box(direct_3_f0(std::hint::black_box(&store), 0))); + println!("{}", std::hint::black_box(field_3_f0(std::hint::black_box(&store), 0))); } From cddc3f7baf3220114a62e82571b24f267d37fc7f Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sat, 14 Mar 2026 13:45:26 -0400 Subject: [PATCH 07/11] Update benchmarks to use indexed module instead of removed Sequence The bench and serde benchmarks referenced the removed EncodeDecode trait and Sequence type. Updated to use the indexed module directly. Co-Authored-By: Claude Opus 4.6 (1M context) --- benches/bench.rs | 6 +++--- benches/serde.rs | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benches/bench.rs b/benches/bench.rs index c487d93..041993b 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -1,6 +1,6 @@ use bencher::{benchmark_group, benchmark_main, Bencher}; use columnar::{Clear, Columnar}; -use columnar::bytes::{EncodeDecode, Sequence}; +use columnar::bytes::indexed; fn empty_copy(bencher: &mut Bencher) { _bench_copy(bencher, vec![(); 1024]); } fn option_copy(bencher: &mut Bencher) { _bench_copy(bencher, vec![Option::::None; 1024]); } @@ -61,7 +61,7 @@ fn _bench_copy(bencher: &mut Bencher, record: T) where T::Contai arena.push(&record); } use columnar::Borrow; - bencher.bytes = Sequence::length_in_bytes(&arena.borrow()) as u64; + bencher.bytes = indexed::length_in_bytes(&arena.borrow()) as u64; arena.clear(); bencher.iter(|| { @@ -83,7 +83,7 @@ fn _bench_extend(bencher: &mut Bencher, record: T) where T::Cont arena.push(&record); } use columnar::{Borrow, Container}; - bencher.bytes = 
Sequence::length_in_bytes(&arena.borrow()) as u64; + bencher.bytes = indexed::length_in_bytes(&arena.borrow()) as u64; let arena2 = arena.clone(); diff --git a/benches/serde.rs b/benches/serde.rs index 4d4b12e..7d48b4c 100644 --- a/benches/serde.rs +++ b/benches/serde.rs @@ -1,6 +1,6 @@ use bencher::{benchmark_group, benchmark_main, Bencher}; use columnar::{Columnar, Container, Clear, FromBytes}; -use columnar::bytes::{EncodeDecode, Sequence}; +use columnar::bytes::indexed; use serde::{Serialize, Deserialize}; fn goser_new(b: &mut Bencher) { @@ -19,7 +19,7 @@ fn goser_push(b: &mut Bencher) { container.push(&log); } let mut words = vec![]; - Sequence::encode(&mut words, &container.borrow()); + indexed::encode(&mut words, &container.borrow()); b.bytes = 8 * words.len() as u64; b.iter(|| { container.clear(); @@ -50,11 +50,11 @@ fn goser_encode(b: &mut Bencher) { container.push(&log); } let mut words = vec![]; - Sequence::encode(&mut words, &container.borrow()); + indexed::encode(&mut words, &container.borrow()); b.bytes = 8 * words.len() as u64; b.iter(|| { words.clear(); - Sequence::encode(&mut words, &container.borrow()); + indexed::encode(&mut words, &container.borrow()); bencher::black_box(&words); }); } @@ -67,10 +67,10 @@ fn goser_decode(b: &mut Bencher) { for _ in 0..1024 { container.push(&log); } - Sequence::encode(&mut words, &container.borrow()); + indexed::encode(&mut words, &container.borrow()); b.bytes = 8 * words.len() as u64; b.iter(|| { - let mut slices = Sequence::decode(&mut words); + let mut slices = indexed::decode(&mut words); let foo = <::Container as Container>::Borrowed::from_bytes(&mut slices); bencher::black_box(foo); }); From 7aeea5bbc99aa7f98797c1baa039fc0e9edacb52 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sat, 14 Mar 2026 14:37:18 -0400 Subject: [PATCH 08/11] Fix from_u64s for Discriminant and updated enum container layout The enum container struct now uses a single `indexes: Discriminant` field instead of separate `variant` and 
`offset` fields. Update the derive macro's from_u64s to match, and add from_u64s/element_sizes to the Discriminant FromBytes impl. Co-Authored-By: Claude Opus 4.6 (1M context) --- columnar_derive/src/lib.rs | 3 +-- src/sums.rs | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/columnar_derive/src/lib.rs b/columnar_derive/src/lib.rs index 18b24f7..5ae2075 100644 --- a/columnar_derive/src/lib.rs +++ b/columnar_derive/src/lib.rs @@ -923,8 +923,7 @@ fn derive_enum(name: &syn::Ident, generics: &syn:: Generics, data_enum: syn::Dat fn from_u64s(words: &mut impl Iterator) -> Self { Self { #(#names: ::columnar::FromBytes::from_u64s(words),)* - variant: ::columnar::FromBytes::from_u64s(words), - offset: ::columnar::FromBytes::from_u64s(words), + indexes: ::columnar::FromBytes::from_u64s(words), } } } diff --git a/src/sums.rs b/src/sums.rs index bed14ef..3471f68 100644 --- a/src/sums.rs +++ b/src/sums.rs @@ -882,6 +882,22 @@ pub mod discriminant { let offset = <&'a [u64]>::from_byte_slices(&bytes[2 + <&'a [u8]>::SLICE_COUNT..]); Self { tag, count, variant, offset } } + #[inline(always)] + fn from_u64s(words: &mut impl Iterator) -> Self { + let (w_tag, _) = words.next().unwrap_or((&[], 0)); + let tag = w_tag.first().unwrap_or(&0); + let (w_count, _) = words.next().unwrap_or((&[], 0)); + let count = w_count.first().unwrap_or(&0); + let variant = crate::FromBytes::from_u64s(words); + let offset = crate::FromBytes::from_u64s(words); + Self { tag, count, variant, offset } + } + fn element_sizes(sizes: &mut Vec) { + sizes.push(8); // tag + sizes.push(8); // count + <&[u8]>::element_sizes(sizes); + <&[u64]>::element_sizes(sizes); + } } #[cfg(test)] From 72bd6f4c347e61aac395d7fe4e575ece55a3222f Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sat, 14 Mar 2026 15:13:09 -0400 Subject: [PATCH 09/11] Rework validate to take decoded slices, add validate_typed entry point FromBytes::validate now takes &[(&[u64], u8)] matching the from_u64s input shape, 
making it composable for nested types. Added indexed::validate_typed::<T> as the single entry point that combines structural and type-level validation. Also added from_u64s and element_sizes for Discriminant. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/bytes.rs | 29 ++++++++++++++++++++++++++--- src/lib.rs | 22 ++++++++++------------ 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/src/bytes.rs b/src/bytes.rs index 3c4ed04..43ecc06 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -183,6 +183,29 @@ pub mod indexed { Ok(()) } + /// Validates that `store` contains well-formed data compatible with type `T`. + /// + /// This combines structural validation ([`validate`]) with type-level validation + /// ([`FromBytes::validate`]) in a single call. Use this at trust boundaries when + /// receiving encoded data before passing it to `from_u64s`. + /// + /// The `from_u64s` decode path performs no further validation at access time: + /// it will not panic on malformed data, but may return incorrect results. + /// There is no undefined behavior in any case. Call this method once before + /// using `from_u64s` to ensure the data is well-formed. + /// + /// ```ignore + /// type B<'a> = ::Borrowed<'a>; + /// indexed::validate_typed::(&store)?; + /// // Now safe to use the non-panicking path: + /// let borrowed = B::from_u64s(&mut indexed::decode_u64s(&store)); + /// ``` + pub fn validate_typed<'a, T: crate::FromBytes<'a>>(store: &[u64]) -> Result<(), String> { + validate(store, T::SLICE_COUNT)?; + let slices: Vec<_> = decode_u64s(store).collect(); + T::validate(&slices) + } + /// Decodes a specific byte slice by index. It will be `u64` aligned.
#[inline(always)] pub fn decode_index(store: &[u64], index: u64) -> &[u8] { @@ -230,9 +253,9 @@ pub mod indexed { encode(&mut store, &column.borrow()); type B<'a> = as crate::Borrow>::Borrowed<'a>; - assert!(B::validate(&store).is_ok()); + assert!(super::validate_typed::(&store).is_ok()); - // Wrong slice count should fail. + // Wrong slice count should fail structural validation. assert!(super::validate(&store, 5).is_err()); } @@ -248,7 +271,7 @@ pub mod indexed { encode(&mut store, &column.borrow()); type B<'a> = )> as crate::Borrow>::Borrowed<'a>; - assert!(B::validate(&store).is_ok()); + assert!(super::validate_typed::(&store).is_ok()); } } } diff --git a/src/lib.rs b/src/lib.rs index b3bcc18..5681c94 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -669,23 +669,22 @@ pub mod common { sizes.push(1); } } - /// Validates that `store` contains well-formed encoded data compatible with this type. + /// Validates that the given slices are compatible with this type. /// - /// Checks both structural integrity (valid offsets, correct slice count) and type - /// compatibility (each slice's byte length is a multiple of its element size). - /// Call this once at the boundary when receiving data from an untrusted source, - /// before using the non-panicking `from_u64s` path. + /// The input matches the shape of `from_u64s`: each `(&[u64], u8)` is a word slice + /// and trailing byte count. This type consumes `Self::SLICE_COUNT` entries and checks + /// that each slice's byte length is a multiple of its element size. /// /// Built from [`Self::element_sizes`]; generally should not need to be overridden. 
- fn validate(store: &[u64]) -> Result<(), String> where Self: Sized { + fn validate(slices: &[(&[u64], u8)]) -> Result<(), String> where Self: Sized { + if slices.len() < Self::SLICE_COUNT { + return Err(format!("expected {} slices but got {}", Self::SLICE_COUNT, slices.len())); + } let mut sizes = Vec::new(); Self::element_sizes(&mut sizes); - crate::bytes::indexed::validate(store, sizes.len())?; - let first = store[0] as usize; for (i, elem_size) in sizes.iter().enumerate() { - let upper = store[i + 1] as usize; - let lower = ((store[i] as usize) + 7) & !7; - let byte_len = upper.saturating_sub(lower); + let (words, tail) = &slices[i]; + let byte_len = words.len() * 8 - ((8 - *tail as usize) % 8); if byte_len % elem_size != 0 { return Err(format!( "slice {} has {} bytes, not a multiple of element size {}", @@ -693,7 +692,6 @@ pub mod common { )); } } - let _ = first; Ok(()) } } From 2d288ed75c213f6c93e8bfd1737285fc7635ddd8 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sat, 14 Mar 2026 15:40:58 -0400 Subject: [PATCH 10/11] Rename validate to validate_structure, validate_typed to validate The obvious name should do the obvious thing: indexed::validate::<T> does full validation (structural + type compatibility). The structural-only check is now validate_structure, an implementation detail. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/bytes.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/bytes.rs b/src/bytes.rs index 43ecc06..ee908a3 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -147,12 +147,12 @@ pub mod indexed { }) } - /// Validates that `store` contains well-formed Indexed-encoded data with `expected_slices` byte slices. + /// Validates the internal structure of Indexed-encoded data. /// - /// Returns `Ok(())` if the data is well-formed, or `Err` with a description of the problem.
- /// Call this once at the boundary (e.g., when receiving data from the network or disk) - /// before using the non-panicking `decode_u64s` / `from_u64s` path. - pub fn validate(store: &[u64], expected_slices: usize) -> Result<(), String> { + /// Checks that offsets are well-formed, in bounds, and that the slice count matches + /// `expected_slices`. This is a building block for [`validate`]; prefer calling + /// `validate` directly unless you need structural checks alone. + pub fn validate_structure(store: &[u64], expected_slices: usize) -> Result<(), String> { if store.is_empty() { return Err("store is empty".into()); } @@ -185,9 +185,9 @@ pub mod indexed { /// Validates that `store` contains well-formed data compatible with type `T`. /// - /// This combines structural validation ([`validate`]) with type-level validation - /// ([`FromBytes::validate`]) in a single call. Use this at trust boundaries when - /// receiving encoded data before passing it to `from_u64s`. + /// Checks both the internal structure of the encoding (offsets, slice count) and + /// type-level compatibility (each slice's byte length is a multiple of its element + /// size). Call this once at trust boundaries when receiving encoded data. /// /// The `from_u64s` decode path performs no further validation at access time: /// it will not panic on malformed data, but may return incorrect results. 
@@ -196,12 +196,12 @@ pub mod indexed { /// /// ```ignore /// type B<'a> = ::Borrowed<'a>; - /// indexed::validate_typed::(&store)?; + /// indexed::validate::(&store)?; /// // Now safe to use the non-panicking path: /// let borrowed = B::from_u64s(&mut indexed::decode_u64s(&store)); /// ``` - pub fn validate_typed<'a, T: crate::FromBytes<'a>>(store: &[u64]) -> Result<(), String> { - validate(store, T::SLICE_COUNT)?; + pub fn validate<'a, T: crate::FromBytes<'a>>(store: &[u64]) -> Result<(), String> { + validate_structure(store, T::SLICE_COUNT)?; let slices: Vec<_> = decode_u64s(store).collect(); T::validate(&slices) } @@ -253,10 +253,10 @@ pub mod indexed { encode(&mut store, &column.borrow()); type B<'a> = as crate::Borrow>::Borrowed<'a>; - assert!(super::validate_typed::(&store).is_ok()); + assert!(super::validate::(&store).is_ok()); // Wrong slice count should fail structural validation. - assert!(super::validate(&store, 5).is_err()); + assert!(super::validate_structure(&store, 5).is_err()); } #[test] @@ -271,7 +271,7 @@ pub mod indexed { encode(&mut store, &column.borrow()); type B<'a> = )> as crate::Borrow>::Borrowed<'a>; - assert!(super::validate_typed::(&store).is_ok()); + assert!(super::validate::(&store).is_ok()); } } } From a760bd094879be9b2fcfac011b03ffb6ae5b1f23 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Sat, 14 Mar 2026 15:45:23 -0400 Subject: [PATCH 11/11] Remove unused FromBytes import in test module Co-Authored-By: Claude Opus 4.6 (1M context) --- src/bytes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bytes.rs b/src/bytes.rs index ee908a3..356186b 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -219,7 +219,7 @@ pub mod indexed { #[cfg(test)] mod test { - use crate::{Borrow, ContainerOf, FromBytes}; + use crate::{Borrow, ContainerOf}; use crate::common::Push; use crate::AsBytes;