Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
- `DacsByte::from_slice` now accepts a generic index type, removing `from_slice_with_index`.
- Added `BitVectorBuilder` and zero-copy `BitVectorData` backed by `anybytes::View`.
- Introduced `IndexBuilder` trait with a `Built` type and adjusted serialization helpers.
- `Rank9SelIndex` now stores its serialized bytes internally and `to_bytes` returns this buffer.
- Renamed crate to `succdisk` to reflect on-disk succinct data structures.
- Renamed crate from `succdisk` to `jerky`.
- Replaced the old `BitVector` with the generic `BitVector<I>` and renamed the
Expand All @@ -20,8 +19,9 @@
- Documented the byte layout produced by `DacsByte::to_bytes` with ASCII art.
- Switched `anybytes` dependency to track the upstream Git repository for the
latest changes.
- Removed internal byte buffers from data structures; `WaveletMatrix`,
`DacsByte`, and `Rank9SelIndex` no longer store a `Bytes` field.
- Flags are serialized before level data to eliminate padding.
- `DacsByte` stores all flags and levels in one contiguous byte buffer and `to_bytes` simply clones this buffer.
- Added `get_bits` methods to `BitVectorData` and `BitVector`.
- Removed deprecated `size_in_bytes` helpers.
- Added `scripts/devtest.sh` and `scripts/preflight.sh` for testing and
Expand Down Expand Up @@ -66,4 +66,3 @@
- Documented `WaveletMatrix` usage in `README.md`.
- Moved README usage examples to runnable files in `examples/`.
- Added `compact_vector` example showing construction and retrieval.
- WaveletMatrix now stores its serialized word buffer for zero-copy access and preallocates building memory.
1 change: 1 addition & 0 deletions INVENTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@

## Discovered Issues
- `katex.html` performs manual string replacements; consider DOM-based manipulation.
- Revisit zero-copy storage strategy: avoid extra copies when storing serialized bytes in structures.
100 changes: 48 additions & 52 deletions src/bit_vector/rank9sel/inner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ const SELECT_ZEROS_PER_HINT: usize = SELECT_ONES_PER_HINT;
/// The index implementation separated from the bit vector.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Rank9SelIndex<const SELECT1: bool = true, const SELECT0: bool = true> {
bytes: Bytes,
len: usize,
block_rank_pairs: View<[usize]>,
select1_hints: Option<View<[usize]>>,
Expand Down Expand Up @@ -49,43 +48,16 @@ impl<const SELECT1: bool, const SELECT0: bool> Rank9SelIndexBuilder<SELECT1, SEL

/// Freezes and returns [`Rank9SelIndex`].
pub fn build(self) -> Rank9SelIndex<SELECT1, SELECT0> {
let mut store = Vec::new();
store.push(self.len);
store.push(self.block_rank_pairs.len());
store.extend_from_slice(&self.block_rank_pairs);

if SELECT1 {
let hints = self.select1_hints.unwrap_or_default();
store.push(hints.len());
store.extend_from_slice(&hints);
}

if SELECT0 {
let hints = self.select0_hints.unwrap_or_default();
store.push(hints.len());
store.extend_from_slice(&hints);
}

let bytes = Bytes::from_source(store);
let mut parser = bytes.clone();
let _len = *parser.view_prefix::<usize>().unwrap();
let brp_len = *parser.view_prefix::<usize>().unwrap();
let block_rank_pairs = parser.view_prefix_with_elems::<[usize]>(brp_len).unwrap();
let select1_hints = if SELECT1 {
let l = *parser.view_prefix::<usize>().unwrap();
Some(parser.view_prefix_with_elems::<[usize]>(l).unwrap())
} else {
None
};
let select0_hints = if SELECT0 {
let l = *parser.view_prefix::<usize>().unwrap();
Some(parser.view_prefix_with_elems::<[usize]>(l).unwrap())
} else {
None
};

let block_rank_pairs = Bytes::from_source(self.block_rank_pairs)
.view::<[usize]>()
.unwrap();
let select1_hints = self
.select1_hints
.map(|v| Bytes::from_source(v).view::<[usize]>().unwrap());
let select0_hints = self
.select0_hints
.map(|v| Bytes::from_source(v).view::<[usize]>().unwrap());
Rank9SelIndex::<SELECT1, SELECT0> {
bytes,
len: self.len,
block_rank_pairs,
select1_hints,
Expand Down Expand Up @@ -469,46 +441,52 @@ impl<const SELECT1: bool, const SELECT0: bool> Rank9SelIndex<SELECT1, SELECT0> {

impl<const SELECT1: bool, const SELECT0: bool> Rank9SelIndex<SELECT1, SELECT0> {
/// Reconstructs the index from zero-copy [`Bytes`].
pub fn from_bytes(bytes: Bytes) -> Result<Self> {
let mut parser = bytes.clone();
let len = *parser
pub fn from_bytes(mut bytes: Bytes) -> Result<Self> {
let len = *bytes
.view_prefix::<usize>()
.map_err(|e| anyhow::anyhow!(e))?;
let brp_len = *parser
let brp_len = *bytes
.view_prefix::<usize>()
.map_err(|e| anyhow::anyhow!(e))?;
let block_rank_pairs = parser
let block_rank_pairs = bytes
.view_prefix_with_elems::<[usize]>(brp_len)
.map_err(|e| anyhow::anyhow!(e))?;
let select1_hints = if SELECT1 {
let l = *parser
let has_select1 = *bytes
.view_prefix::<usize>()
.map_err(|e| anyhow::anyhow!(e))?
!= 0;
let select1_hints = if has_select1 {
let l = *bytes
.view_prefix::<usize>()
.map_err(|e| anyhow::anyhow!(e))?;
Some(
parser
bytes
.view_prefix_with_elems::<[usize]>(l)
.map_err(|e| anyhow::anyhow!(e))?,
)
} else {
None
};
let select0_hints = if SELECT0 {
let l = *parser
let has_select0 = *bytes
.view_prefix::<usize>()
.map_err(|e| anyhow::anyhow!(e))?
!= 0;
let select0_hints = if has_select0 {
let l = *bytes
.view_prefix::<usize>()
.map_err(|e| anyhow::anyhow!(e))?;
Some(
parser
bytes
.view_prefix_with_elems::<[usize]>(l)
.map_err(|e| anyhow::anyhow!(e))?,
)
} else {
None
};
if !parser.as_ref().is_empty() {
return Err(anyhow::anyhow!("extra bytes"));
if has_select1 != SELECT1 || has_select0 != SELECT0 {
return Err(anyhow::anyhow!("mismatched hint flags"));
}
Ok(Self {
bytes,
len,
block_rank_pairs,
select1_hints,
Expand All @@ -518,7 +496,25 @@ impl<const SELECT1: bool, const SELECT0: bool> Rank9SelIndex<SELECT1, SELECT0> {

/// Serializes the index metadata and data into a [`Bytes`] buffer.
pub fn to_bytes(&self) -> Bytes {
self.bytes.clone()
let mut store: Vec<usize> = Vec::new();
store.push(self.len);
store.push(self.block_rank_pairs.len());
store.extend_from_slice(self.block_rank_pairs.as_ref());
if let Some(ref v) = self.select1_hints {
store.push(1);
store.push(v.len());
store.extend_from_slice(v.as_ref());
} else {
store.push(0);
}
if let Some(ref v) = self.select0_hints {
store.push(1);
store.push(v.len());
store.extend_from_slice(v.as_ref());
} else {
store.push(0);
}
Bytes::from_source(store)
}
}

Expand Down
76 changes: 22 additions & 54 deletions src/char_sequences/wavelet_matrix.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ use anybytes::Bytes;
use anyhow::{anyhow, Result};

use crate::bit_vector::{
Access, BitVector, BitVectorData, BitVectorIndex, NumBits, Rank, Select, WORD_LEN,
Access, BitVector, BitVectorBuilder, BitVectorData, BitVectorIndex, NumBits, Rank, Select,
WORD_LEN,
};
use crate::int_vectors::{CompactVector, CompactVectorBuilder};
use crate::utils;
Expand Down Expand Up @@ -55,21 +56,10 @@ use crate::utils;
/// # References
///
/// - F. Claude and G. Navarro, "The Wavelet Matrix," in SPIRE 2012.
#[derive(Debug, Clone, PartialEq, Eq)]
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub struct WaveletMatrix<I> {
layers: Vec<BitVector<I>>,
alph_size: usize,
bytes: Bytes,
}

impl<I> Default for WaveletMatrix<I> {
fn default() -> Self {
Self {
layers: Vec::new(),
alph_size: 0,
bytes: Bytes::empty(),
}
}
}

/// Metadata describing the serialized form of a [`WaveletMatrix`].
Expand Down Expand Up @@ -99,79 +89,55 @@ where
return Err(anyhow!("seq must not be empty."));
}

let len = seq.len();
let alph_size = seq.iter().max().unwrap() + 1;
let alph_width = utils::needed_bits(alph_size);
let num_words = (len + WORD_LEN - 1) / WORD_LEN;

let mut store = vec![0usize; alph_width * num_words];

let mut zeros = seq;
let mut ones = CompactVector::new(alph_width)?.freeze();
let mut layers = vec![];

for depth in 0..alph_width {
let mut next_zeros = CompactVectorBuilder::new(alph_width).unwrap();
let mut next_ones = CompactVectorBuilder::new(alph_width).unwrap();
let layer = &mut store[depth * num_words..(depth + 1) * num_words];
let mut pos = 0;
Self::filter_into(
let mut bv = BitVectorBuilder::new();
Self::filter(
&zeros,
alph_width - depth - 1,
&mut next_zeros,
&mut next_ones,
layer,
&mut pos,
&mut bv,
);
Self::filter_into(
Self::filter(
&ones,
alph_width - depth - 1,
&mut next_zeros,
&mut next_ones,
layer,
&mut pos,
&mut bv,
);
zeros = next_zeros.freeze();
ones = next_ones.freeze();
let bits = bv.freeze::<I>();
layers.push(bits);
}

let bytes = Bytes::from_source(store);
let mut layer_bytes = bytes.clone();
let mut layers = Vec::with_capacity(alph_width);
for _ in 0..alph_width {
let words = layer_bytes
.view_prefix_with_elems::<[usize]>(num_words)
.map_err(|e| anyhow!(e))?;
let data = BitVectorData { words, len };
let index = I::build(&data);
layers.push(BitVector::new(data, index));
}

Ok(Self {
layers,
alph_size,
bytes,
})
Ok(Self { layers, alph_size })
}

fn filter_into(
fn filter(
seq: &CompactVector,
shift: usize,
next_zeros: &mut CompactVectorBuilder,
next_ones: &mut CompactVectorBuilder,
layer: &mut [usize],
pos: &mut usize,
bv: &mut BitVectorBuilder,
) {
for val in seq.iter() {
let bit = ((val >> shift) & 1) == 1;
bv.push_bit(bit);
if bit {
let idx = *pos / WORD_LEN;
let sh = *pos % WORD_LEN;
layer[idx] |= 1usize << sh;
next_ones.push_int(val).unwrap();
} else {
next_zeros.push_int(val).unwrap();
}
*pos += 1;
}
}

Expand Down Expand Up @@ -611,21 +577,24 @@ where

/// Serializes the sequence into a [`Bytes`] buffer along with its metadata.
pub fn to_bytes(&self) -> (WaveletMatrixMeta, Bytes) {
let mut store: Vec<usize> = Vec::new();
for layer in &self.layers {
store.extend_from_slice(layer.data.words());
}
let meta = WaveletMatrixMeta {
alph_size: self.alph_size,
alph_width: self.alph_width(),
len: self.len(),
};
(meta, self.bytes.clone())
(meta, Bytes::from_source(store))
}

/// Reconstructs the sequence from metadata and a zero-copy [`Bytes`] buffer.
pub fn from_bytes(meta: WaveletMatrixMeta, bytes: Bytes) -> Result<Self> {
pub fn from_bytes(meta: WaveletMatrixMeta, mut bytes: Bytes) -> Result<Self> {
let mut layers = Vec::with_capacity(meta.alph_width);
let num_words = (meta.len + WORD_LEN - 1) / WORD_LEN;
let mut slice = bytes.clone();
for _ in 0..meta.alph_width {
let words = slice
let words = bytes
.view_prefix_with_elems::<[usize]>(num_words)
.map_err(|e| anyhow!(e))?;
let data = BitVectorData {
Expand All @@ -638,7 +607,6 @@ where
Ok(Self {
layers,
alph_size: meta.alph_size,
bytes,
})
}
}
Expand Down
Loading