Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,4 @@
- Documented `WaveletMatrix` usage in `README.md`.
- Moved README usage examples to runnable files in `examples/`.
- Added `compact_vector` example showing construction and retrieval.
- WaveletMatrix now stores its serialized word buffer for zero-copy access and preallocates building memory.
76 changes: 54 additions & 22 deletions src/char_sequences/wavelet_matrix.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ use anybytes::Bytes;
use anyhow::{anyhow, Result};

use crate::bit_vector::{
Access, BitVector, BitVectorBuilder, BitVectorData, BitVectorIndex, NumBits, Rank, Select,
WORD_LEN,
Access, BitVector, BitVectorData, BitVectorIndex, NumBits, Rank, Select, WORD_LEN,
};
use crate::int_vectors::{CompactVector, CompactVectorBuilder};
use crate::utils;
Expand Down Expand Up @@ -56,10 +55,21 @@ use crate::utils;
/// # References
///
/// - F. Claude, and G. Navarro, "The Wavelet Matrix," In SPIRE 2012.
// NOTE(review): two derive attributes appear back to back because this text is a rendered
// diff. The first one (with `Default`) is presumably the removed line, superseded by the
// hand-written `Default` impl that follows the struct — confirm against the real file.
#[derive(Default, Debug, Clone, PartialEq, Eq)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WaveletMatrix<I> {
/// One bit vector per layer; the constructor and `from_bytes` create `alph_width` of them.
layers: Vec<BitVector<I>>,
/// Alphabet size; the constructor sets it to the largest input value plus one.
alph_size: usize,
/// Word buffer the layers are viewed from; `to_bytes` returns a clone of it.
bytes: Bytes,
}

impl<I> Default for WaveletMatrix<I> {
    /// Returns an empty matrix: no layers, an alphabet size of zero and an empty buffer.
    ///
    /// Implemented by hand with no trait bound on `I`.
    fn default() -> Self {
        let layers = Vec::new();
        let bytes = Bytes::empty();
        Self {
            layers,
            alph_size: 0,
            bytes,
        }
    }
}

/// Metadata describing the serialized form of a [`WaveletMatrix`].
Expand Down Expand Up @@ -89,55 +99,79 @@ where
return Err(anyhow!("seq must not be empty."));
}

let len = seq.len();
let alph_size = seq.iter().max().unwrap() + 1;
let alph_width = utils::needed_bits(alph_size);
let num_words = (len + WORD_LEN - 1) / WORD_LEN;

let mut store = vec![0usize; alph_width * num_words];

let mut zeros = seq;
let mut ones = CompactVector::new(alph_width)?.freeze();
let mut layers = vec![];

for depth in 0..alph_width {
let mut next_zeros = CompactVectorBuilder::new(alph_width).unwrap();
let mut next_ones = CompactVectorBuilder::new(alph_width).unwrap();
let mut bv = BitVectorBuilder::new();
Self::filter(
let layer = &mut store[depth * num_words..(depth + 1) * num_words];
let mut pos = 0;
Self::filter_into(
&zeros,
alph_width - depth - 1,
&mut next_zeros,
&mut next_ones,
&mut bv,
layer,
&mut pos,
);
Self::filter(
Self::filter_into(
&ones,
alph_width - depth - 1,
&mut next_zeros,
&mut next_ones,
&mut bv,
layer,
&mut pos,
);
zeros = next_zeros.freeze();
ones = next_ones.freeze();
let bits = bv.freeze::<I>();
layers.push(bits);
}

Ok(Self { layers, alph_size })
let bytes = Bytes::from_source(store);
let mut layer_bytes = bytes.clone();
let mut layers = Vec::with_capacity(alph_width);
for _ in 0..alph_width {
let words = layer_bytes
.view_prefix_with_elems::<[usize]>(num_words)
.map_err(|e| anyhow!(e))?;
let data = BitVectorData { words, len };
let index = I::build(&data);
layers.push(BitVector::new(data, index));
}

Ok(Self {
layers,
alph_size,
bytes,
})
}

// NOTE(review): this span is a rendered diff, so two versions are interleaved. The `filter`
// signature, the `bv` parameter and `bv.push_bit(bit)` presumably belong to the removed
// version; `filter_into`, `layer`, `pos` and the word write belong to the added one.
// Confirm against the real file before relying on either reading.
//
// Visible behaviour: every value of `seq` is tested at bit position `shift`. Values with the
// bit set are appended to `next_ones`, all others to `next_zeros`. In the `filter_into`
// lines, a set bit is also recorded in `layer` at bit offset `*pos`, and `*pos` advances by
// one per value, so a second call with the same `pos` continues where the first stopped.
//
// Panics if `push_int` returns an error (both results are unwrapped), or if `*pos / WORD_LEN`
// is out of bounds for `layer`.
fn filter(
fn filter_into(
seq: &CompactVector,
shift: usize,
next_zeros: &mut CompactVectorBuilder,
next_ones: &mut CompactVectorBuilder,
bv: &mut BitVectorBuilder,
layer: &mut [usize],
pos: &mut usize,
) {
for val in seq.iter() {
// Isolate the single bit that decides which side this value goes to.
let bit = ((val >> shift) & 1) == 1;
bv.push_bit(bit);
if bit {
// Word index and in-word offset of bit number `*pos`.
let idx = *pos / WORD_LEN;
let sh = *pos % WORD_LEN;
// Only set bits are written; zero bits rely on `layer` already holding zeros there.
layer[idx] |= 1usize << sh;
next_ones.push_int(val).unwrap();
} else {
next_zeros.push_int(val).unwrap();
}
*pos += 1;
}
}

Expand Down Expand Up @@ -577,24 +611,21 @@ where

/// Serializes the sequence into a [`Bytes`] buffer along with its metadata.
///
/// The returned [`WaveletMatrixMeta`] carries `alph_size`, `alph_width` and `len`, which are
/// the values `from_bytes` reads back when slicing the buffer into layers.
pub fn to_bytes(&self) -> (WaveletMatrixMeta, Bytes) {
// NOTE(review): rendered diff — the next four lines, which copy every layer's words into a
// fresh `Vec`, presumably belong to the removed version together with the
// `Bytes::from_source(store)` return below. Confirm against the real file.
let mut store: Vec<usize> = Vec::new();
for layer in &self.layers {
store.extend_from_slice(layer.data.words());
}
let meta = WaveletMatrixMeta {
alph_size: self.alph_size,
alph_width: self.alph_width(),
len: self.len(),
};
(meta, Bytes::from_source(store))
// Hands out a clone of the stored buffer instead of rebuilding it; presumably a cheap
// shared-ownership clone — verify against the `anybytes` documentation.
(meta, self.bytes.clone())
}

/// Reconstructs the sequence from metadata and a zero-copy [`Bytes`] buffer.
pub fn from_bytes(meta: WaveletMatrixMeta, mut bytes: Bytes) -> Result<Self> {
pub fn from_bytes(meta: WaveletMatrixMeta, bytes: Bytes) -> Result<Self> {
let mut layers = Vec::with_capacity(meta.alph_width);
let num_words = (meta.len + WORD_LEN - 1) / WORD_LEN;
let mut slice = bytes.clone();
for _ in 0..meta.alph_width {
let words = bytes
let words = slice
.view_prefix_with_elems::<[usize]>(num_words)
.map_err(|e| anyhow!(e))?;
let data = BitVectorData {
Expand All @@ -607,6 +638,7 @@ where
Ok(Self {
layers,
alph_size: meta.alph_size,
bytes,
})
}
}
Expand Down