Skip to content

Commit 33352d7

Browse files
Merge pull request #59 from triblespace/codex/add-bytes-field-to-waveletmatrix
Optimize wavelet matrix storage
2 parents a2323cb + b5d161d commit 33352d7

File tree

2 files changed

+55
-22
lines changed

2 files changed

+55
-22
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,4 @@
6262
- Documented `WaveletMatrix` usage in `README.md`.
6363
- Moved README usage examples to runnable files in `examples/`.
6464
- Added `compact_vector` example showing construction and retrieval.
65+
- `WaveletMatrix` now stores its serialized word buffer for zero-copy access and preallocates memory during construction.

src/char_sequences/wavelet_matrix.rs

Lines changed: 54 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@ use anybytes::Bytes;
88
use anyhow::{anyhow, Result};
99

1010
use crate::bit_vector::{
11-
Access, BitVector, BitVectorBuilder, BitVectorData, BitVectorIndex, NumBits, Rank, Select,
12-
WORD_LEN,
11+
Access, BitVector, BitVectorData, BitVectorIndex, NumBits, Rank, Select, WORD_LEN,
1312
};
1413
use crate::int_vectors::{CompactVector, CompactVectorBuilder};
1514
use crate::utils;
@@ -56,10 +55,21 @@ use crate::utils;
5655
/// # References
5756
///
5857
/// - F. Claude, and G. Navarro, "The Wavelet Matrix," In SPIRE 2012.
59-
#[derive(Default, Debug, Clone, PartialEq, Eq)]
58+
#[derive(Debug, Clone, PartialEq, Eq)]
6059
pub struct WaveletMatrix<I> {
6160
layers: Vec<BitVector<I>>,
6261
alph_size: usize,
62+
bytes: Bytes,
63+
}
64+
65+
impl<I> Default for WaveletMatrix<I> {
66+
fn default() -> Self {
67+
Self {
68+
layers: Vec::new(),
69+
alph_size: 0,
70+
bytes: Bytes::empty(),
71+
}
72+
}
6373
}
6474

6575
/// Metadata describing the serialized form of a [`WaveletMatrix`].
@@ -89,55 +99,79 @@ where
8999
return Err(anyhow!("seq must not be empty."));
90100
}
91101

102+
let len = seq.len();
92103
let alph_size = seq.iter().max().unwrap() + 1;
93104
let alph_width = utils::needed_bits(alph_size);
105+
let num_words = (len + WORD_LEN - 1) / WORD_LEN;
106+
107+
let mut store = vec![0usize; alph_width * num_words];
94108

95109
let mut zeros = seq;
96110
let mut ones = CompactVector::new(alph_width)?.freeze();
97-
let mut layers = vec![];
98111

99112
for depth in 0..alph_width {
100113
let mut next_zeros = CompactVectorBuilder::new(alph_width).unwrap();
101114
let mut next_ones = CompactVectorBuilder::new(alph_width).unwrap();
102-
let mut bv = BitVectorBuilder::new();
103-
Self::filter(
115+
let layer = &mut store[depth * num_words..(depth + 1) * num_words];
116+
let mut pos = 0;
117+
Self::filter_into(
104118
&zeros,
105119
alph_width - depth - 1,
106120
&mut next_zeros,
107121
&mut next_ones,
108-
&mut bv,
122+
layer,
123+
&mut pos,
109124
);
110-
Self::filter(
125+
Self::filter_into(
111126
&ones,
112127
alph_width - depth - 1,
113128
&mut next_zeros,
114129
&mut next_ones,
115-
&mut bv,
130+
layer,
131+
&mut pos,
116132
);
117133
zeros = next_zeros.freeze();
118134
ones = next_ones.freeze();
119-
let bits = bv.freeze::<I>();
120-
layers.push(bits);
121135
}
122136

123-
Ok(Self { layers, alph_size })
137+
let bytes = Bytes::from_source(store);
138+
let mut layer_bytes = bytes.clone();
139+
let mut layers = Vec::with_capacity(alph_width);
140+
for _ in 0..alph_width {
141+
let words = layer_bytes
142+
.view_prefix_with_elems::<[usize]>(num_words)
143+
.map_err(|e| anyhow!(e))?;
144+
let data = BitVectorData { words, len };
145+
let index = I::build(&data);
146+
layers.push(BitVector::new(data, index));
147+
}
148+
149+
Ok(Self {
150+
layers,
151+
alph_size,
152+
bytes,
153+
})
124154
}
125155

126-
fn filter(
156+
fn filter_into(
127157
seq: &CompactVector,
128158
shift: usize,
129159
next_zeros: &mut CompactVectorBuilder,
130160
next_ones: &mut CompactVectorBuilder,
131-
bv: &mut BitVectorBuilder,
161+
layer: &mut [usize],
162+
pos: &mut usize,
132163
) {
133164
for val in seq.iter() {
134165
let bit = ((val >> shift) & 1) == 1;
135-
bv.push_bit(bit);
136166
if bit {
167+
let idx = *pos / WORD_LEN;
168+
let sh = *pos % WORD_LEN;
169+
layer[idx] |= 1usize << sh;
137170
next_ones.push_int(val).unwrap();
138171
} else {
139172
next_zeros.push_int(val).unwrap();
140173
}
174+
*pos += 1;
141175
}
142176
}
143177

@@ -577,24 +611,21 @@ where
577611

578612
/// Serializes the sequence into a [`Bytes`] buffer along with its metadata.
579613
pub fn to_bytes(&self) -> (WaveletMatrixMeta, Bytes) {
580-
let mut store: Vec<usize> = Vec::new();
581-
for layer in &self.layers {
582-
store.extend_from_slice(layer.data.words());
583-
}
584614
let meta = WaveletMatrixMeta {
585615
alph_size: self.alph_size,
586616
alph_width: self.alph_width(),
587617
len: self.len(),
588618
};
589-
(meta, Bytes::from_source(store))
619+
(meta, self.bytes.clone())
590620
}
591621

592622
/// Reconstructs the sequence from metadata and a zero-copy [`Bytes`] buffer.
593-
pub fn from_bytes(meta: WaveletMatrixMeta, mut bytes: Bytes) -> Result<Self> {
623+
pub fn from_bytes(meta: WaveletMatrixMeta, bytes: Bytes) -> Result<Self> {
594624
let mut layers = Vec::with_capacity(meta.alph_width);
595625
let num_words = (meta.len + WORD_LEN - 1) / WORD_LEN;
626+
let mut slice = bytes.clone();
596627
for _ in 0..meta.alph_width {
597-
let words = bytes
628+
let words = slice
598629
.view_prefix_with_elems::<[usize]>(num_words)
599630
.map_err(|e| anyhow!(e))?;
600631
let data = BitVectorData {
@@ -607,6 +638,7 @@ where
607638
Ok(Self {
608639
layers,
609640
alph_size: meta.alph_size,
641+
bytes,
610642
})
611643
}
612644
}

0 commit comments

Comments
 (0)