Skip to content

Commit ce3b30e

Browse files
Merge pull request #63 from triblespace/codex/rollback-byte-field-storage-in-datastructures
Remove stored bytes field from data structures
2 parents b683bfa + f671944 commit ce3b30e

File tree

5 files changed

+99
-181
lines changed

5 files changed

+99
-181
lines changed

CHANGELOG.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
- `DacsByte::from_slice` now accepts a generic index type, removing `from_slice_with_index`.
99
- Added `BitVectorBuilder` and zero-copy `BitVectorData` backed by `anybytes::View`.
1010
- Introduced `IndexBuilder` trait with a `Built` type and adjusted serialization helpers.
11-
- `Rank9SelIndex` now stores its serialized bytes internally and `to_bytes` returns this buffer.
1211
- Rename crate to `succdisk` to reflect on-disk succinct data structures.
1312
- Rename crate from `succdisk` to `jerky`.
1413
- Replaced the old `BitVector` with the generic `BitVector<I>` and renamed the
@@ -20,8 +19,9 @@
2019
- Documented the byte layout produced by `DacsByte::to_bytes` with ASCII art.
2120
- Switched `anybytes` dependency to track the upstream Git repository for the
2221
latest changes.
22+
- Removed internal byte buffers from data structures; `WaveletMatrix`,
23+
`DacsByte`, and `Rank9SelIndex` no longer store a `Bytes` field.
2324
- Flags are serialized before level data to eliminate padding.
24-
- `DacsByte` stores all flags and levels in one contiguous byte buffer and `to_bytes` simply clones this buffer.
2525
- Added `get_bits` methods to `BitVectorData` and `BitVector`.
2626
- Removed deprecated `size_in_bytes` helpers.
2727
- Added `scripts/devtest.sh` and `scripts/preflight.sh` for testing and
@@ -66,4 +66,3 @@
6666
- Documented `WaveletMatrix` usage in `README.md`.
6767
- Moved README usage examples to runnable files in `examples/`.
6868
- Added `compact_vector` example showing construction and retrieval.
69-
- WaveletMatrix now stores its serialized word buffer for zero-copy access and preallocates building memory.

INVENTORY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@
1515

1616
## Discovered Issues
1717
- `katex.html` performs manual string replacements; consider DOM-based manipulation.
18+
- Revisit zero-copy storage strategy: avoid extra copies when storing serialized bytes in structures.

src/bit_vector/rank9sel/inner.rs

Lines changed: 48 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ const SELECT_ZEROS_PER_HINT: usize = SELECT_ONES_PER_HINT;
1515
/// The index implementation separated from the bit vector.
1616
#[derive(Debug, Clone, PartialEq, Eq)]
1717
pub struct Rank9SelIndex<const SELECT1: bool = true, const SELECT0: bool = true> {
18-
bytes: Bytes,
1918
len: usize,
2019
block_rank_pairs: View<[usize]>,
2120
select1_hints: Option<View<[usize]>>,
@@ -49,43 +48,16 @@ impl<const SELECT1: bool, const SELECT0: bool> Rank9SelIndexBuilder<SELECT1, SEL
4948

5049
/// Freezes and returns [`Rank9SelIndex`].
5150
pub fn build(self) -> Rank9SelIndex<SELECT1, SELECT0> {
52-
let mut store = Vec::new();
53-
store.push(self.len);
54-
store.push(self.block_rank_pairs.len());
55-
store.extend_from_slice(&self.block_rank_pairs);
56-
57-
if SELECT1 {
58-
let hints = self.select1_hints.unwrap_or_default();
59-
store.push(hints.len());
60-
store.extend_from_slice(&hints);
61-
}
62-
63-
if SELECT0 {
64-
let hints = self.select0_hints.unwrap_or_default();
65-
store.push(hints.len());
66-
store.extend_from_slice(&hints);
67-
}
68-
69-
let bytes = Bytes::from_source(store);
70-
let mut parser = bytes.clone();
71-
let _len = *parser.view_prefix::<usize>().unwrap();
72-
let brp_len = *parser.view_prefix::<usize>().unwrap();
73-
let block_rank_pairs = parser.view_prefix_with_elems::<[usize]>(brp_len).unwrap();
74-
let select1_hints = if SELECT1 {
75-
let l = *parser.view_prefix::<usize>().unwrap();
76-
Some(parser.view_prefix_with_elems::<[usize]>(l).unwrap())
77-
} else {
78-
None
79-
};
80-
let select0_hints = if SELECT0 {
81-
let l = *parser.view_prefix::<usize>().unwrap();
82-
Some(parser.view_prefix_with_elems::<[usize]>(l).unwrap())
83-
} else {
84-
None
85-
};
86-
51+
let block_rank_pairs = Bytes::from_source(self.block_rank_pairs)
52+
.view::<[usize]>()
53+
.unwrap();
54+
let select1_hints = self
55+
.select1_hints
56+
.map(|v| Bytes::from_source(v).view::<[usize]>().unwrap());
57+
let select0_hints = self
58+
.select0_hints
59+
.map(|v| Bytes::from_source(v).view::<[usize]>().unwrap());
8760
Rank9SelIndex::<SELECT1, SELECT0> {
88-
bytes,
8961
len: self.len,
9062
block_rank_pairs,
9163
select1_hints,
@@ -469,46 +441,52 @@ impl<const SELECT1: bool, const SELECT0: bool> Rank9SelIndex<SELECT1, SELECT0> {
469441

470442
impl<const SELECT1: bool, const SELECT0: bool> Rank9SelIndex<SELECT1, SELECT0> {
471443
/// Reconstructs the index from zero-copy [`Bytes`].
472-
pub fn from_bytes(bytes: Bytes) -> Result<Self> {
473-
let mut parser = bytes.clone();
474-
let len = *parser
444+
pub fn from_bytes(mut bytes: Bytes) -> Result<Self> {
445+
let len = *bytes
475446
.view_prefix::<usize>()
476447
.map_err(|e| anyhow::anyhow!(e))?;
477-
let brp_len = *parser
448+
let brp_len = *bytes
478449
.view_prefix::<usize>()
479450
.map_err(|e| anyhow::anyhow!(e))?;
480-
let block_rank_pairs = parser
451+
let block_rank_pairs = bytes
481452
.view_prefix_with_elems::<[usize]>(brp_len)
482453
.map_err(|e| anyhow::anyhow!(e))?;
483-
let select1_hints = if SELECT1 {
484-
let l = *parser
454+
let has_select1 = *bytes
455+
.view_prefix::<usize>()
456+
.map_err(|e| anyhow::anyhow!(e))?
457+
!= 0;
458+
let select1_hints = if has_select1 {
459+
let l = *bytes
485460
.view_prefix::<usize>()
486461
.map_err(|e| anyhow::anyhow!(e))?;
487462
Some(
488-
parser
463+
bytes
489464
.view_prefix_with_elems::<[usize]>(l)
490465
.map_err(|e| anyhow::anyhow!(e))?,
491466
)
492467
} else {
493468
None
494469
};
495-
let select0_hints = if SELECT0 {
496-
let l = *parser
470+
let has_select0 = *bytes
471+
.view_prefix::<usize>()
472+
.map_err(|e| anyhow::anyhow!(e))?
473+
!= 0;
474+
let select0_hints = if has_select0 {
475+
let l = *bytes
497476
.view_prefix::<usize>()
498477
.map_err(|e| anyhow::anyhow!(e))?;
499478
Some(
500-
parser
479+
bytes
501480
.view_prefix_with_elems::<[usize]>(l)
502481
.map_err(|e| anyhow::anyhow!(e))?,
503482
)
504483
} else {
505484
None
506485
};
507-
if !parser.as_ref().is_empty() {
508-
return Err(anyhow::anyhow!("extra bytes"));
486+
if has_select1 != SELECT1 || has_select0 != SELECT0 {
487+
return Err(anyhow::anyhow!("mismatched hint flags"));
509488
}
510489
Ok(Self {
511-
bytes,
512490
len,
513491
block_rank_pairs,
514492
select1_hints,
@@ -518,7 +496,25 @@ impl<const SELECT1: bool, const SELECT0: bool> Rank9SelIndex<SELECT1, SELECT0> {
518496

519497
/// Serializes the index metadata and data into a [`Bytes`] buffer.
520498
pub fn to_bytes(&self) -> Bytes {
521-
self.bytes.clone()
499+
let mut store: Vec<usize> = Vec::new();
500+
store.push(self.len);
501+
store.push(self.block_rank_pairs.len());
502+
store.extend_from_slice(self.block_rank_pairs.as_ref());
503+
if let Some(ref v) = self.select1_hints {
504+
store.push(1);
505+
store.push(v.len());
506+
store.extend_from_slice(v.as_ref());
507+
} else {
508+
store.push(0);
509+
}
510+
if let Some(ref v) = self.select0_hints {
511+
store.push(1);
512+
store.push(v.len());
513+
store.extend_from_slice(v.as_ref());
514+
} else {
515+
store.push(0);
516+
}
517+
Bytes::from_source(store)
522518
}
523519
}
524520

src/char_sequences/wavelet_matrix.rs

Lines changed: 22 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ use anybytes::Bytes;
88
use anyhow::{anyhow, Result};
99

1010
use crate::bit_vector::{
11-
Access, BitVector, BitVectorData, BitVectorIndex, NumBits, Rank, Select, WORD_LEN,
11+
Access, BitVector, BitVectorBuilder, BitVectorData, BitVectorIndex, NumBits, Rank, Select,
12+
WORD_LEN,
1213
};
1314
use crate::int_vectors::{CompactVector, CompactVectorBuilder};
1415
use crate::utils;
@@ -55,21 +56,10 @@ use crate::utils;
5556
/// # References
5657
///
5758
/// - F. Claude and G. Navarro, "The Wavelet Matrix," in SPIRE 2012.
58-
#[derive(Debug, Clone, PartialEq, Eq)]
59+
#[derive(Default, Debug, Clone, PartialEq, Eq)]
5960
pub struct WaveletMatrix<I> {
6061
layers: Vec<BitVector<I>>,
6162
alph_size: usize,
62-
bytes: Bytes,
63-
}
64-
65-
impl<I> Default for WaveletMatrix<I> {
66-
fn default() -> Self {
67-
Self {
68-
layers: Vec::new(),
69-
alph_size: 0,
70-
bytes: Bytes::empty(),
71-
}
72-
}
7363
}
7464

7565
/// Metadata describing the serialized form of a [`WaveletMatrix`].
@@ -99,79 +89,55 @@ where
9989
return Err(anyhow!("seq must not be empty."));
10090
}
10191

102-
let len = seq.len();
10392
let alph_size = seq.iter().max().unwrap() + 1;
10493
let alph_width = utils::needed_bits(alph_size);
105-
let num_words = (len + WORD_LEN - 1) / WORD_LEN;
106-
107-
let mut store = vec![0usize; alph_width * num_words];
10894

10995
let mut zeros = seq;
11096
let mut ones = CompactVector::new(alph_width)?.freeze();
97+
let mut layers = vec![];
11198

11299
for depth in 0..alph_width {
113100
let mut next_zeros = CompactVectorBuilder::new(alph_width).unwrap();
114101
let mut next_ones = CompactVectorBuilder::new(alph_width).unwrap();
115-
let layer = &mut store[depth * num_words..(depth + 1) * num_words];
116-
let mut pos = 0;
117-
Self::filter_into(
102+
let mut bv = BitVectorBuilder::new();
103+
Self::filter(
118104
&zeros,
119105
alph_width - depth - 1,
120106
&mut next_zeros,
121107
&mut next_ones,
122-
layer,
123-
&mut pos,
108+
&mut bv,
124109
);
125-
Self::filter_into(
110+
Self::filter(
126111
&ones,
127112
alph_width - depth - 1,
128113
&mut next_zeros,
129114
&mut next_ones,
130-
layer,
131-
&mut pos,
115+
&mut bv,
132116
);
133117
zeros = next_zeros.freeze();
134118
ones = next_ones.freeze();
119+
let bits = bv.freeze::<I>();
120+
layers.push(bits);
135121
}
136122

137-
let bytes = Bytes::from_source(store);
138-
let mut layer_bytes = bytes.clone();
139-
let mut layers = Vec::with_capacity(alph_width);
140-
for _ in 0..alph_width {
141-
let words = layer_bytes
142-
.view_prefix_with_elems::<[usize]>(num_words)
143-
.map_err(|e| anyhow!(e))?;
144-
let data = BitVectorData { words, len };
145-
let index = I::build(&data);
146-
layers.push(BitVector::new(data, index));
147-
}
148-
149-
Ok(Self {
150-
layers,
151-
alph_size,
152-
bytes,
153-
})
123+
Ok(Self { layers, alph_size })
154124
}
155125

156-
fn filter_into(
126+
fn filter(
157127
seq: &CompactVector,
158128
shift: usize,
159129
next_zeros: &mut CompactVectorBuilder,
160130
next_ones: &mut CompactVectorBuilder,
161-
layer: &mut [usize],
162-
pos: &mut usize,
131+
bv: &mut BitVectorBuilder,
163132
) {
164133
for val in seq.iter() {
165134
let bit = ((val >> shift) & 1) == 1;
135+
bv.push_bit(bit);
166136
if bit {
167-
let idx = *pos / WORD_LEN;
168-
let sh = *pos % WORD_LEN;
169-
layer[idx] |= 1usize << sh;
170137
next_ones.push_int(val).unwrap();
171138
} else {
172139
next_zeros.push_int(val).unwrap();
173140
}
174-
*pos += 1;
175141
}
176142
}
177143

@@ -611,21 +577,24 @@ where
611577

612578
/// Serializes the sequence into a [`Bytes`] buffer along with its metadata.
613579
pub fn to_bytes(&self) -> (WaveletMatrixMeta, Bytes) {
580+
let mut store: Vec<usize> = Vec::new();
581+
for layer in &self.layers {
582+
store.extend_from_slice(layer.data.words());
583+
}
614584
let meta = WaveletMatrixMeta {
615585
alph_size: self.alph_size,
616586
alph_width: self.alph_width(),
617587
len: self.len(),
618588
};
619-
(meta, self.bytes.clone())
589+
(meta, Bytes::from_source(store))
620590
}
621591

622592
/// Reconstructs the sequence from metadata and a zero-copy [`Bytes`] buffer.
623-
pub fn from_bytes(meta: WaveletMatrixMeta, bytes: Bytes) -> Result<Self> {
593+
pub fn from_bytes(meta: WaveletMatrixMeta, mut bytes: Bytes) -> Result<Self> {
624594
let mut layers = Vec::with_capacity(meta.alph_width);
625595
let num_words = (meta.len + WORD_LEN - 1) / WORD_LEN;
626-
let mut slice = bytes.clone();
627596
for _ in 0..meta.alph_width {
628-
let words = slice
597+
let words = bytes
629598
.view_prefix_with_elems::<[usize]>(num_words)
630599
.map_err(|e| anyhow!(e))?;
631600
let data = BitVectorData {
@@ -638,7 +607,6 @@ where
638607
Ok(Self {
639608
layers,
640609
alph_size: meta.alph_size,
641-
bytes,
642610
})
643611
}
644612
}

0 commit comments

Comments
 (0)