diff --git a/CHANGELOG.md b/CHANGELOG.md index 114eeff..5434e9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,7 +18,10 @@ - Replaced `DArrayFullIndex` with new `DArrayIndex` that uses const generics to optionally include `select1` and `select0` support. - Introduced `CompactVectorBuilder` mutable APIs `push_int`, `set_int`, and `extend`. +- Simplified bit vector imports by re-exporting `BitVectorBuilder` and `Rank9SelIndex` and updating examples. +- Moved the `bit_vector::bit_vector` module contents directly into `bit_vector` for cleaner paths. - Added README usage example demonstrating basic bit vector operations. +- Removed `bit_vector::prelude`; import traits directly with `use jerky::bit_vector::*`. - Added `freeze()` on `CompactVectorBuilder` yielding an immutable `CompactVector` backed by `BitVector`. - `CompactVector::new` and `with_capacity` now return builders; other constructors build via the builder pattern. - Wavelet matrix and DACs builders now use `BitVectorBuilder` for temporary bit diff --git a/INVENTORY.md b/INVENTORY.md index bcd380e..854c345 100644 --- a/INVENTORY.md +++ b/INVENTORY.md @@ -10,8 +10,3 @@ ## Discovered Issues - `katex.html` performs manual string replacements; consider DOM-based manipulation. -- bit_vector prelude lacks common types like `BitVectorBuilder` and `Rank9SelIndex`, - forcing verbose imports in examples. -- Import paths for `BitVectorBuilder` include a redundant `bit_vector` module - segment, e.g. `jerky::bit_vector::bit_vector::BitVectorBuilder`. - Cleanup the module layout so examples only need `jerky::bit_vector`. diff --git a/README.md b/README.md index 13cb12e..0df1272 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ The snippet below shows how to build a bit vector with a rank/select index and perform a few basic queries: ```rust -use jerky::bit_vector::{bit_vector::BitVectorBuilder, rank9sel::inner::Rank9SelIndex, prelude::*}; +use jerky::bit_vector::*; fn main() -> Result<(), Box> { let mut builder = BitVectorBuilder::new(); diff --git a/bench/benches/timing_bitvec_rank.rs b/bench/benches/timing_bitvec_rank.rs index 17c16a9..d29bf03 100644 --- a/bench/benches/timing_bitvec_rank.rs +++ b/bench/benches/timing_bitvec_rank.rs @@ -7,9 +7,7 @@ use criterion::{ criterion_group, criterion_main, measurement::WallTime, BenchmarkGroup, Criterion, SamplingMode, }; -use jerky::bit_vector::bit_vector::BitVectorBuilder; -use jerky::bit_vector::rank9sel::inner::Rank9SelIndex; -use jerky::bit_vector::{BitVector, NoIndex, Rank}; +use jerky::bit_vector::{BitVector, BitVectorBuilder, NoIndex, Rank, Rank9SelIndex}; const SAMPLE_SIZE: usize = 30; const WARM_UP_TIME: Duration = Duration::from_secs(5); diff --git a/bench/benches/timing_bitvec_select.rs b/bench/benches/timing_bitvec_select.rs index d9000f0..bb38e1c 100644 --- a/bench/benches/timing_bitvec_select.rs +++ b/bench/benches/timing_bitvec_select.rs @@ -6,9 +6,7 @@ use rand_chacha::ChaChaRng; use criterion::{ criterion_group, criterion_main, measurement::WallTime, BenchmarkGroup, Criterion, SamplingMode, }; -use jerky::bit_vector::bit_vector::BitVectorBuilder; -use jerky::bit_vector::rank9sel::inner::Rank9SelIndex; -use jerky::bit_vector::{BitVector, NoIndex, Select}; +use jerky::bit_vector::{BitVector, BitVectorBuilder, NoIndex, Rank9SelIndex, Select}; const SAMPLE_SIZE: usize = 30; const WARM_UP_TIME: Duration = Duration::from_secs(5); diff --git a/bench/benches/timing_chrseq_access.rs b/bench/benches/timing_chrseq_access.rs index 9b3bcc4..38a10fd 100644 --- a/bench/benches/timing_chrseq_access.rs +++ b/bench/benches/timing_chrseq_access.rs @@ -3,10 +3,7 @@ use std::time::Duration; use rand::{Rng, SeedableRng}; use rand_chacha::ChaChaRng; -use jerky::bit_vector::bit_vector::BitVectorBuilder; -use jerky::bit_vector::prelude::*; -use jerky::bit_vector::rank9sel::inner::Rank9SelIndex; -use jerky::bit_vector::{BitVector, NoIndex}; +use jerky::bit_vector::*; use jerky::char_sequences::WaveletMatrix; use jerky::int_vectors::CompactVector; diff --git a/bench/src/mem_bitvec.rs b/bench/src/mem_bitvec.rs index 744a649..b64efae 100644 --- a/bench/src/mem_bitvec.rs +++ b/bench/src/mem_bitvec.rs @@ -1,6 +1,4 @@ -use jerky::bit_vector::bit_vector::BitVectorBuilder; -use jerky::bit_vector::rank9sel::inner::Rank9SelIndex; -use jerky::bit_vector::BitVector; +use jerky::bit_vector::{BitVector, BitVectorBuilder, Rank9SelIndex}; use rand::{Rng, SeedableRng}; use rand_chacha::ChaChaRng; diff --git a/bench/src/mem_chrseq.rs b/bench/src/mem_chrseq.rs index f400f86..d3d923d 100644 --- a/bench/src/mem_chrseq.rs +++ b/bench/src/mem_chrseq.rs @@ -1,6 +1,4 @@ -use jerky::bit_vector::bit_vector::BitVectorBuilder; -use jerky::bit_vector::rank9sel::inner::Rank9SelIndex; -use jerky::bit_vector::{BitVector, NoIndex, Rank}; +use jerky::bit_vector::{BitVector, BitVectorBuilder, NoIndex, Rank, Rank9SelIndex}; use jerky::char_sequences::WaveletMatrix; use jerky::int_vectors::CompactVector; diff --git a/src/bit_vector/bit_vector.rs b/src/bit_vector/bit_vector.rs deleted file mode 100644 index f9f98ca..0000000 --- a/src/bit_vector/bit_vector.rs +++ /dev/null @@ -1,492 +0,0 @@ -//! Raw storage types and generic wrapper for bit vectors. -//! -//! The [`BitVectorBuilder`] allows collecting bits and freezing them into -//! [`BitVector`] backed by zero-copy [`BitVectorData`]. Data can also be -//! reconstructed directly from [`anybytes::Bytes`] obtained via an mmap -//! wrapper like `Bytes::from_source`. - -/// The number of bits in a machine word. -pub const WORD_LEN: usize = core::mem::size_of::() * 8; - -use crate::bit_vector::{Access, NumBits, Rank, Select}; -use anybytes::{Bytes, View}; -use anyhow::{anyhow, Result}; - -/// Builder that collects raw bits into a zero-copy [`BitVector`]. -#[derive(Debug, Default, Clone)] -pub struct BitVectorBuilder { - words: Vec, - len: usize, -} - -impl BitVectorBuilder { - /// Creates an empty builder. - pub fn new() -> Self { - Self::default() - } - - /// Pushes a single bit. - pub fn push_bit(&mut self, bit: bool) { - let pos_in_word = self.len % WORD_LEN; - if pos_in_word == 0 { - self.words.push(bit as usize); - } else { - let cur = self.words.last_mut().unwrap(); - *cur |= (bit as usize) << pos_in_word; - } - self.len += 1; - } - - /// Pushes `len` bits from `bits` at the end. - /// - /// Bits outside the lowest `len` bits are truncated. - pub fn push_bits(&mut self, bits: usize, len: usize) -> Result<()> { - if WORD_LEN < len { - return Err(anyhow!( - "len must be no greater than {WORD_LEN}, but got {len}." - )); - } - if len == 0 { - return Ok(()); - } - - let mask = if len < WORD_LEN { - (1 << len) - 1 - } else { - usize::MAX - }; - let bits = bits & mask; - - let pos_in_word = self.len % WORD_LEN; - if pos_in_word == 0 { - self.words.push(bits); - } else { - let cur = self.words.last_mut().unwrap(); - *cur |= bits << pos_in_word; - if len > WORD_LEN - pos_in_word { - self.words.push(bits >> (WORD_LEN - pos_in_word)); - } - } - self.len += len; - Ok(()) - } - - /// Sets the `pos`-th bit to `bit`. - pub fn set_bit(&mut self, pos: usize, bit: bool) -> Result<()> { - if self.len <= pos { - return Err(anyhow!( - "pos must be no greater than self.len()={}, but got {pos}.", - self.len - )); - } - let word = pos / WORD_LEN; - let pos_in_word = pos % WORD_LEN; - self.words[word] &= !(1 << pos_in_word); - self.words[word] |= (bit as usize) << pos_in_word; - Ok(()) - } - - /// Extends the builder from an iterator of bits. - pub fn extend_bits>(&mut self, bits: I) { - bits.into_iter().for_each(|b| self.push_bit(b)); - } - - fn into_data(self) -> BitVectorData { - let words = Bytes::from_source(self.words).view::<[usize]>().unwrap(); - BitVectorData { - words, - len: self.len, - } - } - - /// Finalizes the builder into a [`BitVector`]. - pub fn freeze(self) -> BitVector { - let data = self.into_data(); - let index = I::build(&data); - BitVector::new(data, index) - } - - /// Serializes the builder contents into a [`Bytes`] buffer. - pub fn into_bytes(self) -> (usize, Bytes) { - (self.len, Bytes::from_source(self.words)) - } -} - -/// Immutable bit vector data without auxiliary indexes. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct BitVectorData { - /// Underlying machine words storing bit data. - pub words: View<[usize]>, - /// Number of valid bits in `words`. - pub len: usize, -} - -impl Default for BitVectorData { - fn default() -> Self { - Self { - words: Bytes::empty().view::<[usize]>().unwrap(), - len: 0, - } - } -} - -impl BitVectorData { - /// Creates bit vector data from a bit iterator. - pub fn from_bits>(bits: I) -> Self { - let mut builder = BitVectorBuilder::new(); - builder.extend_bits(bits); - builder.into_data() - } - - /// Reconstructs the data from zero-copy [`Bytes`]. - pub fn from_bytes(len: usize, bytes: Bytes) -> Result { - let words = bytes.view::<[usize]>().map_err(|e| anyhow::anyhow!(e))?; - Ok(Self { words, len }) - } - - /// Returns the number of bits stored. - pub const fn len(&self) -> usize { - self.len - } - - /// Returns the raw word slice. - pub fn words(&self) -> &[usize] { - self.words.as_ref() - } - - /// Returns the number of words stored. - pub fn num_words(&self) -> usize { - self.words.len() - } - - /// Returns `len` bits starting at position `pos`. - pub fn get_bits(&self, pos: usize, len: usize) -> Option { - if WORD_LEN < len || self.len() < pos + len { - return None; - } - if len == 0 { - return Some(0); - } - let block = pos / WORD_LEN; - let shift = pos % WORD_LEN; - let mask = if len < WORD_LEN { - (1 << len) - 1 - } else { - usize::MAX - }; - let bits = if shift + len <= WORD_LEN { - (self.words[block] >> shift) & mask - } else { - (self.words[block] >> shift) | ((self.words[block + 1] << (WORD_LEN - shift)) & mask) - }; - Some(bits) - } - - /// Returns the number of bytes required for the old copy-based serialization. - pub fn size_in_bytes(&self) -> usize { - std::mem::size_of::() * (self.words.len() + 2) - } - - /// Serializes the data into a [`Bytes`] buffer. - pub fn to_bytes(&self) -> (usize, Bytes) { - (self.len, self.words.clone().bytes()) - } -} - -impl From for BitVector { - fn from(data: BitVectorData) -> Self { - BitVector::new(data, NoIndex) - } -} - -impl Access for BitVectorData { - fn access(&self, pos: usize) -> Option { - if pos < self.len { - let block = pos / WORD_LEN; - let shift = pos % WORD_LEN; - Some((self.words[block] >> shift) & 1 == 1) - } else { - None - } - } -} - -/// Index trait for bit vector data. -pub trait BitVectorIndex: Sized { - /// Constructs an index from bit vector data. - fn build(data: &BitVectorData) -> Self; - - /// Counts set bits in the data. - fn num_ones(&self, data: &BitVectorData) -> usize; - - /// Counts unset bits in the data. - fn num_zeros(&self, data: &BitVectorData) -> usize { - data.len() - self.num_ones(data) - } - - /// Rank query for ones. - fn rank1(&self, data: &BitVectorData, pos: usize) -> Option; - - /// Rank query for zeros. - fn rank0(&self, data: &BitVectorData, pos: usize) -> Option { - Some(pos - self.rank1(data, pos)?) - } - - /// Select query for ones. - fn select1(&self, data: &BitVectorData, k: usize) -> Option; - - /// Select query for zeros. - fn select0(&self, data: &BitVectorData, k: usize) -> Option; -} - -/// Placeholder index that performs linear scans over the data. -#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] -pub struct NoIndex; - -impl BitVectorIndex for NoIndex { - fn build(_: &BitVectorData) -> Self { - NoIndex - } - - fn num_ones(&self, data: &BitVectorData) -> usize { - data.words - .iter() - .map(|&w| crate::broadword::popcount(w)) - .sum() - } - - fn rank1(&self, data: &BitVectorData, pos: usize) -> Option { - if data.len() < pos { - return None; - } - let mut r = 0; - let (wpos, left) = (pos / WORD_LEN, pos % WORD_LEN); - for &w in &data.words[..wpos] { - r += crate::broadword::popcount(w); - } - if left != 0 { - r += crate::broadword::popcount(data.words[wpos] << (WORD_LEN - left)); - } - Some(r) - } - - fn select1(&self, data: &BitVectorData, k: usize) -> Option { - let mut wpos = 0; - let mut cur_rank = 0; - while wpos < data.words.len() { - let cnt = crate::broadword::popcount(data.words[wpos]); - if k < cur_rank + cnt { - break; - } - wpos += 1; - cur_rank += cnt; - } - if wpos == data.words.len() { - return None; - } - let sel = wpos * WORD_LEN - + crate::broadword::select_in_word(data.words[wpos], k - cur_rank).unwrap(); - Some(sel) - } - - fn select0(&self, data: &BitVectorData, k: usize) -> Option { - let mut wpos = 0; - let mut cur_rank = 0; - while wpos < data.words.len() { - let cnt = crate::broadword::popcount(!data.words[wpos]); - if k < cur_rank + cnt { - break; - } - wpos += 1; - cur_rank += cnt; - } - if wpos == data.words.len() { - return None; - } - let sel = wpos * WORD_LEN - + crate::broadword::select_in_word(!data.words[wpos], k - cur_rank).unwrap(); - if sel < data.len() { - Some(sel) - } else { - None - } - } -} - -/// Immutable bit vector data combined with an auxiliary index. -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct BitVector { - /// Raw data bits. - pub data: BitVectorData, - /// Associated index. - pub index: I, -} - -/// Iterator over bits in a [`BitVector`]. -pub struct Iter<'a, I> { - bv: &'a BitVector, - pos: usize, -} - -impl<'a, I> Iter<'a, I> { - /// Creates a new iterator. - pub const fn new(bv: &'a BitVector) -> Self { - Self { bv, pos: 0 } - } -} - -impl Iterator for Iter<'_, I> { - type Item = bool; - - fn next(&mut self) -> Option { - if self.pos < self.bv.len() { - let bit = self.bv.access(self.pos).unwrap(); - self.pos += 1; - Some(bit) - } else { - None - } - } - - fn size_hint(&self) -> (usize, Option) { - (self.bv.len(), Some(self.bv.len())) - } -} - -impl BitVector { - /// Creates a new wrapper from data and index. - pub const fn new(data: BitVectorData, index: I) -> Self { - Self { data, index } - } - - /// Returns the number of bits stored. - pub const fn len(&self) -> usize { - self.data.len() - } - - /// Returns the `len` bits starting at `pos`, or [`None`] if out of bounds. - pub fn get_bits(&self, pos: usize, len: usize) -> Option { - self.data.get_bits(pos, len) - } - - /// Creates an iterator over all bits. - pub const fn iter(&self) -> Iter { - Iter { bv: self, pos: 0 } - } - - /// Collects all bits into a `Vec` for inspection. - pub fn to_vec(&self) -> Vec { - self.iter().collect() - } -} - -impl NumBits for BitVector { - fn num_bits(&self) -> usize { - self.data.len() - } - - fn num_ones(&self) -> usize { - self.index.num_ones(&self.data) - } -} - -impl Access for BitVector { - fn access(&self, pos: usize) -> Option { - self.data.access(pos) - } -} - -impl Rank for BitVector { - fn rank1(&self, pos: usize) -> Option { - self.index.rank1(&self.data, pos) - } - - fn rank0(&self, pos: usize) -> Option { - self.index.rank0(&self.data, pos) - } -} - -impl Select for BitVector { - fn select1(&self, k: usize) -> Option { - self.index.select1(&self.data, k) - } - - fn select0(&self, k: usize) -> Option { - self.index.select0(&self.data, k) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn no_index_wrapper() { - let data = BitVectorData::from_bits([true, false, false, true]); - let bv = BitVector::new(data, NoIndex); - - assert_eq!(bv.num_bits(), 4); - assert_eq!(bv.num_ones(), 2); - assert_eq!(bv.access(0), Some(true)); - assert_eq!(bv.rank1(4), Some(2)); - assert_eq!(bv.select1(1), Some(3)); - assert_eq!(bv.select0(0), Some(1)); - } - - #[test] - fn builder_freeze() { - let mut builder = BitVectorBuilder::new(); - builder.extend_bits([true, false]); - builder.push_bits(0b10, 2).unwrap(); - builder.set_bit(1, true).unwrap(); - let bv: BitVector = builder.freeze::(); - assert_eq!(bv.len(), 4); - assert_eq!(bv.get_bits(0, 4), Some(0b1011)); - } - - #[test] - fn from_bytes_roundtrip() { - let mut builder = BitVectorBuilder::new(); - builder.extend_bits([true, false, true, true, false]); - let expected: BitVector = builder.clone().freeze::(); - let (len, bytes) = builder.into_bytes(); - - let data = BitVectorData::from_bytes(len, bytes).unwrap(); - let other: BitVector = data.into(); - assert_eq!(expected, other); - } - - #[test] - fn get_bits_wrapper() { - let data = BitVectorData::from_bits([true, false, true, true, false]); - let bv = BitVector::new(data.clone(), NoIndex); - assert_eq!(data.get_bits(1, 3), Some(0b110)); - assert_eq!(data.get_bits(2, 4), None); - assert_eq!(bv.get_bits(1, 3), Some(0b110)); - assert_eq!(bv.get_bits(2, 4), None); - } - - #[test] - fn builder_push_bits_across_word() { - let mut builder = BitVectorBuilder::new(); - builder.extend_bits(core::iter::repeat(false).take(62)); - builder.push_bits(0b011111, 6).unwrap(); - let bv: BitVector = builder.freeze::(); - assert_eq!(bv.data.get_bits(61, 7).unwrap(), 0b0111110); - } - - #[test] - fn iter_collects() { - let data = BitVectorData::from_bits([true, false, true]); - let bv = BitVector::new(data, NoIndex); - let collected: Vec = bv.iter().collect(); - assert_eq!(collected, vec![true, false, true]); - } - - #[test] - fn to_vec_collects() { - let data = BitVectorData::from_bits([true, false, true]); - let bv = BitVector::new(data, NoIndex); - assert_eq!(bv.to_vec(), vec![true, false, true]); - } -} diff --git a/src/bit_vector/mod.rs b/src/bit_vector/mod.rs index 7e0ab98..82b3278 100644 --- a/src/bit_vector/mod.rs +++ b/src/bit_vector/mod.rs @@ -42,12 +42,12 @@ //! # Examples //! //! This module provides several traits for essential behaviors, -//! allowing to compare our bit vectors as components in your data structures. -//! [`prelude`] allows you to import them easily. +//! allowing you to compare our bit vectors as components in your data +//! structures. Import them all with `use jerky::bit_vector::*;`. //! //! ``` //! # fn main() -> Result<(), Box> { -//! use jerky::bit_vector::{bit_vector::BitVectorBuilder, rank9sel::inner::Rank9SelIndex, BitVector, prelude::*}; +//! use jerky::bit_vector::*; //! //! let mut builder = BitVectorBuilder::new(); //! builder.extend_bits([true, false, false, true]); @@ -66,12 +66,8 @@ //! # Ok(()) //! # } //! ``` -pub mod bit_vector; -pub mod prelude; pub mod rank9sel; -pub use bit_vector::{BitVector, BitVectorData, BitVectorIndex, NoIndex}; - /// Interface for building a bit vector with rank/select queries. /// Interface for reporting basic statistics in a bit vector. @@ -122,3 +118,490 @@ pub trait Select { /// [`None`] if out of bounds. fn select0(&self, k: usize) -> Option; } + +/// The number of bits in a machine word. +pub const WORD_LEN: usize = core::mem::size_of::() * 8; + +use anybytes::{Bytes, View}; +use anyhow::{anyhow, Result}; + +/// Builder that collects raw bits into a zero-copy [`BitVector`]. +#[derive(Debug, Default, Clone)] +pub struct BitVectorBuilder { + words: Vec, + len: usize, +} + +impl BitVectorBuilder { + /// Creates an empty builder. + pub fn new() -> Self { + Self::default() + } + + /// Pushes a single bit. + pub fn push_bit(&mut self, bit: bool) { + let pos_in_word = self.len % WORD_LEN; + if pos_in_word == 0 { + self.words.push(bit as usize); + } else { + let cur = self.words.last_mut().unwrap(); + *cur |= (bit as usize) << pos_in_word; + } + self.len += 1; + } + + /// Pushes `len` bits from `bits` at the end. + /// + /// Bits outside the lowest `len` bits are truncated. + pub fn push_bits(&mut self, bits: usize, len: usize) -> Result<()> { + if WORD_LEN < len { + return Err(anyhow!( + "len must be no greater than {WORD_LEN}, but got {len}." + )); + } + if len == 0 { + return Ok(()); + } + + let mask = if len < WORD_LEN { + (1 << len) - 1 + } else { + usize::MAX + }; + let bits = bits & mask; + + let pos_in_word = self.len % WORD_LEN; + if pos_in_word == 0 { + self.words.push(bits); + } else { + let cur = self.words.last_mut().unwrap(); + *cur |= bits << pos_in_word; + if len > WORD_LEN - pos_in_word { + self.words.push(bits >> (WORD_LEN - pos_in_word)); + } + } + self.len += len; + Ok(()) + } + + /// Sets the `pos`-th bit to `bit`. + pub fn set_bit(&mut self, pos: usize, bit: bool) -> Result<()> { + if self.len <= pos { + return Err(anyhow!( + "pos must be no greater than self.len()={}, but got {pos}.", + self.len + )); + } + let word = pos / WORD_LEN; + let pos_in_word = pos % WORD_LEN; + self.words[word] &= !(1 << pos_in_word); + self.words[word] |= (bit as usize) << pos_in_word; + Ok(()) + } + + /// Extends the builder from an iterator of bits. + pub fn extend_bits>(&mut self, bits: I) { + bits.into_iter().for_each(|b| self.push_bit(b)); + } + + fn into_data(self) -> BitVectorData { + let words = Bytes::from_source(self.words).view::<[usize]>().unwrap(); + BitVectorData { + words, + len: self.len, + } + } + + /// Finalizes the builder into a [`BitVector`]. + pub fn freeze(self) -> BitVector { + let data = self.into_data(); + let index = I::build(&data); + BitVector::new(data, index) + } + + /// Serializes the builder contents into a [`Bytes`] buffer. + pub fn into_bytes(self) -> (usize, Bytes) { + (self.len, Bytes::from_source(self.words)) + } +} + +/// Immutable bit vector data without auxiliary indexes. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BitVectorData { + /// Underlying machine words storing bit data. + pub words: View<[usize]>, + /// Number of valid bits in `words`. + pub len: usize, +} + +impl Default for BitVectorData { + fn default() -> Self { + Self { + words: Bytes::empty().view::<[usize]>().unwrap(), + len: 0, + } + } +} + +impl BitVectorData { + /// Creates bit vector data from a bit iterator. + pub fn from_bits>(bits: I) -> Self { + let mut builder = BitVectorBuilder::new(); + builder.extend_bits(bits); + builder.into_data() + } + + /// Reconstructs the data from zero-copy [`Bytes`]. + pub fn from_bytes(len: usize, bytes: Bytes) -> Result { + let words = bytes.view::<[usize]>().map_err(|e| anyhow::anyhow!(e))?; + Ok(Self { words, len }) + } + + /// Returns the number of bits stored. + pub const fn len(&self) -> usize { + self.len + } + + /// Returns the raw word slice. + pub fn words(&self) -> &[usize] { + self.words.as_ref() + } + + /// Returns the number of words stored. + pub fn num_words(&self) -> usize { + self.words.len() + } + + /// Returns `len` bits starting at position `pos`. + pub fn get_bits(&self, pos: usize, len: usize) -> Option { + if WORD_LEN < len || self.len() < pos + len { + return None; + } + if len == 0 { + return Some(0); + } + let block = pos / WORD_LEN; + let shift = pos % WORD_LEN; + let mask = if len < WORD_LEN { + (1 << len) - 1 + } else { + usize::MAX + }; + let bits = if shift + len <= WORD_LEN { + (self.words[block] >> shift) & mask + } else { + (self.words[block] >> shift) | ((self.words[block + 1] << (WORD_LEN - shift)) & mask) + }; + Some(bits) + } + + /// Returns the number of bytes required for the old copy-based serialization. + pub fn size_in_bytes(&self) -> usize { + std::mem::size_of::() * (self.words.len() + 2) + } + + /// Serializes the data into a [`Bytes`] buffer. + pub fn to_bytes(&self) -> (usize, Bytes) { + (self.len, self.words.clone().bytes()) + } +} + +impl From for BitVector { + fn from(data: BitVectorData) -> Self { + BitVector::new(data, NoIndex) + } +} + +impl Access for BitVectorData { + fn access(&self, pos: usize) -> Option { + if pos < self.len { + let block = pos / WORD_LEN; + let shift = pos % WORD_LEN; + Some((self.words[block] >> shift) & 1 == 1) + } else { + None + } + } +} + +/// Index trait for bit vector data. +pub trait BitVectorIndex: Sized { + /// Constructs an index from bit vector data. + fn build(data: &BitVectorData) -> Self; + + /// Counts set bits in the data. + fn num_ones(&self, data: &BitVectorData) -> usize; + + /// Counts unset bits in the data. + fn num_zeros(&self, data: &BitVectorData) -> usize { + data.len() - self.num_ones(data) + } + + /// Rank query for ones. + fn rank1(&self, data: &BitVectorData, pos: usize) -> Option; + + /// Rank query for zeros. + fn rank0(&self, data: &BitVectorData, pos: usize) -> Option { + Some(pos - self.rank1(data, pos)?) + } + + /// Select query for ones. + fn select1(&self, data: &BitVectorData, k: usize) -> Option; + + /// Select query for zeros. + fn select0(&self, data: &BitVectorData, k: usize) -> Option; +} + +/// Placeholder index that performs linear scans over the data. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct NoIndex; + +impl BitVectorIndex for NoIndex { + fn build(_: &BitVectorData) -> Self { + NoIndex + } + + fn num_ones(&self, data: &BitVectorData) -> usize { + data.words + .iter() + .map(|&w| crate::broadword::popcount(w)) + .sum() + } + + fn rank1(&self, data: &BitVectorData, pos: usize) -> Option { + if data.len() < pos { + return None; + } + let mut r = 0; + let (wpos, left) = (pos / WORD_LEN, pos % WORD_LEN); + for &w in &data.words[..wpos] { + r += crate::broadword::popcount(w); + } + if left != 0 { + r += crate::broadword::popcount(data.words[wpos] << (WORD_LEN - left)); + } + Some(r) + } + + fn select1(&self, data: &BitVectorData, k: usize) -> Option { + let mut wpos = 0; + let mut cur_rank = 0; + while wpos < data.words.len() { + let cnt = crate::broadword::popcount(data.words[wpos]); + if k < cur_rank + cnt { + break; + } + wpos += 1; + cur_rank += cnt; + } + if wpos == data.words.len() { + return None; + } + let sel = wpos * WORD_LEN + + crate::broadword::select_in_word(data.words[wpos], k - cur_rank).unwrap(); + Some(sel) + } + + fn select0(&self, data: &BitVectorData, k: usize) -> Option { + let mut wpos = 0; + let mut cur_rank = 0; + while wpos < data.words.len() { + let cnt = crate::broadword::popcount(!data.words[wpos]); + if k < cur_rank + cnt { + break; + } + wpos += 1; + cur_rank += cnt; + } + if wpos == data.words.len() { + return None; + } + let sel = wpos * WORD_LEN + + crate::broadword::select_in_word(!data.words[wpos], k - cur_rank).unwrap(); + if sel < data.len() { + Some(sel) + } else { + None + } + } +} + +/// Immutable bit vector data combined with an auxiliary index. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BitVector { + /// Raw data bits. + pub data: BitVectorData, + /// Associated index. + pub index: I, +} + +/// Iterator over bits in a [`BitVector`]. +pub struct Iter<'a, I> { + bv: &'a BitVector, + pos: usize, +} + +impl<'a, I> Iter<'a, I> { + /// Creates a new iterator. + pub const fn new(bv: &'a BitVector) -> Self { + Self { bv, pos: 0 } + } +} + +impl Iterator for Iter<'_, I> { + type Item = bool; + + fn next(&mut self) -> Option { + if self.pos < self.bv.len() { + let bit = self.bv.access(self.pos).unwrap(); + self.pos += 1; + Some(bit) + } else { + None + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.bv.len(), Some(self.bv.len())) + } +} + +impl BitVector { + /// Creates a new wrapper from data and index. + pub const fn new(data: BitVectorData, index: I) -> Self { + Self { data, index } + } + + /// Returns the number of bits stored. + pub const fn len(&self) -> usize { + self.data.len() + } + + /// Returns the `len` bits starting at `pos`, or [`None`] if out of bounds. + pub fn get_bits(&self, pos: usize, len: usize) -> Option { + self.data.get_bits(pos, len) + } + + /// Creates an iterator over all bits. + pub const fn iter(&self) -> Iter { + Iter { bv: self, pos: 0 } + } + + /// Collects all bits into a `Vec` for inspection. + pub fn to_vec(&self) -> Vec { + self.iter().collect() + } +} + +impl NumBits for BitVector { + fn num_bits(&self) -> usize { + self.data.len() + } + + fn num_ones(&self) -> usize { + self.index.num_ones(&self.data) + } +} + +impl Access for BitVector { + fn access(&self, pos: usize) -> Option { + self.data.access(pos) + } +} + +impl Rank for BitVector { + fn rank1(&self, pos: usize) -> Option { + self.index.rank1(&self.data, pos) + } + + fn rank0(&self, pos: usize) -> Option { + self.index.rank0(&self.data, pos) + } +} + +impl Select for BitVector { + fn select1(&self, k: usize) -> Option { + self.index.select1(&self.data, k) + } + + fn select0(&self, k: usize) -> Option { + self.index.select0(&self.data, k) + } +} + +pub use rank9sel::Rank9SelIndex; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn no_index_wrapper() { + let data = BitVectorData::from_bits([true, false, false, true]); + let bv = BitVector::new(data, NoIndex); + + assert_eq!(bv.num_bits(), 4); + assert_eq!(bv.num_ones(), 2); + assert_eq!(bv.access(0), Some(true)); + assert_eq!(bv.rank1(4), Some(2)); + assert_eq!(bv.select1(1), Some(3)); + assert_eq!(bv.select0(0), Some(1)); + } + + #[test] + fn builder_freeze() { + let mut builder = BitVectorBuilder::new(); + builder.extend_bits([true, false]); + builder.push_bits(0b10, 2).unwrap(); + builder.set_bit(1, true).unwrap(); + let bv: BitVector = builder.freeze::(); + assert_eq!(bv.len(), 4); + assert_eq!(bv.get_bits(0, 4), Some(0b1011)); + } + + #[test] + fn from_bytes_roundtrip() { + let mut builder = BitVectorBuilder::new(); + builder.extend_bits([true, false, true, true, false]); + let expected: BitVector = builder.clone().freeze::(); + let (len, bytes) = builder.into_bytes(); + + let data = BitVectorData::from_bytes(len, bytes).unwrap(); + let other: BitVector = data.into(); + assert_eq!(expected, other); + } + + #[test] + fn get_bits_wrapper() { + let data = BitVectorData::from_bits([true, false, true, true, false]); + let bv = BitVector::new(data.clone(), NoIndex); + assert_eq!(data.get_bits(1, 3), Some(0b110)); + assert_eq!(data.get_bits(2, 4), None); + assert_eq!(bv.get_bits(1, 3), Some(0b110)); + assert_eq!(bv.get_bits(2, 4), None); + } + + #[test] + fn builder_push_bits_across_word() { + let mut builder = BitVectorBuilder::new(); + builder.extend_bits(core::iter::repeat(false).take(62)); + builder.push_bits(0b011111, 6).unwrap(); + let bv: BitVector = builder.freeze::(); + assert_eq!(bv.data.get_bits(61, 7).unwrap(), 0b0111110); + } + + #[test] + fn iter_collects() { + let data = BitVectorData::from_bits([true, false, true]); + let bv = BitVector::new(data, NoIndex); + let collected: Vec = bv.iter().collect(); + assert_eq!(collected, vec![true, false, true]); + } + + #[test] + fn to_vec_collects() { + let data = BitVectorData::from_bits([true, false, true]); + let bv = BitVector::new(data, NoIndex); + assert_eq!(bv.to_vec(), vec![true, false, true]); + } +} diff --git a/src/bit_vector/prelude.rs b/src/bit_vector/prelude.rs deleted file mode 100644 index b3aa6e8..0000000 --- a/src/bit_vector/prelude.rs +++ /dev/null @@ -1,9 +0,0 @@ -//! The prelude for bit vectors. -//! -//! The purpose of this module is to alleviate imports of many common traits for bit vectors. -//! -//! ``` -//! # #![allow(unused_imports)] -//! use jerky::bit_vector::prelude::*; -//! ``` -pub use crate::bit_vector::{Access, NumBits, Rank, Select}; diff --git a/src/bit_vector/rank9sel/inner.rs b/src/bit_vector/rank9sel/inner.rs index 46c654d..11d7e15 100644 --- a/src/bit_vector/rank9sel/inner.rs +++ b/src/bit_vector/rank9sel/inner.rs @@ -5,7 +5,7 @@ use anybytes::{Bytes, View}; use anyhow::Result; -use crate::bit_vector::bit_vector::BitVectorData; +use crate::bit_vector::BitVectorData; use crate::broadword; const BLOCK_LEN: usize = 8; @@ -232,7 +232,7 @@ impl Rank9SelIndex { /// # Examples /// /// ``` - /// use jerky::bit_vector::rank9sel::inner::Rank9SelIndex; + /// use jerky::bit_vector::Rank9SelIndex; /// use jerky::bit_vector::BitVectorData; /// /// let data = BitVectorData::from_bits([true, false, false, true]); @@ -278,7 +278,7 @@ impl Rank9SelIndex { /// # Examples /// /// ``` - /// use jerky::bit_vector::rank9sel::inner::Rank9SelIndex; + /// use jerky::bit_vector::Rank9SelIndex; /// use jerky::bit_vector::BitVectorData; /// let data = BitVectorData::from_bits([true, false, false, true]); /// let idx = Rank9SelIndex::::new(&data); @@ -310,7 +310,7 @@ impl Rank9SelIndex { /// # Examples /// /// ``` - /// use jerky::bit_vector::rank9sel::inner::Rank9SelIndex; + /// use jerky::bit_vector::Rank9SelIndex; /// use jerky::bit_vector::BitVectorData; /// let data = BitVectorData::from_bits([true, false, false, true]); /// let idx = Rank9SelIndex::::new(&data); @@ -384,7 +384,7 @@ impl Rank9SelIndex { /// # Examples /// /// ``` - /// use jerky::bit_vector::rank9sel::inner::Rank9SelIndex; + /// use jerky::bit_vector::Rank9SelIndex; /// use jerky::bit_vector::BitVectorData; /// let data = BitVectorData::from_bits([true, false, false, true]); /// let idx = Rank9SelIndex::::new(&data); @@ -535,7 +535,7 @@ impl Rank9SelIndex { } } -impl crate::bit_vector::bit_vector::BitVectorIndex +impl crate::bit_vector::BitVectorIndex for Rank9SelIndex { fn build(data: &BitVectorData) -> Self { diff --git a/src/char_sequences/wavelet_matrix.rs b/src/char_sequences/wavelet_matrix.rs index d8a9d62..d5b144e 100644 --- a/src/char_sequences/wavelet_matrix.rs +++ b/src/char_sequences/wavelet_matrix.rs @@ -6,9 +6,9 @@ use std::ops::Range; use anyhow::{anyhow, Result}; -use crate::bit_vector::bit_vector::{BitVectorBuilder, BitVectorIndex}; -use crate::bit_vector::rank9sel::inner::Rank9SelIndex; -use crate::bit_vector::{Access, BitVector, NumBits, Rank, Select}; +use crate::bit_vector::{ + Access, BitVector, BitVectorBuilder, BitVectorIndex, NumBits, Rank, Rank9SelIndex, Select, +}; use crate::int_vectors::{CompactVector, CompactVectorBuilder}; use crate::utils; diff --git a/src/int_vectors/compact_vector.rs b/src/int_vectors/compact_vector.rs index 0ff5965..47b805f 100644 --- a/src/int_vectors/compact_vector.rs +++ b/src/int_vectors/compact_vector.rs @@ -4,7 +4,7 @@ use anyhow::{anyhow, Result}; use num_traits::ToPrimitive; -use crate::bit_vector::bit_vector::BitVectorBuilder; +use crate::bit_vector::BitVectorBuilder; use crate::bit_vector::{BitVector, NoIndex}; use crate::int_vectors::prelude::*; use crate::utils; diff --git a/src/int_vectors/dacs_byte.rs b/src/int_vectors/dacs_byte.rs index a3ba30f..b5f3a73 100644 --- a/src/int_vectors/dacs_byte.rs +++ b/src/int_vectors/dacs_byte.rs @@ -6,9 +6,7 @@ use std::convert::TryFrom; use anyhow::{anyhow, Result}; use num_traits::ToPrimitive; -use crate::bit_vector::bit_vector::BitVectorBuilder; -use crate::bit_vector::rank9sel::inner::Rank9SelIndex; -use crate::bit_vector::{self, BitVector, Rank}; +use crate::bit_vector::{self, BitVector, BitVectorBuilder, Rank, Rank9SelIndex}; use crate::int_vectors::{Access, Build, NumVals}; use crate::utils; use anybytes::{Bytes, View};