Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions ph/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ sip13 = ["seedable_hash/sip13"] # if enabled, makes available BuildSip13 that u
fnv = ["seedable_hash/fnv"]
sux = ["dep:sux", "mem_dbg"]
cacheline-ef = ["dep:cacheline-ef", "mem_dbg"]
epserde = ["dep:epserde", "sux/epserde", "seedable_hash/epserde"]
mem_dbg = ["dep:mem_dbg", "seedable_hash/mem_dbg"]

[dependencies]
bitm = { version="0.5", path="../bitm" }
Expand All @@ -31,9 +33,12 @@ seedable_hash = { version="0.2", path="../seedable_hash", default-features = fal
arrayvec = "0.7"
voracious_radix_sort = { version = "1.2.0", features = ["voracious_multithread"] }
cseq = { version = "0.1", path = "../cseq", optional = true }
sux = { version = "0.6", optional = true }
mem_dbg = { version="0.3", optional = true } # required by sux and cacheline-ef
sux = { version = "0.9", optional = true }
mem_dbg = { version = "0.3.2", optional = true } # required by sux and cacheline-ef
cacheline-ef = { version = "1.1.0", optional = true }
epserde = { version = "0.11", optional = true }
maligned = "0.2.1"
rdst = "0.20.14"

[target.'cfg(target_pointer_width = "32")'.dependencies]
aligned-vec = { version="0.6", optional=true } # for 32 bit support
Expand All @@ -43,4 +48,4 @@ criterion = "0.5"

[[bench]]
name = "fmph"
harness = false
harness = false
3 changes: 1 addition & 2 deletions ph/src/fmph/gofunction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -759,8 +759,7 @@ mod tests {
use std::fmt::{Debug, Display};
use crate::seeds::Bits;

fn test_read_write<GS: GroupSize + Sync, SS: SeedSize>(h: &GOFunction<GS, SS>)
where SS::VecElement: std::cmp::PartialEq + Debug
fn test_read_write<GS: GroupSize + Sync, SS: SeedSize<VecElement: std::cmp::PartialEq + Debug>>(h: &GOFunction<GS, SS>)
{
let mut buff = Vec::new();
h.write(&mut buff).unwrap();
Expand Down
7 changes: 6 additions & 1 deletion ph/src/phast/compressed_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,8 @@ impl<C> GetSize for LinearRegressionArray<C> {
}

/// Implementation of `CompressedArray` that stores each value with the same number of bits required to store the largest one.
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))]
pub struct Compact {
pub items: Box<[u64]>,
pub item_size: u8,
Expand Down Expand Up @@ -277,7 +279,10 @@ impl CompressedArray for CompactFast {


/// `CompressedArray` implementation based on Elias-Fano from the `sux` crate.
#[cfg(feature = "sux")] pub struct SuxEliasFano(sux::dict::elias_fano::EfSeq);
#[cfg(feature = "sux")]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))]
pub struct SuxEliasFano<E = sux::dict::elias_fano::EfSeq>(E);

#[cfg(feature = "sux")] impl CompressedBuilder for sux::dict::EliasFanoBuilder {
#[inline] fn new(num_of_values: usize, max_value: usize) -> Self {
Expand Down
2 changes: 2 additions & 0 deletions ph/src/phast/conf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ use super::SeedChooser;

/// PHast map-or-bump function configuration.
#[derive(Clone, Copy)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde), repr(C), epserde_zero_copy)]
#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))]
pub struct Conf {
pub(crate) buckets_num: usize, // number of buckets, B
pub(crate) slice_len_minus_one: u16, // slice length L
Expand Down
31 changes: 18 additions & 13 deletions ph/src/phast/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,20 @@ use voracious_radix_sort::RadixSort;
use rayon::prelude::*;

/// Represents map-or-bump function.
pub(crate) struct SeedEx<SSVecElement> {
pub(crate) seeds: Box<[SSVecElement]>,
#[cfg_attr(feature = "epserde", derive(epserde::Epserde), repr(C))]
#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))]
pub struct SeedEx<SSVecElement, B = Box<[SSVecElement]>> {
pub(crate) seeds: B,
pub(crate) conf: Conf,
pub(crate) _marker: std::marker::PhantomData<SSVecElement>
}

impl<SSVecElement> SeedEx<SSVecElement> {
#[inline(always)]
pub(crate) fn bucket_for(&self, key: u64) -> usize { self.conf.bucket_for(key) }

#[inline(always)]
pub(crate) fn seed_for<SS>(&self, seed_size: SS, key: u64) -> u16 where SS: SeedSize<VecElement=SSVecElement> {
pub(crate) fn seed_for<SS: SeedSize<VecElement=SSVecElement>>(&self, seed_size: SS, key: u64) -> u16 {
//self.seeds.get_fragment(self.bucket_for(key), self.conf.bits_per_seed()) as u16
seed_size.get_seed(&self.seeds, self.bucket_for(key))
}
Expand All @@ -32,9 +35,12 @@ impl<SSVecElement: GetSize> GetSize for SeedEx<SSVecElement> {
}


pub(crate) struct Level<SSVecElement> {
pub(crate) seeds: SeedEx<SSVecElement>,
pub(crate) shift: usize
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))]
pub struct Level<SSVecElement, S = SeedEx<SSVecElement>> {
pub(crate) seeds: S,
pub(crate) shift: usize,
pub(crate) _marker: std::marker::PhantomData<SSVecElement>
}

impl<SSVecElement: GetSize> GetSize for Level<SSVecElement> {
Expand All @@ -60,7 +66,7 @@ pub(crate) fn build_level_from_slice_st<K, SS, SC, S>(keys: &[K], params: &Param
keys_vec.extend(keys.into_iter().filter(|key| {
params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0
}).cloned());
(keys_vec, SeedEx::<SS::VecElement>{ seeds, conf }, unassigned_values, unassigned_len)
(keys_vec, SeedEx::<SS::VecElement>{ seeds, conf, _marker: std::marker::PhantomData }, unassigned_values, unassigned_len)
}

#[inline]
Expand All @@ -87,7 +93,7 @@ pub(crate) fn build_level_from_slice_mt<K, SS, SC, S>(keys: &[K], params: &Param
keys_vec.par_extend(keys.into_par_iter().filter(|key| {
params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0
}).cloned());
(keys_vec, SeedEx::<SS::VecElement>{ seeds, conf }, unassigned_values, unassigned_len)
(keys_vec, SeedEx::<SS::VecElement>{ seeds, conf, _marker: std::marker::PhantomData }, unassigned_values, unassigned_len)
}

#[inline(always)]
Expand All @@ -105,7 +111,7 @@ pub(crate) fn build_level_st<K, SS, SC, S>(keys: &mut Vec::<K>, params: &Params<
keys.retain(|key| {
params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0
});
(SeedEx::<SS::VecElement>{ seeds, conf }, unassigned_values, unassigned_len)
(SeedEx::<SS::VecElement>{ seeds, conf, _marker: std::marker::PhantomData }, unassigned_values, unassigned_len)
}

#[inline]
Expand Down Expand Up @@ -133,7 +139,7 @@ pub(crate) fn build_level_mt<K, SS, SC, S>(keys: &mut Vec::<K>, params: &Params<
keys.par_extend(result.into_par_iter().filter(|key| {
params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0
}));
(SeedEx::<SS::VecElement>{ seeds, conf }, unassigned_values, unassigned_len)
(SeedEx::<SS::VecElement>{ seeds, conf, _marker: std::marker::PhantomData }, unassigned_values, unassigned_len)
}

/// PHast (Perfect Hashing made fast) - Minimal Perfect Hash Function
Expand All @@ -149,8 +155,7 @@ pub(crate) fn build_level_mt<K, SS, SC, S>(keys: &mut Vec::<K>, params: &Params<
///
/// See:
/// Piotr Beling, Peter Sanders, *PHast - Perfect Hashing made fast*, 2025, <https://arxiv.org/abs/2504.17918>
pub struct Function<SS, SC = SeedOnly, CA = DefaultCompressedArray, S = BuildDefaultSeededHasher>
where SS: SeedSize
pub struct Function<SS: SeedSize, SC = SeedOnly, CA = DefaultCompressedArray, S = BuildDefaultSeededHasher>
{
level0: SeedEx<SS::VecElement>,
unassigned: CA,
Expand Down Expand Up @@ -307,7 +312,7 @@ impl<SS: SeedSize, SC: SeedChooser, CA: CompressedArray, S: BuildSeededHasher> F
unassigned.push(last);
}
}
levels.push(Level { seeds, shift });
levels.push(Level { seeds, shift, _marker: std::marker::PhantomData });
}
debug_assert!(level0_unassigned.next().is_none());
drop(level0_unassigned);
Expand Down
19 changes: 10 additions & 9 deletions ph/src/phast/function2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,15 @@ use voracious_radix_sort::RadixSort;
///
/// See:
/// Piotr Beling, Peter Sanders, *PHast - Perfect Hashing made fast*, 2025, <https://arxiv.org/abs/2504.17918>
pub struct Function2<SS, SC = ShiftOnly, CA = DefaultCompressedArray, S = BuildDefaultSeededHasher>
where SS: SeedSize
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))]
pub struct Function2<SS: SeedSize, SC = ShiftOnly, CA = DefaultCompressedArray, S = BuildDefaultSeededHasher, L0 = SeedEx<<SS as SeedSize>::VecElement>, L = Box<[Level<<SS as SeedSize>::VecElement>]>, LL = Level<<Bits8 as SeedSize>::VecElement>>
{
level0: SeedEx<SS::VecElement>,
level0: L0,
unassigned: CA,
levels: Box<[Level<SS::VecElement>]>,
levels: L,
hasher: S,
last_level: Level<<Bits8 as SeedSize>::VecElement>,
last_level: LL,
last_level_seed: u64,
seed_chooser: SC,
seed_size: SS,
Expand Down Expand Up @@ -184,15 +185,15 @@ impl<SS: SeedSize, SC: SeedChooser, CA: CompressedArray, S: BuildSeededHasher> F
unassigned.push(last);
}
}
levels.push(Level { seeds, shift });
levels.push(Level { seeds, shift, _marker: std::marker::PhantomData });
}
//dbg!(keys.len()); // TODO keys.len()==0
let mut last_seed = levels.len() as u64+1;
let last_shift;
let last_seeds =
if keys.is_empty() {
last_shift = 0;
SeedEx::<<Bits8 as SeedSize>::VecElement>{ seeds: Box::default(), conf: Conf { buckets_num: 0, slice_len_minus_one: 0, num_of_slices: 0 } }
SeedEx::<<Bits8 as SeedSize>::VecElement>{ seeds: Box::default(), conf: Conf { buckets_num: 0, slice_len_minus_one: 0, num_of_slices: 0 }, _marker: std::marker::PhantomData }
} else {
let (last_seeds, unassigned_values, _unassigned_len) =
Self::build_last_level(keys, &hasher, &mut last_seed);
Expand Down Expand Up @@ -223,7 +224,7 @@ impl<SS: SeedSize, SC: SeedChooser, CA: CompressedArray, S: BuildSeededHasher> F
levels: levels.into_boxed_slice(),
hasher,
seed_chooser,
last_level: Level { seeds: last_seeds, shift: last_shift },
last_level: Level { seeds: last_seeds, shift: last_shift, _marker: std::marker::PhantomData },
last_level_seed: last_seed,
seed_size,
}
Expand All @@ -245,7 +246,7 @@ impl<SS: SeedSize, SC: SeedChooser, CA: CompressedArray, S: BuildSeededHasher> F
if let Some((seeds, unassigned_values, unassigned_len)) =
build_last_level(&hashes, conf, bits_per_seed, evaluator.clone())
{
return (SeedEx{ seeds, conf }, unassigned_values, unassigned_len);
return (SeedEx{ seeds, conf, _marker: std::marker::PhantomData }, unassigned_values, unassigned_len);
}
*seed += 1;
//dbg!(*seed);
Expand Down
6 changes: 3 additions & 3 deletions ph/src/phast/partial.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use std::hash::{BuildHasher, Hash, RandomState};
///
/// See:
/// Piotr Beling, Peter Sanders, *PHast - Perfect Hashing made fast*, 2025, <https://arxiv.org/abs/2504.17918>
pub struct Partial<SS, SC = SeedOnly, S = RandomState> where SS: SeedSize {
pub struct Partial<SS: SeedSize, SC = SeedOnly, S = RandomState> {
seeds: SeedEx<SS::VecElement>,
hasher: S,
seed_chooser: SC,
Expand Down Expand Up @@ -135,7 +135,7 @@ impl<SS: SeedSize, SC: SeedChooser, S> Partial<SS, SC, S> {
hashes.voracious_sort();
let (seeds, build_conf) = build_st(hashes, conf, seed_size, bucket_evaluator, seed_chooser);
(Self {
seeds: SeedEx{ seeds, conf },
seeds: SeedEx{ seeds, conf, _marker: std::marker::PhantomData },
hasher,
seed_chooser,
seed_size
Expand All @@ -150,7 +150,7 @@ impl<SS: SeedSize, SC: SeedChooser, S> Partial<SS, SC, S> {
hashes.voracious_mt_sort(threads_num);
let (seeds, build_conf) = build_mt(hashes, conf, seed_size, WINDOW_SIZE, bucket_evaluator, seed_chooser, threads_num);
(Self {
seeds: SeedEx{ seeds, conf },
seeds: SeedEx{ seeds, conf, _marker: std::marker::PhantomData },
hasher,
seed_chooser,
seed_size,
Expand Down
10 changes: 5 additions & 5 deletions ph/src/phast/perfect.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ impl<SS: SeedSize, SC: SeedChooser, S: BuildSeededHasher> Perfect<SS, SC, S> {
while !keys.is_empty() {
let seeds = build_level(&mut keys, levels.len() as u64+1, &hasher);
let out_range = seeds.conf.output_range(seed_chooser, seed_size.into());
levels.push(Level { seeds, shift });
levels.push(Level { seeds, shift, _marker: std::marker::PhantomData });
shift += out_range;
}
Self {
Expand Down Expand Up @@ -159,7 +159,7 @@ impl<SS: SeedSize, SC: SeedChooser, S: BuildSeededHasher> Perfect<SS, SC, S> {
keys_vec.extend(keys.into_iter().filter(|key| {
params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0
}).cloned());
(keys_vec, SeedEx{ seeds, conf })
(keys_vec, SeedEx{ seeds, conf, _marker: std::marker::PhantomData })
}

#[inline]
Expand All @@ -182,7 +182,7 @@ impl<SS: SeedSize, SC: SeedChooser, S: BuildSeededHasher> Perfect<SS, SC, S> {
keys_vec.par_extend(keys.into_par_iter().filter(|key| {
params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0
}).cloned());
(keys_vec, SeedEx{ seeds, conf })
(keys_vec, SeedEx{ seeds, conf, _marker: std::marker::PhantomData })
}

#[inline(always)]
Expand All @@ -197,7 +197,7 @@ impl<SS: SeedSize, SC: SeedChooser, S: BuildSeededHasher> Perfect<SS, SC, S> {
keys.retain(|key| {
params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0
});
SeedEx{ seeds, conf }
SeedEx{ seeds, conf, _marker: std::marker::PhantomData }
}

#[inline]
Expand All @@ -224,7 +224,7 @@ impl<SS: SeedSize, SC: SeedChooser, S: BuildSeededHasher> Perfect<SS, SC, S> {
keys.par_extend(result.into_par_iter().filter(|key| {
params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0
}));
SeedEx{ seeds, conf }
SeedEx{ seeds, conf, _marker: std::marker::PhantomData }
}

/// Returns maximum number of keys which can be mapped to the same value by `k`-[`Perfect`] function `self`.
Expand Down
2 changes: 2 additions & 0 deletions ph/src/phast/seed_chooser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ const SMALL_BUCKET_LIMIT: usize = 8;
///
/// It chooses the best seed with a quite strong hasher, without a shift component,
/// which should lead to a small size, but a long construction time.
#[cfg_attr(feature = "epserde", derive(epserde::Epserde), epserde_deep_copy)]
#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))]
#[derive(Clone, Copy)]
pub struct SeedOnly;

Expand Down
2 changes: 2 additions & 0 deletions ph/src/phast/seed_chooser/shift.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ use super::SeedChooser;
///
/// It chooses the best seed using only shifting without wrapping,
/// which leads to very fast construction, but at the cost of a bigger size.
#[cfg_attr(feature = "epserde", derive(epserde::Epserde), repr(C), epserde_zero_copy)]
#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))]
#[derive(Clone, Copy, Default)]
pub struct ShiftOnly;

Expand Down
8 changes: 8 additions & 0 deletions ph/src/seeds.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ pub trait SeedSize: Copy + Into<u8> + Sync + TryFrom<u8, Error=&'static str> {
}

/// Size in bits.
#[cfg_attr(feature = "epserde", derive(epserde::Epserde), epserde_deep_copy)]
#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))]
#[derive(Copy, Clone)]
pub struct Bits(pub u8);

Expand Down Expand Up @@ -128,6 +130,8 @@ impl SeedSize for Bits {
/// Size in bits.
///
/// Uses unaligned reads/writes to access data in SeedSize implementation.
#[cfg_attr(feature = "epserde", derive(epserde::Epserde), epserde_deep_copy)]
#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))]
#[derive(Copy, Clone)]
pub struct BitsFast(pub u8);

Expand Down Expand Up @@ -197,6 +201,8 @@ impl SeedSize for BitsFast {
}

/// Seed size of 8 bits.
#[cfg_attr(feature = "epserde", derive(epserde::Epserde), epserde_deep_copy)]
#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))]
#[derive(Copy, Clone, Default)]
pub struct Bits8;

Expand Down Expand Up @@ -248,6 +254,8 @@ impl SeedSize for Bits8 {
}

/// Seed size given as a power of two (known at compile time).
#[cfg_attr(feature = "epserde", derive(epserde::Epserde), epserde_deep_copy)]
#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))]
#[derive(Copy, Clone, Default)]
pub struct TwoToPowerBitsStatic<const LOG2_BITS: u8>;

Expand Down
6 changes: 5 additions & 1 deletion seedable_hash/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,14 @@ keywords = [ "hashing", "hash", "seed", "reduce", "map" ]
[features]
default = ["wyhash"]
sip13 = [] # if enabled, makes available BuildSip13 that uses Sip13 from compiler internals
epserde = ["dep:epserde"]
mem_dbg = ["dep:mem_dbg"]

[dependencies]
wyhash = { version="0.5" , optional=true }
fnv = { version="1", optional=true }
gxhash = { version = "3.5", optional = true }
rapidhash = { version = "1.3", optional = true, default-features = false }
xxhash-rust = { version = "0.8", features = ["xxh3"], optional=true }
xxhash-rust = { version = "0.8", features = ["xxh3"], optional=true }
epserde = { version = "0.11.3", optional = true }
mem_dbg = { version = "0.3.0", optional = true }
Loading