Skip to content

Commit 82062d2

Browse files
committed
small bug ers
1 parent 2b5844f commit 82062d2

File tree

1 file changed

+32
-41
lines changed

1 file changed

+32
-41
lines changed

src/rejsmp.rs

Lines changed: 32 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -193,20 +193,22 @@ impl ErsWmh {
193193
r <= (mi as f64) + xi
194194
}
195195

196-
/// Algorithm 2:
197196
/// - `max_attempts` is reinterpreted as **L** (sequence length per hash).
198197
/// * None: use L_DEFAULT (1024)
199198
/// * Some(L): run at most L draws per hash (the scan stops at the first green draw)
199+
/// For each j in 0..K, scan r_{j,1..L}; take the first green. If none, mark E.
200+
/// Densification: replace each E by a uniformly chosen non-empty bucket (using
201+
/// shared, data-independent randomness from tabulation).
202+
/// `max_attempts` is interpreted as L (per-hash sequence length).
200203
pub fn sketch(&self, x: &[(u64, f64)], max_attempts: Option<u64>) -> Vec<Dart> {
201204
// Tunable default L (per-hash sequence length)
202205
const L_DEFAULT: u32 = 1024;
203-
204206
let l_per_hash: u32 = max_attempts.map(|v| v as u32).unwrap_or(L_DEFAULT);
205207

206208
let w = dense_weights(self.d, x);
207209
let k_usize = self.k as usize;
208210

209-
// Degenerate cases: no mass or M==0, deterministic fallback
211+
// Degenerate cases (no mass or M == 0): deterministic fallback
210212
let m_total = self.index.m_total();
211213
let mass: f64 = w.iter().sum();
212214
if m_total == 0 || mass == 0.0 {
@@ -221,43 +223,39 @@ impl ErsWmh {
221223
let m = m_total as f64;
222224

223225
// One slot per hash j
224-
let mut buckets: Vec<Option<BucketKey>> = vec![None; k_usize];
225-
let mut any_filled = false;
226+
let mut buckets: Vec<Option<(u64 /*id*/, u32 /*time*/)>> = vec![None; k_usize];
226227

227228
// For each hash position j, scan a fixed-length sequence {r_{j,t}}_{t=1..L}
228229
for j in 0..k_usize {
229-
// deterministic stream: r_{j,t} = M * U(0,1) from tabulation on (j,t)
230-
let mut chosen: Option<BucketKey> = None;
231-
232230
for t in 1..=l_per_hash {
233-
// Key for the (j,t) draw
231+
// Per-draw key: (j, t) ⇒ r in [0,M)
234232
let key = ((j as u64) << 32) ^ (t as u64);
235233
let mut u = to_unit(self.t_u.hash(key));
236-
if u >= 1.0 { u = f64::from_bits(0x3fefffffffffffff); } // clamp to < 1
234+
if u >= 1.0 {
235+
u = f64::from_bits(0x3fefffffffffffff); // clamp to < 1
236+
}
237237
let r = m * u;
238238

239-
// Identify which component i this r falls into
240-
let (i, _mi) = self.index.comp_of(r);
241-
242239
// Accept if green for this vector
243240
if self.is_green(&w, r) {
244-
// IMPORTANT: ID must be based on the component index i (and j),
245-
// not on t; this is what preserves unbiased Jaccard.
246-
let id_key = ((j as u64) << 32) ^ (i as u64);
247-
let id = self.t_id.hash(id_key);
248-
chosen = Some(BucketKey { time: t, hash_id: id });
241+
// **Identity must be per-draw** so that two sets collide
242+
// iff they accepted the SAME r_{j,t}. Use (j,t).
243+
let id = self.t_id.hash(key);
244+
buckets[j] = Some((id, t));
249245
break;
250246
}
251247
}
252-
253-
if let Some(kv) = chosen {
254-
buckets[j] = Some(kv);
255-
any_filled = true;
256-
}
257248
}
258249

259-
// If none filled (extremely unlikely with reasonable L), deterministic fallback
260-
if !any_filled {
250+
// Build donor list (indices of non-empty buckets)
251+
let donors: Vec<usize> = buckets
252+
.iter()
253+
.enumerate()
254+
.filter_map(|(idx, v)| if v.is_some() { Some(idx) } else { None })
255+
.collect();
256+
257+
// If none filled (unlikely with decent L), deterministic fallback
258+
if donors.is_empty() {
261259
let mut out = Vec::with_capacity(k_usize);
262260
for j in 0..k_usize {
263261
let fake = (self.t_rot.hash(j as u32) as u64) << 32 | (j as u64);
@@ -266,30 +264,23 @@ impl ErsWmh {
266264
return out;
267265
}
268266

269-
// Densification by rotation: for each empty j, walk j + off, j + 2*off, ... (mod k)
270-
// using a deterministic, data-independent offset derived from j.
267+
// **Uniform** densification with shared randomness:
268+
// For each empty j, pick donor = donors[ H(j) mod donors.len() ].
269+
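// The index H(j) is uniform over donors (up to modulo bias) and depends only on j and the tabulation
270+
// seeds (shared across sets), not on the weights; note that the donor list itself is built from this input's non-empty buckets.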
271271
for j in 0..k_usize {
272272
if buckets[j].is_none() {
273-
let mut off = (self.t_rot.hash(j as u32) as usize) % k_usize;
274-
if off == 0 { off = 1; }
275-
let donor: BucketKey = {
276-
let mut t = 0usize;
277-
loop {
278-
let jj = (j + off * (t + 1)) % k_usize;
279-
if let Some(kv) = buckets[jj] { break kv; }
280-
t += 1;
281-
// Since at least one filled exists, this loop must terminate
282-
}
283-
};
284-
buckets[j] = Some(donor);
273+
let idx = (self.t_rot.hash(j as u32) as usize) % donors.len();
274+
let donor = donors[idx];
275+
buckets[j] = buckets[donor]; // copy donor's (id, time)
285276
}
286277
}
287278

288279
// Convert to (id, rank) = (hash_id, time as f64)
289280
let mut out = Vec::with_capacity(k_usize);
290281
for j in 0..k_usize {
291-
let key = buckets[j].unwrap();
292-
out.push((key.hash_id, key.time as f64));
282+
let (id, t) = buckets[j].unwrap();
283+
out.push((id, t as f64));
293284
}
294285
out
295286
}

0 commit comments

Comments
 (0)