@@ -193,20 +193,22 @@ impl ErsWmh {
193193 r <= ( mi as f64 ) + xi
194194 }
195195
196- /// Algorithm 2:
197196 /// - `max_attempts` is reinterpreted as **L** (sequence length per hash).
198197 /// * None, L_DEFAULT
199198 /// * Some(L), run exactly L draws per hash
199+ /// For each j in 0..K, scan r_{j,1..L}; take the first green. If none, mark E.
200+ /// Densification: replace each E by a uniformly chosen non-empty bucket (using
201+ /// shared, data-independent randomness from tabulation).
202+ /// `max_attempts` is interpreted as L (per-hash sequence length).
200203 pub fn sketch ( & self , x : & [ ( u64 , f64 ) ] , max_attempts : Option < u64 > ) -> Vec < Dart > {
201204 // Tunable default L (per-hash sequence length)
202205 const L_DEFAULT : u32 = 1024 ;
203-
204206 let l_per_hash: u32 = max_attempts. map ( |v| v as u32 ) . unwrap_or ( L_DEFAULT ) ;
205207
206208 let w = dense_weights ( self . d , x) ;
207209 let k_usize = self . k as usize ;
208210
209- // Degenerate cases: no mass or M==0, deterministic fallback
211+ // Degenerate cases: no mass or M==0 → deterministic fallback
210212 let m_total = self . index . m_total ( ) ;
211213 let mass: f64 = w. iter ( ) . sum ( ) ;
212214 if m_total == 0 || mass == 0.0 {
@@ -221,43 +223,39 @@ impl ErsWmh {
221223 let m = m_total as f64 ;
222224
223225 // One slot per hash j
224- let mut buckets: Vec < Option < BucketKey > > = vec ! [ None ; k_usize] ;
225- let mut any_filled = false ;
226+ let mut buckets: Vec < Option < ( u64 /*id*/ , u32 /*time*/ ) > > = vec ! [ None ; k_usize] ;
226227
227228 // For each hash position j, scan a fixed-length sequence {r_{j,t}}_{t=1..L}
228229 for j in 0 ..k_usize {
229- // deterministic stream: r_{j,t} = M * U(0,1) from tabulation on (j,t)
230- let mut chosen: Option < BucketKey > = None ;
231-
232230 for t in 1 ..=l_per_hash {
233- // Key for the (j,t) draw
231+ // Per-draw key: (j, t) ⇒ r in [0,M)
234232 let key = ( ( j as u64 ) << 32 ) ^ ( t as u64 ) ;
235233 let mut u = to_unit ( self . t_u . hash ( key) ) ;
236- if u >= 1.0 { u = f64:: from_bits ( 0x3fefffffffffffff ) ; } // clamp to < 1
234+ if u >= 1.0 {
235+ u = f64:: from_bits ( 0x3fefffffffffffff ) ; // clamp to < 1
236+ }
237237 let r = m * u;
238238
239- // Identify which component i this r falls into
240- let ( i, _mi) = self . index . comp_of ( r) ;
241-
242239 // Accept if green for this vector
243240 if self . is_green ( & w, r) {
244- // IMPORTANT: ID must be based on the component index i (and j),
245- // not on t; this is what preserves unbiased Jaccard.
246- let id_key = ( ( j as u64 ) << 32 ) ^ ( i as u64 ) ;
247- let id = self . t_id . hash ( id_key) ;
248- chosen = Some ( BucketKey { time : t, hash_id : id } ) ;
241+ // **Identity must be per-draw** so that two sets collide
242+ // iff they accepted the SAME r_{j,t}. Use (j,t).
243+ let id = self . t_id . hash ( key) ;
244+ buckets[ j] = Some ( ( id, t) ) ;
249245 break ;
250246 }
251247 }
252-
253- if let Some ( kv) = chosen {
254- buckets[ j] = Some ( kv) ;
255- any_filled = true ;
256- }
257248 }
258249
259- // If none filled (extremely unlikely with reasonable L), deterministic fallback
260- if !any_filled {
250+ // Build donor list (indices of non-empty buckets)
251+ let donors: Vec < usize > = buckets
252+ . iter ( )
253+ . enumerate ( )
254+ . filter_map ( |( idx, v) | if v. is_some ( ) { Some ( idx) } else { None } )
255+ . collect ( ) ;
256+
257+ // If none filled (unlikely with decent L), deterministic fallback
258+ if donors. is_empty ( ) {
261259 let mut out = Vec :: with_capacity ( k_usize) ;
262260 for j in 0 ..k_usize {
263261 let fake = ( self . t_rot . hash ( j as u32 ) as u64 ) << 32 | ( j as u64 ) ;
@@ -266,30 +264,23 @@ impl ErsWmh {
266264 return out;
267265 }
268266
269- // Densification by rotation: for each empty j, walk j + off, j + 2*off, ... (mod k)
270- // using a deterministic, data-independent offset derived from j.
267+ // **Uniform** densification with shared randomness:
268+ // For each empty j, pick donor = donors[ H(j) mod donors.len() ].
269+ // H(j) is uniform over donors and depends only on j and the tabulation seeds
270+ // (shared across sets); the `donors` list itself is this vector's non-empty buckets.
271271 for j in 0 ..k_usize {
272272 if buckets[ j] . is_none ( ) {
273- let mut off = ( self . t_rot . hash ( j as u32 ) as usize ) % k_usize;
274- if off == 0 { off = 1 ; }
275- let donor: BucketKey = {
276- let mut t = 0usize ;
277- loop {
278- let jj = ( j + off * ( t + 1 ) ) % k_usize;
279- if let Some ( kv) = buckets[ jj] { break kv; }
280- t += 1 ;
281- // Since at least one filled exists, this loop must terminate
282- }
283- } ;
284- buckets[ j] = Some ( donor) ;
273+ let idx = ( self . t_rot . hash ( j as u32 ) as usize ) % donors. len ( ) ;
274+ let donor = donors[ idx] ;
275+ buckets[ j] = buckets[ donor] ; // copy donor's (id, time)
285276 }
286277 }
287278
288279 // Convert to (id, rank) = (hash_id, time as f64)
289280 let mut out = Vec :: with_capacity ( k_usize) ;
290281 for j in 0 ..k_usize {
291- let key = buckets[ j] . unwrap ( ) ;
292- out. push ( ( key . hash_id , key . time as f64 ) ) ;
282+ let ( id , t ) = buckets[ j] . unwrap ( ) ;
283+ out. push ( ( id , t as f64 ) ) ;
293284 }
294285 out
295286 }
0 commit comments