@@ -255,53 +255,47 @@ fn main() {
Efficient Rejection Sampling:
256256
257257``` rust
258- use dartminhash :: ErsWmh ;
258+ use dartminhash :: { ErsWmh } ;
259259use dartminhash :: rng_utils :: mt_from_seed;
260260
/// Builds a per-dimension cap vector that dominates every weighted set in `sets`.
///
/// The result has length `d`, and every entry starts at 1. For each `(id, w)`
/// pair with `w > 0.0`, the entry at index `id` is raised to at least
/// `ceil(w)` (never below 1). Pairs with a non-positive or NaN weight are
/// ignored.
///
/// # Panics
///
/// Panics if any pair with a positive weight has `id >= d`, because the id is
/// used directly as an index into the cap vector.
fn caps_from_sets(d: usize, sets: &[&[(u64, f64)]]) -> Vec<u32> {
    let mut m = vec![1u32; d];
    for &(id, w) in sets.iter().copied().flatten() {
        if w > 0.0 {
            let idx = id as usize;
            // Round up so the integer cap is never smaller than the weight.
            let cap = (w.ceil() as u32).max(1);
            m[idx] = m[idx].max(cap);
        }
    }
    m
}
273274
274275fn main () {
275- let d : usize = 1_000 ;
276- let k : u64 = 128 ; // number of buckets
277- let mut rng = mt_from_seed (42 );
276+ let mut rng = mt_from_seed (1337 );
278277
279- let sample_a = vec! [
280- (5 , 1.2 ),
281- (17 , 0.9 ),
282- (23 , 1.1 ),
283- (42 , 0.95 ),
284- (100 , 1.0 ),
285- ];
286- let sample_b = vec! [
287- (5 , 1.0 ),
288- (17 , 1.0 ),
289- (44 , 1.1 ),
290- (100 , 1.05 ),
291- ];
278+ let d : usize = 200_000 ;
279+ let k : u64 = 1024 ;
280+ let L : u64 = 512 ; // try 256–1024; larger L → fewer pre-densify empties
292281
293- let m_per_dim = caps_from_pair (d , & sample_a , & sample_b );
282+ // Two weighted vectors
283+ let a = vec! [(5 , 1.2 ), (17 , 0.9 ), (23 , 1.1 ), (42 , 0.95 ), (100 , 1.0 )];
284+ let b = vec! [(5 , 1.0 ), (17 , 1.0 ), (44 , 1.1 ), (100 , 1.05 )];
285+
286+ // Caps must dominate both vectors
287+ let m_per_dim = caps_from_sets (d , & [& a , & b ]);
294288
295- // ERS: early-stopping k-bucket sketch, ids per bucket come from accepted r*
296289 let ers = ErsWmh :: new_mt (& mut rng , & m_per_dim , k );
297- let sk_a = ers . sketch_early_stop (& sample_a ); // Vec<(id, rank)>
298- let sk_b = ers . sketch_early_stop (& sample_b );
299290
300- // Estimate J via id-collision rate across buckets
301- let hits = sk_a . iter (). zip (sk_b . iter ()). filter (| (x , y )| x . 0 == y . 0 ). count ();
302- let est_jaccard = hits as f64 / (k as f64 );
291+ // ERS returns k (id, rank) pairs; collisions on id estimate Jaccard
292+ let sk_a = ers . sketch (& a , Some (L ));
293+ let sk_b = ers . sketch (& b , Some (L ));
294+
295+ let hits = sk_a . iter (). zip (& sk_b ). filter (| (x , y )| x . 0 == y . 0 ). count ();
296+ let j_est = hits as f64 / k as f64 ;
303297
304- println! (" ERS estimated weighted Jaccard: {:.4}" , est_jaccard );
298+ println! (" ERS (L={}) estimated weighted Jaccard: {:.4}" , L , j_est );
305299}
306300
307301```
0 commit comments