Skip to content

Commit 2b5844f

Browse files
committed
updating ers
1 parent 3384083 commit 2b5844f

File tree

1 file changed

+26
-32
lines changed

1 file changed

+26
-32
lines changed

README.md

Lines changed: 26 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -255,53 +255,47 @@ fn main() {
255255
Efficient Rejection Sampling:
256256

257257
```rust
258-
use dartminhash::ErsWmh;
258+
use dartminhash::{ErsWmh};
259259
use dartminhash::rng_utils::mt_from_seed;
260260

261-
/// Same cap helper as above
262-
fn caps_from_pair(d: usize, a: &[(u64, f64)], b: &[(u64, f64)]) -> Vec<u32> {
261+
fn caps_from_sets(d: usize, sets: &[&[(u64, f64)]]) -> Vec<u32> {
263262
let mut m = vec![1u32; d];
264-
for &(i, w) in a.iter().chain(b.iter()) {
265-
if w > 0.0 {
266-
let cap = (w.ceil() as u32).max(1);
267-
let idx = i as usize;
268-
if cap > m[idx] { m[idx] = cap; }
263+
for s in sets {
264+
for &(id, w) in *s {
265+
if w > 0.0 {
266+
let idx = id as usize;
267+
let cap = (w.ceil() as u32).max(1);
268+
if cap > m[idx] { m[idx] = cap; }
269+
}
269270
}
270271
}
271272
m
272273
}
273274

274275
fn main() {
275-
let d: usize = 1_000;
276-
let k: u64 = 128; // number of buckets
277-
let mut rng = mt_from_seed(42);
276+
let mut rng = mt_from_seed(1337);
278277

279-
let sample_a = vec![
280-
(5, 1.2),
281-
(17, 0.9),
282-
(23, 1.1),
283-
(42, 0.95),
284-
(100, 1.0),
285-
];
286-
let sample_b = vec![
287-
(5, 1.0),
288-
(17, 1.0),
289-
(44, 1.1),
290-
(100, 1.05),
291-
];
278+
let d: usize = 200_000;
279+
let k: u64 = 1024;
280+
let L: u64 = 512; // try 256–1024; a larger L leaves fewer empty buckets before densification
292281

293-
let m_per_dim = caps_from_pair(d, &sample_a, &sample_b);
282+
// Two weighted vectors
283+
let a = vec![(5, 1.2), (17, 0.9), (23, 1.1), (42, 0.95), (100, 1.0)];
284+
let b = vec![(5, 1.0), (17, 1.0), (44, 1.1), (100, 1.05)];
285+
286+
// Per-dimension caps must be at least as large as the weights in both vectors
287+
let m_per_dim = caps_from_sets(d, &[&a, &b]);
294288

295-
// ERS: early-stopping k-bucket sketch, ids per bucket come from accepted r*
296289
let ers = ErsWmh::new_mt(&mut rng, &m_per_dim, k);
297-
let sk_a = ers.sketch_early_stop(&sample_a); // Vec<(id, rank)>
298-
let sk_b = ers.sketch_early_stop(&sample_b);
299290

300-
// Estimate J via id-collision rate across buckets
301-
let hits = sk_a.iter().zip(sk_b.iter()).filter(|(x, y)| x.0 == y.0).count();
302-
let est_jaccard = hits as f64 / (k as f64);
291+
// ERS returns k (id, rank) pairs; the fraction of positions where the two sketches share an id estimates the weighted Jaccard similarity
292+
let sk_a = ers.sketch(&a, Some(L));
293+
let sk_b = ers.sketch(&b, Some(L));
294+
295+
let hits = sk_a.iter().zip(&sk_b).filter(|(x, y)| x.0 == y.0).count();
296+
let j_est = hits as f64 / k as f64;
303297

304-
println!("ERS estimated weighted Jaccard: {:.4}", est_jaccard);
298+
println!("ERS (L={}) estimated weighted Jaccard: {:.4}", L, j_est);
305299
}
306300

307301
```

0 commit comments

Comments
 (0)