Skip to content

Commit 0fe0b04

Browse files
committed
Use single_hash_collision_probability in MinHash collision probability tests (rather than using Jaccard similarity directly). Add tests for hashtype(::MinHash).
1 parent ab7b4d7 commit 0fe0b04

File tree

1 file changed

+8
-5
lines changed

1 file changed

+8
-5
lines changed

test/hashes/test_minhash.jl

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Tests
2121

2222
@test isa(hashfn, MinHash{eltype(symbols)})
2323
@test LSH.n_hashes(hashfn) == nh
24+
@test hashtype(hashfn) == UInt32
2425
end
2526
end
2627

@@ -91,7 +92,7 @@ Tests
9192
@test sum(hashes_1 .== hashes_2) > sum(hashes_1 .== hashes_3)
9293
end
9394

94-
@testset "Collision probability approx. equals Jaccard similarity" begin
95+
@testset "MinHash observed collision frequencies match probabilities" begin
9596
# In theory, the probability of collision for two datasets should be
9697
# roughly equal to the Jaccard similarity between those datasets
9798
symbols = collect(1:200)
@@ -107,13 +108,15 @@ Tests
107108
hashes_2 = hashfn(dataset_2)
108109
hashes_3 = hashfn(dataset_3)
109110

110-
sim_12 = jaccard(Set(dataset_1), Set(dataset_2))
111-
sim_13 = jaccard(Set(dataset_1), Set(dataset_3))
111+
sim_12 = similarity(hashfn)(Set(dataset_1), Set(dataset_2))
112+
sim_13 = similarity(hashfn)(Set(dataset_1), Set(dataset_3))
113+
prob_12 = LSH.single_hash_collision_probability(hashfn, sim_12)
114+
prob_13 = LSH.single_hash_collision_probability(hashfn, sim_13)
112115

113116
mean(x) = sum(x) / length(x)
114117

115-
@test abs(mean(hashes_1 .== hashes_2) - sim_12) < 0.01
116-
@test abs(mean(hashes_1 .== hashes_3) - sim_13) < 0.01
118+
@test abs(mean(hashes_1 .== hashes_2) - prob_12) < 0.01
119+
@test abs(mean(hashes_1 .== hashes_3) - prob_13) < 0.01
117120
end
118121

119122
@testset "Can omit symbol set to lazily update hash functions" begin

0 commit comments

Comments
 (0)