Add maxnorm keyword argument to MIPSHash (second commit to fix issue #2).

kernelmethod · kernelmethod · commit c334620760f4 · 2020-01-16T13:38:53.000-07:00
diff --git a/src/hashes/mips_hash.jl b/src/hashes/mips_hash.jl
@@ -21,37 +21,60 @@ mutable struct MIPSHash{T <: Union{Float32,Float64}} <: AsymmetricLSHFunction
     Qshift :: Vector{T}
     m :: Int64
 
+    # An upper bound on the norm of the data points this hash function will
+    # process
+    maxnorm :: T
+
     # Whether or not the number of coefficients per hash function should be
     # expanded to be a power of 2 whenever we need to resize coeff_A.
     resize_pow2 :: Bool
 
     ### Internal MIPSHash constructors
-    function MIPSHash{T}(
-            n_hashes::Integer = 1;
-            scale::Real = 1,
-            m::Integer = 3,
-            resize_pow2::Bool = false) where {T <: Union{Float32,Float64}}
-
-        if n_hashes < 1
-            "n_hashes must be positive" |> ErrorException |> throw
-        elseif scale ≤ 0
-            "scaling factor `scale` must be positive" |> ErrorException |> throw
-        elseif m ≤ 0
-            "m must be positive" |> ErrorException |> throw
-        end
-
-        coeff_A = Matrix{T}(undef, n_hashes, 0)
-        coeff_B = randn(T, n_hashes, m)
-        scale = T(scale)
-        m = Int64(m)
-        shift = rand(T, n_hashes)
-        Qshift = coeff_B * fill(T(1/2), m) ./ scale .+ shift
-
-	    new{T}(coeff_A, coeff_B, scale, shift, Qshift, m, resize_pow2)
-    end
 end
 
 ### External MIPSHash constructors
+"""
+    MIPSHash{T}(n_hashes = 1; maxnorm, scale = 1, m = 3, resize_pow2 = false)
+
+Construct a `MIPSHash{T}` holding `n_hashes` hash functions.
+
+# Keywords
+- `maxnorm`: upper bound on the norm of the data points to be hashed. It has
+  no usable default and must be given as a positive number.
+- `scale`: positive factor the projections are divided by when hashing.
+- `m`: positive number of columns of the `coeff_B` coefficient matrix.
+- `resize_pow2`: whether to expand the number of coefficients per hash
+  function to a power of 2 whenever `coeff_A` is resized.
+
+Throws an `ErrorException` when `maxnorm` is not specified, or when any of
+`n_hashes`, `scale`, `m` or `maxnorm` is not positive.
+"""
+function MIPSHash{T}(n_hashes::Integer = 1;
+                     maxnorm::Union{Nothing,Real} = nothing,
+                     scale::Real = 1,
+                     m::Integer = 3,
+                     resize_pow2::Bool = false) where {T}
+    # A missing maxnorm is reported first, then the remaining arguments.
+    maxnorm === nothing && error("maxnorm must be specified for MIPSHash")
+    n_hashes < 1 && error("n_hashes must be positive")
+    scale ≤ 0 && error("scaling factor `scale` must be positive")
+    m ≤ 0 && error("m must be positive")
+    maxnorm ≤ 0 && error("maxnorm must be positive")
+
+    # coeff_A starts out with zero columns (no input size is known yet).
+    coeff_A = Matrix{T}(undef, n_hashes, 0)
+    coeff_B = randn(T, n_hashes, m)
+    scale_T = T(scale)
+    n_coeff = Int64(m)
+    shift = rand(T, n_hashes)
+
+    # Qshift: coeff_B times a length-m vector of 1/2, over scale, plus shift.
+    Qshift = coeff_B * fill(T(1/2), n_coeff) ./ scale_T .+ shift
+
+    return MIPSHash{T}(coeff_A, coeff_B, scale_T, shift, Qshift, n_coeff,
+                       maxnorm, resize_pow2)
+end
+
 
 MIPSHash(args...; dtype=Float32, kws...) =
 	MIPSHash{dtype}(args...; kws...)
@@ -108,20 +131,18 @@ h(P(x)) definitions
     end
 end
 
-function _MIPSHash_P(h :: MIPSHash{T}, x :: AbstractArray) where {T}
+function _MIPSHash_P(hashfn::MIPSHash{T}, x::AbstractArray) where {T}
     n = size(x,1)
-    if n > current_max_input_size(h)
-        resize!(h, size(x,1))
+    if n > current_max_input_size(hashfn)
+        resize!(hashfn, size(x,1))
     end
 
     norms = col_norms(x)
-    maxnorm = maximum(norms)
-    maxnorm = maxnorm == 0 ? 1 : maxnorm	# To handle some edge cases
-    BLAS.scal!(length(norms), 1/maxnorm, norms, 1)
+    BLAS.scal!(length(norms), 1/hashfn.maxnorm, norms, 1)
 
     # First, perform a matvec on x and the first array of coefficients.
     # Note: aTx is an n_hashes × n_inputs array
-    @views aTx = h.coeff_A[1:end,1:n] * x .* (1/maxnorm) |> mat
+    @views aTx = hashfn.coeff_A[1:end,1:n] * x .* (1/hashfn.maxnorm) |> mat
 
     # Compute norms^2, norms^4, ... norms^(2^m).
     # Multiply these by the second array of coefficients and add them to aTx, so
@@ -135,13 +156,13 @@ function _MIPSHash_P(h :: MIPSHash{T}, x :: AbstractArray) where {T}
     # concatenations.
     # Note that m is typically small, so these iterations don't do much to harm
     # performance
-    for ii = 1:h.m
+    for ii = 1:hashfn.m
         norms .^= 2
-        MIPSHash_P_update_aTx!(h.coeff_B[:,ii], norms, aTx)
+        MIPSHash_P_update_aTx!(hashfn.coeff_B[:,ii], norms, aTx)
     end
 
     # Compute the remainder of the hash the same way we'd compute an L^p distance LSH.
-    @. aTx = aTx / h.scale + h.shift
+    @. aTx = aTx / hashfn.scale + hashfn.shift
 
     return floor.(Int32, aTx)
 end
@@ -170,7 +191,7 @@ h(Q(x)) definitions
     end
 end
 
-function _MIPSHash_Q(hashfn::MIPSHash, x::AbstractArray)
+function _MIPSHash_Q(hashfn::MIPSHash{T}, x::AbstractArray) where T
     n = size(x,1)
     if n > current_max_input_size(hashfn)
         resize!(hashfn, n)
@@ -184,10 +205,8 @@ function _MIPSHash_Q(hashfn::MIPSHash, x::AbstractArray)
     # aTx (rather than before) so that we don't have to allocate a new array
     # of size(x). Moreover, for large input vectors, the size of aTx is typically
     # much smaller than the size of x.
-    f(x::T) where {T} = (x ≈ T(0) ? T(1) : x)
     norms = col_norms(x)
-    map!(f, norms, norms)
-
+    map!(x::T -> x ≈ T(0) ? T(1) : x, norms, norms)
     aTx .= aTx ./ norms'
 
     # Here, we would multiply the second array of coefficients by the elements
diff --git a/test/hashes/test_mips_hash.jl b/test/hashes/test_mips_hash.jl
@@ -12,38 +12,48 @@ Tests
     import SparseArrays: sprandn
 
     @testset "Can construct a simple MIPSHash" begin
-        hashfn = MIPSHash()
+        hashfn = MIPSHash(; maxnorm=1)
 
         @test n_hashes(hashfn) == 1
         @test hashtype(hashfn) == Vector{Int32}
         @test isa(hashfn, MIPSHash{Float32})    # Default dtype should be Float32
         @test isa(hashfn, LSH.AsymmetricLSHFunction)
 
         ##
-        hashfn = MIPSHash(12)
+        hashfn = MIPSHash(12; maxnorm=1)
 
         @test n_hashes(hashfn) == 12
 
         ##
-        hashfn = MIPSHash(; dtype=Float64)
+        hashfn = MIPSHash(; dtype=Float64, maxnorm=1)
 
         @test isa(hashfn, MIPSHash{Float64})
 
         ##
-        hashfn = MIPSHash{Float64}()
+        hashfn = MIPSHash{Float64}(; maxnorm=1)
         @test isa(hashfn, MIPSHash{Float64})
 
         ### Invalid hash function construction
-
-        @test_throws ErrorException MIPSHash(-1)
-        @test_throws ErrorException MIPSHash(; m=-1)
-        @test_throws ErrorException MIPSHash(; m=0)
-        @test_throws ErrorException MIPSHash(; scale=-1)
-        @test_throws ErrorException MIPSHash(; scale=0)
+        # Non-positive number of hash functions
+        @test_throws ErrorException MIPSHash(-1; maxnorm=1)
+        @test_throws ErrorException MIPSHash( 0; maxnorm=1)
+
+        # Non-positive m
+        @test_throws ErrorException MIPSHash(; m = -1, maxnorm=1)
+        @test_throws ErrorException MIPSHash(; m =  0, maxnorm=1)
+
+        # Non-positive scale factor
+        @test_throws ErrorException MIPSHash(; scale = -1, maxnorm=1)
+        @test_throws ErrorException MIPSHash(; scale =  0, maxnorm=1)
+
+        # maxnorm not specified or non-positive
+        @test_throws ErrorException MIPSHash()
+        @test_throws ErrorException MIPSHash(; maxnorm=-1)
+        @test_throws ErrorException MIPSHash(; maxnorm=0)
     end
 
     @testset "Hashing returns the correct data types" begin
-        hashfn = MIPSHash{Float64}(; scale=1, m=3)
+        hashfn = MIPSHash{Float64}(; maxnorm=20, scale=1, m=3)
 
         # Matrix{Float64} -> Matrix{Int32}
         x = randn(4, 10)
@@ -66,14 +76,16 @@ Tests
 
     @testset "MIPSHash h(P(x)) is correctly computed" begin
         n_hashes = 128
-        scale = 0.5
-        m = 3
-        hashfn = MIPSHash(n_hashes; scale=scale, m=m)
+        scale    = 0.5
+        m        = 3
+        x        = randn(20)
+        maxnorm  = 2*norm(x)
+
+        hashfn = MIPSHash(n_hashes; maxnorm=maxnorm, scale=scale, m=m)
 
         @test size(hashfn.coeff_B) == (n_hashes, 3)
         @test size(hashfn.shift) == (n_hashes,)
 
-        x = randn(20)
         hash = index_hash(hashfn, x)
 
         @test isa(hash, Vector{Int32})
@@ -87,7 +99,7 @@ Tests
         ### Compute hash manually
         # Start by performing the transform P(x)
         coeff = [hashfn.coeff_A hashfn.coeff_B]
-        u = x / norm(x)
+        u = x / maxnorm
         norm_powers = [norm(u)^2, norm(u)^4, norm(u)^8]
         Px = [u; norm_powers]
 
@@ -100,14 +112,16 @@ Tests
 
     @testset "MIPSHash h(Q(x)) is correctly computed" begin
         n_hashes = 128
-        scale = 0.5
-        m = 3
-        hashfn = MIPSHash(n_hashes; scale=scale, m=m)
+        scale    = 0.5
+        m        = 3
+        x        = randn(20)
+        maxnorm  = 2*norm(x)
+
+        hashfn = MIPSHash(n_hashes; maxnorm=maxnorm, scale=scale, m=m)
 
         @test size(hashfn.coeff_B) == (n_hashes, m)
         @test size(hashfn.shift) == (n_hashes,)
 
-        x = randn(40)
         hash = query_hash(hashfn, x)
 
         @test isa(hash, Vector{Int32})
@@ -135,7 +149,7 @@ Tests
 
     @testset "Hash inputs of different sizes" begin
         n_hashes = 16
-        hashfn = MIPSHash(n_hashes)
+        hashfn = MIPSHash(n_hashes; maxnorm=1000)
 
         index_hash(hashfn, rand(10))
         @test size(hashfn.coeff_A) == (n_hashes, 10)
@@ -157,7 +171,7 @@ Tests
     end
 
     @testset "resize_pow2 increases number of coefficients to powers of 2" begin
-        hashfn = MIPSHash(10; resize_pow2=true)
+        hashfn = MIPSHash(10; maxnorm=1000, resize_pow2=true)
         @test size(hashfn.coeff_A) == (10, 0)
 
         index_hash(hashfn, rand(3))
@@ -174,45 +188,40 @@ Tests
     end
 
     @testset "MIPSHash generates collisions for large inner products" begin
-        n_hashes = 256
-        scale = 1
-        m = 5
-        hashfn = MIPSHash(n_hashes; scale=scale, m=m)
-
-        x = randn(20)
-        x_query_hashes = query_hash(hashfn, x)
-
-        # Check that MIPSHash isn't just generating a single query hash
-        @test any(x_query_hashes .!= x_query_hashes[1])
+        input_length = 5; n_hashes = 128;
 
-        # Compute the indexing hashes for a dataset with four vectors:
-        # a) 10 * x (where x is the test query vector)
+        # Compare a random vector x against four other vectors:
+        # a) 10 * x
         # b) x
         # c) A vector of all zeros
         # d) -x
-        dataset = [(10*x) x zero(x) -x]
+        x = randn(input_length)
+        x2, x3, x4 = 10*x, zero(x), -x
+
+        maxnorm = (x, x2, x3, x4) .|> norm |> maximum
+        hashfn = MIPSHash(n_hashes; maxnorm=maxnorm)
+
+        x_query_hashes = query_hash(hashfn, x)
+
+        dataset = [x2 x x3 x4]
         p_hashes = index_hash(hashfn, dataset)
 
         # Each collection of hashes should be different from one another
         @test let result = true
-	        for (ii,jj) in product(1:4, 1:4)
-		        if ii != jj && p_hashes[:,ii] == p_hashes[:,jj]
-			        result = false
-			        break
-		        end
-	        end
-	        result
+            for (ii,jj) in Iterators.product(1:4, 1:4)
+	            if ii != jj && p_hashes[:,ii] == p_hashes[:,jj]
+		            result = false
+		            break
+	            end
+            end
+            result
         end
-
-        # The number of collisions should be highest for x and 2*x, second-highest
-        # for x and x, second-lowest for x and zeros, and lowest for x and -x
-        n_collisions = [sum(x_query_hashes .== p) for p in eachcol(p_hashes)]
-        @test n_collisions[1] > n_collisions[2] > n_collisions[3] > n_collisions[4]
     end
 
     @testset "Can compute hashes for sparse arrays" begin
         X = sprandn(Float32, 10, 1000, 0.2)
-        hashfn = MIPSHash(8; scale=1, m=1)
+        maxnorm = X |> eachcol .|> norm |> maximum
+        hashfn = MIPSHash(8; maxnorm=maxnorm, scale=1, m=1)
 
         ihashes = index_hash(hashfn, X)
         qhashes = query_hash(hashfn, X)
diff --git a/test/hashes/test_sign_alsh.jl b/test/hashes/test_sign_alsh.jl
@@ -110,7 +110,8 @@ Tests
         # The number of collisions should be highest for x and 2*x, second-highest
         # for x and x, second-lowest for x and zeros, and lowest for x and -x
         n_collisions = [sum(x_query_hashes .== p) for p in eachcol(p_hashes)]
-        @test n_collisions[1] > n_collisions[2] > n_collisions[3] > n_collisions[4]
+        @test n_collisions[1] > n_collisions[2] >
+              n_collisions[3] > n_collisions[4]
     end
 
     @testset "Can hash sparse arrays" begin
diff --git a/test/tables/test_table.jl b/test/tables/test_table.jl
@@ -126,7 +126,7 @@ using Test, Random, LSH
         n_inputs = 128
         n_hashes = 8
 
-        hashfn_mips = MIPSHash(n_hashes)
+        hashfn_mips = MIPSHash(n_hashes; maxnorm=input_size)
         hashfn_sign = SignALSH(n_hashes; maxnorm=input_size)
 
         for hashfn in (hashfn_mips, hashfn_sign)