Add a precision threshold to Euclidean and SqEuclidean (#63)

timholy · KristofferC · commit 9d09e914bb48 · 2017-02-07T13:51:49.000+01:00
If a matrix contains duplicated columns, the computed distance between identical points (which should be 0) is often of order 1e-8, because sqrt(roundoff error) ~ 1e-8. This adds an optional relative threshold to Euclidean and SqEuclidean that makes them recalculate the distance by direct subtraction when the points are close compared to their magnitudes.
diff --git a/README.md b/README.md
@@ -154,6 +154,32 @@ Each distance corresponds to a distance type. The type name and the correspondin
 
 **Note:** The formulas above are using *Julia*'s functions. These formulas are mainly for conveying the math concepts in a concise way. The actual implementation may use a faster way.
 
+### Precision for Euclidean and SqEuclidean
+
+For efficiency (see the benchmarks below), `Euclidean` and
+`SqEuclidean` make use of BLAS3 matrix-matrix multiplication to
+calculate distances.  This corresponds to the following expansion:
+
+```julia
+(x-y)^2 == x^2 - 2xy + y^2
+```
+
+However, the two sides are not exactly equal in the presence of roundoff
+error, and the expansion is least accurate when `x` and `y` are nearby
+points.  Consequently, `Euclidean` and `SqEuclidean` allow you to supply
+a relative tolerance that forces recalculation by direct subtraction:
+
+```julia
+julia> x = reshape([0.1, 0.3, -0.1], 3, 1);
+
+julia> pairwise(Euclidean(), x, x)
+1×1 Array{Float64,2}:
+ 7.45058e-9
+
+julia> pairwise(Euclidean(1e-12), x, x)
+1×1 Array{Float64,2}:
+ 0.0
+```
 
 ## Benchmarks
 
@@ -215,5 +241,3 @@ The table below compares the performance (measured in terms of average elapsed t
 | Mahalanobis | 0.373796 | 0.002359 | **158.4337** |
 
 For distances of which a major part of the computation is a quadratic form (e.g. *Euclidean*, *CosineDist*, *Mahalanobis*), the performance can be drastically improved by restructuring the computation and delegating the core part to ``GEMM`` in *BLAS*. The use of this strategy can easily lead to 100x performance gain over simple loops (see the highlighted part of the table above).
-
-
diff --git a/src/metrics.jl b/src/metrics.jl
@@ -6,8 +6,12 @@
 #
 ###########################################################
 
-type Euclidean <: Metric end
-type SqEuclidean <: SemiMetric end
+immutable Euclidean <: Metric
+    thresh::Float64
+end
+immutable SqEuclidean <: SemiMetric
+    thresh::Float64
+end
 type Chebyshev <: Metric end
 type Cityblock <: Metric end
 type Jaccard <: Metric end
@@ -53,6 +57,44 @@ type SpanNormDist <: SemiMetric end
 
 typealias UnionMetrics Union{Euclidean, SqEuclidean, Chebyshev, Cityblock, Minkowski, Hamming, Jaccard, RogersTanimoto, CosineDist, CorrDist, ChiSqDist, KLDivergence, RenyiDivergence, JSDivergence, SpanNormDist}
 
+"""
+    Euclidean([thresh])
+
+Create a Euclidean metric.
+
+When computing distances among large numbers of points, it can be much
+more efficient to exploit the formula
+
+    (x-y)^2 = x^2 - 2xy + y^2
+
+However, this can introduce roundoff error. `thresh` (which defaults
+to 0) is a relative tolerance: whenever `x^2 - 2xy + y^2` is less than
+`thresh * (x^2 + y^2)`, the distance is recalculated using the more
+precise direct (elementwise-subtraction) formula.
+
+# Example:
+```julia
+julia> x = reshape([0.1, 0.3, -0.1], 3, 1);
+
+julia> pairwise(Euclidean(), x, x)
+1×1 Array{Float64,2}:
+ 7.45058e-9
+
+julia> pairwise(Euclidean(1e-12), x, x)
+1×1 Array{Float64,2}:
+ 0.0
+```
+"""
+Euclidean() = Euclidean(0)
+
+"""
+    SqEuclidean([thresh])
+
+Create a squared-Euclidean semi-metric. For the meaning of `thresh`,
+see [`Euclidean`](@ref).
+"""
+SqEuclidean() = SqEuclidean(0)
+
 ###########################################################
 #
 #  Define Evaluate
@@ -289,6 +331,7 @@ end
 end
 rogerstanimoto{T <: Bool}(a::AbstractArray{T}, b::AbstractArray{T}) = evaluate(RogersTanimoto(), a, b)
 
+
 ###########################################################
 #
 #  Special method
@@ -300,28 +343,65 @@ function pairwise!(r::AbstractMatrix, dist::SqEuclidean, a::AbstractMatrix, b::A
     At_mul_B!(r, a, b)
     sa2 = sumabs2(a, 1)
     sb2 = sumabs2(b, 1)
-    pdist!(r, sa2, sb2)
-end
-function pdist!(r, sa2, sb2)
-    for j = 1 : size(r,2)
-        sb = sb2[j]
-        @simd for i = 1 : size(r,1)
-            @inbounds r[i,j] = sa2[i] + sb - 2 * r[i,j]
+    threshT = convert(eltype(r), dist.thresh)
+    if threshT <= 0
+        # If there's no chance of triggering the threshold, we can use @simd
+        for j = 1 : size(r,2)
+            sb = sb2[j]
+            @simd for i = 1 : size(r,1)
+                @inbounds r[i,j] = sa2[i] + sb - 2 * r[i,j]
+            end
+        end
+    else
+        for j = 1 : size(r,2)
+            sb = sb2[j]
+            for i = 1 : size(r,1)
+                @inbounds selfterms = sa2[i] + sb
+                @inbounds v = selfterms - 2*r[i,j]
+                if v < threshT*selfterms
+                    # The distance is likely to be inaccurate, recalculate at higher prec.
+                    # This reflects the following:
+                    #   ((x+ϵ) - y)^2 ≈ x^2 - 2xy + y^2 + O(ϵ)    when |x-y| >> ϵ
+                    #   ((x+ϵ) - y)^2 ≈ O(ϵ^2)                    otherwise
+                    v = zero(v)
+                    for k = 1:size(a,1)
+                        @inbounds v += (a[k,i]-b[k,j])^2
+                    end
+                end
+                @inbounds r[i,j] = v
+            end
         end
     end
     r
 end
+
 function pairwise!(r::AbstractMatrix, dist::SqEuclidean, a::AbstractMatrix)
     m, n = get_pairwise_dims(r, a)
     At_mul_B!(r, a, a)
     sa2 = sumsq_percol(a)
+    threshT = convert(eltype(r), dist.thresh)
     @inbounds for j = 1 : n
         for i = 1 : j-1
             r[i,j] = r[j,i]
         end
         r[j,j] = 0
-        for i = j+1 : n
-            r[i,j] = sa2[i] + sa2[j] - 2 * r[i,j]
+        sa2j = sa2[j]
+        if threshT <= 0
+            @simd for i = j+1 : n
+                r[i,j] = sa2[i] + sa2j - 2 * r[i,j]
+            end
+        else
+            for i = j+1 : n
+                selfterms = sa2[i] + sa2j
+                v = selfterms - 2*r[i,j]
+                if v < threshT*selfterms
+                    v = zero(v)
+                    for k = 1:size(a,1)
+                        v += (a[k,i]-a[k,j])^2
+                    end
+                end
+                r[i,j] = v
+            end
         end
     end
     r
@@ -333,10 +413,23 @@ function pairwise!(r::AbstractMatrix, dist::Euclidean, a::AbstractMatrix, b::Abs
     At_mul_B!(r, a, b)
     sa2 = sumsq_percol(a)
     sb2 = sumsq_percol(b)
+    threshT = convert(eltype(r), dist.thresh)
     @inbounds for j = 1 : nb
+        sb = sb2[j]
         for i = 1 : na
-            v = sa2[i] + sb2[j] - 2 * r[i,j]
-            r[i,j] = isnan(v) ? NaN : sqrt(max(v, 0.))
+            selfterms = sa2[i] + sb
+            v = selfterms - 2*r[i,j]
+            if v < threshT*selfterms
+                # The distance is likely to be inaccurate, recalculate directly
+                # This reflects the following:
+                #   while sqrt(x+ϵ) ≈ sqrt(x) + O(ϵ/sqrt(x)) when |x| >> ϵ,
+                #         sqrt(x+ϵ) ≈ O(sqrt(ϵ))             otherwise.
+                v = zero(v)
+                for k = 1:m
+                    v += (a[k,i]-b[k,j])^2
+                end
+            end
+            r[i,j] = sqrt(v)
         end
     end
     r
@@ -346,14 +439,23 @@ function pairwise!(r::AbstractMatrix, dist::Euclidean, a::AbstractMatrix)
     m, n = get_pairwise_dims(r, a)
     At_mul_B!(r, a, a)
     sa2 = sumsq_percol(a)
+    threshT = convert(eltype(r), dist.thresh)
     @inbounds for j = 1 : n
         for i = 1 : j-1
             r[i,j] = r[j,i]
         end
-        @inbounds r[j,j] = 0
+        r[j,j] = 0
+        sa2j = sa2[j]
         for i = j+1 : n
-            v = sa2[i] + sa2[j] - 2 * r[i,j]
-            r[i,j] = isnan(v) ? NaN : sqrt(max(v, 0.))
+            selfterms = sa2[i] + sa2j
+            v = selfterms - 2*r[i,j]
+            if v < threshT*selfterms
+                v = zero(v)
+                for k = 1:m
+                    v += (a[k,i]-a[k,j])^2
+                end
+            end
+            r[i,j] = sqrt(v)
         end
     end
     r
diff --git a/test/test_dists.jl b/test/test_dists.jl
@@ -103,7 +103,7 @@ w = rand(size(a))
 p = r = rand(12)
 p[p .< 0.3] = 0.0
 scale = sum(p) / sum(r)
-r /= sum(r)    
+r /= sum(r)
 p /= sum(p)
 q = rand(12)
 q /= sum(q)
@@ -121,14 +121,14 @@ end
 @test renyi_divergence(p, p, rand()) ≈ 0
 @test renyi_divergence(p, p, 1.0 + rand()) ≈ 0
 @test renyi_divergence(p, p, Inf) ≈ 0
-@test renyi_divergence(p, r, 0) ≈ -log(scale)    
-@test renyi_divergence(p, r, 1) ≈ -log(scale)    
-@test renyi_divergence(p, r, rand()) ≈ -log(scale)    
+@test renyi_divergence(p, r, 0) ≈ -log(scale)
+@test renyi_divergence(p, r, 1) ≈ -log(scale)
+@test renyi_divergence(p, r, rand()) ≈ -log(scale)
 @test renyi_divergence(p, r, Inf) ≈ -log(scale)
 @test isinf(renyi_divergence([0.0, 0.5, 0.5], [0.0, 1.0, 0.0], Inf))
 @test renyi_divergence([0.0, 1.0, 0.0], [0.0, 0.5, 0.5], Inf) ≈ log(2.0)
 @test renyi_divergence(p, q, 1) ≈ kl_divergence(p, q)
-    
+
 pm = (p + q) / 2
 jsv = kl_divergence(p, pm) / 2 + kl_divergence(q, pm) / 2
 @test js_divergence(p, p) ≈ 0.0
@@ -385,3 +385,19 @@ Q = Q * Q'  # make sure Q is positive-definite
 @test_pairwise Mahalanobis(Q) X Y
 
 end #testset
+
+@testset "Euclidean precision" begin
+    X = [0.1 0.2; 0.3 0.4; -0.1 -0.1]
+    pd = pairwise(Euclidean(1e-12), X, X)
+    @test pd[1,1] == 0
+    @test pd[2,2] == 0
+    pd = pairwise(Euclidean(1e-12), X)
+    @test pd[1,1] == 0
+    @test pd[2,2] == 0
+    pd = pairwise(SqEuclidean(1e-12), X, X)
+    @test pd[1,1] == 0
+    @test pd[2,2] == 0
+    pd = pairwise(SqEuclidean(1e-12), X)
+    @test pd[1,1] == 0
+    @test pd[2,2] == 0
+end