Make pairwise work with unitful data (#230)

dkarrasch · web-flow · commit c63dc149780e · 2021-10-21T10:26:25.000+02:00
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "Distances"
 uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
-version = "0.10.4"
+version = "0.10.5"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
diff --git a/src/common.jl b/src/common.jl
@@ -86,8 +86,7 @@ end
 
 function norm_percol(a::AbstractMatrix{T}) where {T}
     n = size(a, 2)
-    √T = typeof(sqrt(oneunit(T)))
-    r = Vector{√T}(undef, n)
+    r = Vector{float(T)}(undef, n)
     @simd for j in 1:n
         aj = view(a, :, j)
         r[j] = sqrt(dot(aj, aj))
diff --git a/src/metrics.jl b/src/metrics.jl
@@ -619,22 +619,22 @@ function _pairwise!(r::AbstractMatrix, dist::Union{SqEuclidean,Euclidean},
     R = inplace ? mul!(r, a', b) : a'b
     sa2 = sum(abs2, a, dims=1)
     sb2 = sum(abs2, b, dims=1)
-    threshT = convert(eltype(r), dist.thresh)
-    @inbounds if threshT <= 0
+    z² = zero(real(eltype(R)))
+    @inbounds if dist.thresh <= 0
         # If there's no chance of triggering the threshold, we can use @simd
         for j = 1:nb
             sb = sb2[j]
             @simd for i = 1:na
-                r[i, j] = eval_end(dist, (max(sa2[i] + sb - 2real(R[i, j]), 0)))
+                r[i, j] = eval_end(dist, (max(sa2[i] + sb - 2real(R[i, j]), z²)))
             end
         end
     else
         for j = 1:nb
             sb = sb2[j]
             for i = 1:na
                 selfterms = sa2[i] + sb
-                v = max(selfterms - 2real(R[i, j]), 0)
-                if v < threshT * selfterms
+                v = max(selfterms - 2real(R[i, j]), z²)
+                if v < dist.thresh * selfterms
                     # The distance is likely to be inaccurate, recalculate directly
                     # This reflects the following:
                     #   while sqrt(x+ϵ) ≈ sqrt(x) + O(ϵ/sqrt(x)) when |x| >> ϵ,
@@ -658,22 +658,23 @@ function _pairwise!(r::AbstractMatrix, dist::Union{SqEuclidean,Euclidean}, a::Ab
     # the following checks if a'*b can be stored in r directly, it fails for complex eltypes
     R = inplace ? mul!(r, a', a) : a'a
     sa2 = sum(abs2, a, dims=1)
-    threshT = convert(eltype(r), dist.thresh)
+    safe = dist.thresh <= 0
+    z² = zero(real(eltype(R)))
     @inbounds for j = 1:n
         for i = 1:(j - 1)
             r[i, j] = r[j, i]
         end
-        r[j, j] = 0
+        r[j, j] = zero(eltype(r))
         sa2j = sa2[j]
-        if threshT <= 0
+        if safe
             @simd for i = (j + 1):n
-                r[i, j] = eval_end(dist, (max(sa2[i] + sa2j - 2real(R[i, j]), 0)))
+                r[i, j] = eval_end(dist, (max(sa2[i] + sa2j - 2real(R[i, j]), z²)))
             end
         else
             for i = (j + 1):n
                 selfterms = sa2[i] + sa2j
-                v = max(selfterms - 2real(R[i, j]), 0)
-                if v < threshT * selfterms
+                v = max(selfterms - 2real(R[i, j]), z²)
+                if v < dist.thresh * selfterms
                     v = zero(v)
                     for k = 1:m
                         v += abs2(a[k, i] - a[k, j])
@@ -698,9 +699,10 @@ function _pairwise!(r::AbstractMatrix, dist::Union{WeightedSqEuclidean,WeightedE
     # the following checks if a'*b can be stored in r directly, it fails for complex eltypes
     inplace = promote_type(eltype(r), typeof(oneunit(eltype(a))'oneunit(eltype(b)))) === eltype(r)
     R = inplace ? mul!(r, a', w .* b) : a'*Diagonal(w)*b
+    z² = zero(real(eltype(R)))
     for j = 1:nb
         @simd for i = 1:na
-            @inbounds r[i, j] = eval_end(dist, max(sa2[i] + sb2[j] - 2real(R[i, j]), 0))
+            @inbounds r[i, j] = eval_end(dist, max(sa2[i] + sb2[j] - 2real(R[i, j]), z²))
         end
     end
     r
@@ -715,14 +717,15 @@ function _pairwise!(r::AbstractMatrix, dist::Union{WeightedSqEuclidean,WeightedE
     # the following checks if a'*b can be stored in r directly, it fails for complex eltypes
     inplace = promote_type(eltype(r), typeof(oneunit(eltype(a))'oneunit(eltype(a)))) === eltype(r)
     R = inplace ? mul!(r, a', w .* a) : a'*Diagonal(w)*a
+    z² = zero(real(eltype(R)))
 
     @inbounds for j = 1:n
         for i = 1:(j - 1)
             r[i, j] = r[j, i]
         end
-        r[j, j] = 0
+        r[j, j] = zero(eltype(r))
         @simd for i = (j + 1):n
-            r[i, j] = eval_end(dist, max(sa2[i] + sa2[j] - 2real(R[i, j]), 0))
+            r[i, j] = eval_end(dist, max(sa2[i] + sa2[j] - 2real(R[i, j]), z²))
         end
     end
     r
@@ -734,28 +737,30 @@ function _pairwise!(r::AbstractMatrix, ::CosineDist,
                     a::AbstractMatrix, b::AbstractMatrix)
     require_one_based_indexing(r, a, b)
     m, na, nb = get_pairwise_dims(r, a, b)
-    mul!(r, a', b)
+    inplace = promote_type(eltype(r), typeof(oneunit(eltype(a))'oneunit(eltype(b)))) === eltype(r)
+    R = inplace ? mul!(r, a', b) : a'b
     ra = norm_percol(a)
     rb = norm_percol(b)
     for j = 1:nb
         @simd for i = 1:na
-            @inbounds r[i, j] = max(1 - r[i, j] / (ra[i] * rb[j]), 0)
+            @inbounds r[i, j] = max(1 - R[i, j] / (ra[i] * rb[j]), 0)
         end
     end
     r
 end
 function _pairwise!(r::AbstractMatrix, ::CosineDist, a::AbstractMatrix)
     require_one_based_indexing(r, a)
     m, n = get_pairwise_dims(r, a)
-    mul!(r, a', a)
+    inplace = promote_type(eltype(r), typeof(oneunit(eltype(a))'oneunit(eltype(a)))) === eltype(r)
+    R = inplace ? mul!(r, a', a) : a'a
     ra = norm_percol(a)
     @inbounds for j = 1:n
         for i = 1:(j - 1)
             r[i, j] = r[j, i]
         end
-        r[j, j] = 0
+        r[j, j] = zero(eltype(r))
         @simd for i = j + 1:n
-            r[i, j] = max(1 - r[i, j] / (ra[i] * ra[j]), 0)
+            r[i, j] = max(1 - R[i, j] / (ra[i] * ra[j]), 0)
         end
     end
     r
diff --git a/test/test_dists.jl b/test/test_dists.jl
@@ -878,7 +878,8 @@ end
     @test bregman(G, ∇G, p, q) ≈ ISdist(p, q)
 end
 
-@testset "Unitful vectors" begin
+@testset "Unitful data" begin
+    using Distances, Unitful.DefaultSymbols, Test, LinearAlgebra
     x = [1m, 2m, 3m]; y = [2m, 3m, 4m]; w = [1, 1, 1]; p = [2m, 2m, 2m]
     @test @inferred sqeuclidean(x, y) == 3m^2
     @test @inferred euclidean(x, y) == sqrt(3)m
@@ -903,6 +904,34 @@ end
     @test @inferred wminkowski(x, y, w, 2) == euclidean(x, y)
     @test @inferred whamming(x, y, w) == hamming(x, y)
     @test @inferred peuclidean(x, y, p) == sqrt(3)m
+
+    X = [x y]; Y = [y x]
+    # check specialized pairwise implementations
+    @test pairwise(Euclidean(), X, dims=2)[1,1] == 0m
+    @test pairwise(Euclidean(), X, dims=2)[1,2] == sqrt(3)m
+    @test pairwise(SqEuclidean(), X, dims=2)[1,1] == 0m^2
+    @test pairwise(SqEuclidean(), X, dims=2)[1,2] == 3m^2
+    @test pairwise(WeightedEuclidean(w), X, dims=2)[1,1] == 0m
+    @test pairwise(WeightedEuclidean(w), X, dims=2)[1,2] == sqrt(3)m
+    @test pairwise(WeightedSqEuclidean(w), X, dims=2)[1,1] == 0m^2
+    @test pairwise(WeightedSqEuclidean(w), X, dims=2)[1,2] == 3m^2
+    @test pairwise(Euclidean(), X, Y, dims=2)[1,1] == sqrt(3)m
+    @test pairwise(Euclidean(), X, Y, dims=2)[1,2] == 0m
+    @test pairwise(SqEuclidean(), X, Y, dims=2)[1,1] == 3m^2
+    @test pairwise(SqEuclidean(), X, Y, dims=2)[1,2] == 0m^2
+    @test pairwise(WeightedEuclidean(w), X, Y, dims=2)[1,1] == sqrt(3)m
+    @test pairwise(WeightedEuclidean(w), X, Y, dims=2)[1,2] == 0m
+    @test pairwise(WeightedSqEuclidean(w), X, Y, dims=2)[1,1] == 3m^2
+    @test pairwise(WeightedSqEuclidean(w), X, Y, dims=2)[1,2] == 0m^2
+    @test pairwise(CosineDist(), X, dims=2)[1,1] == 0
+    @test pairwise(CosineDist(), X, dims=2)[1,2] == 1 - dot(x, y) / (norm(x) * norm(y))
+    @test pairwise(CorrDist(), X, dims=2)[1,1] == 0
+    @test pairwise(CorrDist(), X, dims=2)[1,2] == cosine_dist(x .- mean(x), y .- mean(y))
+    # check generic pairwise implementation for one metric
+    @test pairwise(PeriodicEuclidean(p), X, dims=2)[1,1] == 0m
+    @test pairwise(PeriodicEuclidean(p), X, dims=2)[1,2] == sqrt(3)m
+    @test pairwise(PeriodicEuclidean(p), X, Y, dims=2)[1,1] == sqrt(3)m
+    @test pairwise(PeriodicEuclidean(p), X, Y, dims=2)[1,2] == 0m
 end
 
 #=