Merge branch 'master' of https://github.com/scikit-tda/persim

ctralie · ctralie · commit 6bb756238733 · 2020-01-12T09:22:00.000-05:00
diff --git a/persim/bottleneck.py b/persim/bottleneck.py
@@ -1,6 +1,7 @@
 """
 
-    Implementation of the bottleneck distance
+    Implementation of the bottleneck distance using binary
+    search and the Hopcroft-Karp algorithm
 
     Author: Chris Tralie
 
@@ -10,6 +11,7 @@
 
 from bisect import bisect_left
 from hopcroftkarp import HopcroftKarp
+import warnings
 
 __all__ = ["bottleneck"]
 
@@ -44,12 +46,32 @@ def bottleneck(dgm1, dgm2, matching=False):
     return_matching = matching
 
     S = np.array(dgm1)
-    S = S[np.isfinite(S[:, 1]), :]
+    M = min(S.shape[0], S.size)
+    if S.size > 0:
+        S = S[np.isfinite(S[:, 1]), :]
+        if S.shape[0] < M:
+            warnings.warn(
+                "dgm1 has points with non-finite death times;"+
+                "ignoring those points"
+            )
+            M = S.shape[0]
     T = np.array(dgm2)
-    T = T[np.isfinite(T[:, 1]), :]
-
-    N = S.shape[0]
-    M = T.shape[0]
+    N = min(T.shape[0], T.size)
+    if T.size > 0:
+        T = T[np.isfinite(T[:, 1]), :]
+        if T.shape[0] < N:
+            warnings.warn(
+                "dgm2 has points with non-finite death times;"+
+                "ignoring those points"
+            )
+            N = T.shape[0]
+
+    if M == 0:
+        S = np.array([[0, 0]])
+        M = 1
+    if N == 0:
+        T = np.array([[0, 0]])
+        N = 1
 
     # Step 1: Compute CSM between S and T, including points on diagonal
     # L Infinity distance
@@ -61,18 +83,18 @@ def bottleneck(dgm1, dgm2, matching=False):
 
     # Put diagonal elements into the matrix, being mindful that Linfinity
     # balls meet the diagonal line at a diamond vertex
-    D = np.zeros((N + M, N + M))
-    D[0:N, 0:M] = DUL
-    UR = np.max(D) * np.ones((N, N))
+    D = np.zeros((M + N, M + N))
+    D[0:M, 0:N] = DUL
+    UR = np.max(D) * np.ones((M, M))
     np.fill_diagonal(UR, 0.5 * (S[:, 1] - S[:, 0]))
-    D[0:N, M::] = UR
-    UL = np.max(D) * np.ones((M, M))
+    D[0:M, N::] = UR
+    UL = np.max(D) * np.ones((N, N))
     np.fill_diagonal(UL, 0.5 * (T[:, 1] - T[:, 0]))
-    D[N::, 0:M] = UL
+    D[M::, 0:N] = UL
 
     # Step 2: Perform a binary search + Hopcroft Karp to find the
     # bottleneck distance
-    N = D.shape[0]
+    M = D.shape[0]
     ds = np.sort(np.unique(D.flatten()))
     bdist = ds[-1]
     matching = {}
@@ -82,18 +104,18 @@ def bottleneck(dgm1, dgm2, matching=False):
             idx = bisect_left(range(ds.size), int(ds.size / 2))
         d = ds[idx]
         graph = {}
-        for i in range(N):
-            graph["%s" % i] = {j for j in range(N) if D[i, j] <= d}
+        for i in range(M):
+            graph["%s" % i] = {j for j in range(M) if D[i, j] <= d}
         res = HopcroftKarp(graph).maximum_matching()
-        if len(res) == 2 * N and d <= bdist:
+        if len(res) == 2 * M and d <= bdist:
             bdist = d
             matching = res
             ds = ds[0:idx]
         else:
             ds = ds[idx + 1::]
 
     if return_matching:
-        matchidx = [(i, matching["%i" % i]) for i in range(N)]
+        matchidx = [(i, matching["%i" % i]) for i in range(M)]
         return bdist, (matchidx, D)
     else:
         return bdist
diff --git a/persim/wasserstein.py b/persim/wasserstein.py
@@ -1,6 +1,15 @@
+"""
+
+    Implementation of the Wasserstein distance using
+    the Hungarian algorithm
+
+    Author: Chris Tralie
+
+"""
 import numpy as np
 from sklearn import metrics
 from scipy import optimize
+import warnings
 
 __all__ = ["wasserstein"]
 
@@ -22,7 +31,7 @@ def wasserstein(dgm1, dgm2, matching=False):
     dgm2: Nx(>=2) 
         array of birth/death paris for PD 2
     matching: bool, default False
-        if True, return matching infromation and cross-similarity matrix
+        if True, return matching information and cross-similarity matrix
 
     Returns 
     ---------
@@ -34,34 +43,52 @@ def wasserstein(dgm1, dgm2, matching=False):
 
     """
 
-    # Step 1: Compute CSM between S and dgm2, including points on diagonal
-    N = dgm1.shape[0]
-    M = dgm2.shape[0]
-    # Handle the cases where there are no points in the diagrams
-    if N == 0:
-        dgm1 = np.array([[0, 0]])
-        N = 1
+    S = np.array(dgm1)
+    M = min(S.shape[0], S.size)
+    if S.size > 0:
+        S = S[np.isfinite(S[:, 1]), :]
+        if S.shape[0] < M:
+            warnings.warn(
+                "dgm1 has points with non-finite death times;"+
+                "ignoring those points"
+            )
+            M = S.shape[0]
+    T = np.array(dgm2)
+    N = min(T.shape[0], T.size)
+    if T.size > 0:
+        T = T[np.isfinite(T[:, 1]), :]
+        if T.shape[0] < N:
+            warnings.warn(
+                "dgm2 has points with non-finite death times;"+
+                "ignoring those points"
+            )
+            N = T.shape[0]
+
     if M == 0:
-        dgm2 = np.array([[0, 0]])
+        S = np.array([[0, 0]])
         M = 1
-    DUL = metrics.pairwise.pairwise_distances(dgm1, dgm2)
+    if N == 0:
+        T = np.array([[0, 0]])
+        N = 1
+    # Step 1: Compute CSM between S and T, including points on diagonal
+    DUL = metrics.pairwise.pairwise_distances(S, T)
 
     # Put diagonal elements into the matrix
     # Rotate the diagrams to make it easy to find the straight line
     # distance to the diagonal
     cp = np.cos(np.pi/4)
     sp = np.sin(np.pi/4)
     R = np.array([[cp, -sp], [sp, cp]])
-    dgm1 = dgm1[:, 0:2].dot(R)
-    dgm2 = dgm2[:, 0:2].dot(R)
-    D = np.zeros((N+M, N+M))
-    D[0:N, 0:M] = DUL
-    UR = np.max(D)*np.ones((N, N))
-    np.fill_diagonal(UR, dgm1[:, 1])
-    D[0:N, M:M+N] = UR
-    UL = np.max(D)*np.ones((M, M))
-    np.fill_diagonal(UL, dgm2[:, 1])
-    D[N:M+N, 0:M] = UL
+    S = S[:, 0:2].dot(R)
+    T = T[:, 0:2].dot(R)
+    D = np.zeros((M+N, M+N))
+    D[0:M, 0:N] = DUL
+    UR = np.max(D)*np.ones((M, M))
+    np.fill_diagonal(UR, S[:, 1])
+    D[0:M, N:N+M] = UR
+    UL = np.max(D)*np.ones((N, N))
+    np.fill_diagonal(UL, T[:, 1])
+    D[M:N+M, 0:N] = UL
 
     # Step 2: Run the hungarian algorithm
     matchi, matchj = optimize.linear_sum_assignment(D)
diff --git a/test/test_distances.py b/test/test_distances.py
@@ -106,6 +106,21 @@ def test_2x2_bisect_bug(self):
         dgm2 = np.array([[4, 10], [9, 10]])
         dist = bottleneck(dgm1, dgm2)
         assert dist == 2
+    
+    def test_one_empty(self):
+        dgm1 = np.array([[1, 2]])
+        empty = np.array([[]])
+        dist = bottleneck(dgm1, empty)
+        assert dist == 0.5
+    
+    def test_inf_deathtime(self):
+        dgm = np.array([[1, 2]])
+        empty = np.array([[0, np.inf]])
+        with pytest.warns(UserWarning, match="dgm1 has points with non-finite death") as w:
+            dist1 = bottleneck(empty, dgm)
+        with pytest.warns(UserWarning, match="dgm2 has points with non-finite death") as w:
+            dist2 = bottleneck(dgm, empty)
+        assert (dist1 == 0.5) and (dist2 == 0.5)
 
 class TestWasserstein:
     def test_single(self):
@@ -145,6 +160,21 @@ def test_single_point_same(self):
         dgm = np.array([[0.11371516, 4.45734882]])
         dist = wasserstein(dgm, dgm)
         assert dist == 0
+    
+    def test_one_empty(self):
+        dgm1 = np.array([[1, 2]])
+        empty = np.array([])
+        dist = wasserstein(dgm1, empty)
+        assert np.allclose(dist, np.sqrt(2)/2)
+
+    def test_inf_deathtime(self):
+        dgm = np.array([[1, 2]])
+        empty = np.array([[0, np.inf]])
+        with pytest.warns(UserWarning, match="dgm1 has points with non-finite death") as w:
+            dist1 = wasserstein(empty, dgm)
+        with pytest.warns(UserWarning, match="dgm2 has points with non-finite death") as w:
+            dist2 = wasserstein(dgm, empty)
+        assert (np.allclose(dist1, np.sqrt(2)/2)) and (np.allclose(dist2, np.sqrt(2)/2))
 
 
 class TestSliced: