Merge pull request #53 from scikit-tda/better_corresp_api

ctralie · web-flow · commit c35fa049de6c · 2021-05-23T18:46:30.000-04:00
Better corresp api
diff --git a/RELEASE.txt b/RELEASE.txt
@@ -1,3 +1,7 @@
+0.3.1
+    - Fixed bug with repeated intervals in bottleneck
+    - Tidied up API for indicating matchings for bottleneck and wasserstein, and updated notebook
+
 0.3.0
     - Add implementations of Persistence Landscapes, including plotting methods, a transformer, and additional notebooks.
 
diff --git a/docs/notebooks/distances.ipynb b/docs/notebooks/distances.ipynb
diff --git a/persim/_version.py b/persim/_version.py
@@ -1 +1 @@
-__version__ = "0.3.0"
+__version__ = "0.3.1"
diff --git a/persim/bottleneck.py b/persim/bottleneck.py
@@ -39,8 +39,11 @@ def bottleneck(dgm1, dgm2, matching=False):
 
     d: float
         bottleneck distance between dgm1 and dgm2
-    (matching, D): Only returns if `matching=True`
-        (tuples of matched indices, (N+M)x(N+M) cross-similarity matrix)
+    matching: ndarray(Mx+Nx, 3), only returned if `matching=True`
+        A list of correspondences in an optimal matching, as well as their distance, where:
+        * First column is index of point in first persistence diagram, or -1 if diagonal
+        * Second column is index of point in second persistence diagram, or -1 if diagonal
+        * Third column is the distance of each matching
     """
 
     return_matching = matching
@@ -84,18 +87,21 @@ def bottleneck(dgm1, dgm2, matching=False):
     # Put diagonal elements into the matrix, being mindful that Linfinity
     # balls meet the diagonal line at a diamond vertex
     D = np.zeros((M + N, M + N))
+    # Upper left is Linfinity cross-similarity between two diagrams
     D[0:M, 0:N] = DUL
-    UR = np.max(D) * np.ones((M, M))
+    # Upper right is diagonal matching of points from S
+    UR = np.inf * np.ones((M, M))
     np.fill_diagonal(UR, 0.5 * (S[:, 1] - S[:, 0]))
     D[0:M, N::] = UR
-    UL = np.max(D) * np.ones((N, N))
+    # Lower left is diagonal matching of points from T
+    UL = np.inf * np.ones((N, N))
     np.fill_diagonal(UL, 0.5 * (T[:, 1] - T[:, 0]))
     D[M::, 0:N] = UL
+    # Lower right is all 0s by default (remaining diagonals match to diagonals)
 
     # Step 2: Perform a binary search + Hopcroft Karp to find the
     # bottleneck distance
-    M = D.shape[0]
-    ds = np.sort(np.unique(D.flatten()))
+    ds = np.sort(np.unique(D.flatten()))[0:-1] # Everything but np.inf
     bdist = ds[-1]
     matching = {}
     while len(ds) >= 1:
@@ -104,18 +110,29 @@ def bottleneck(dgm1, dgm2, matching=False):
             idx = bisect_left(range(ds.size), int(ds.size / 2))
         d = ds[idx]
         graph = {}
-        for i in range(M):
-            graph["%s" % i] = {j for j in range(M) if D[i, j] <= d}
+        for i in range(D.shape[0]):
+            graph["{}".format(i)] = {j for j in range(D.shape[1]) if D[i, j] <= d}
         res = HopcroftKarp(graph).maximum_matching()
-        if len(res) == 2 * M and d <= bdist:
+        if len(res) == 2 * D.shape[0] and d <= bdist:
             bdist = d
             matching = res
             ds = ds[0:idx]
         else:
             ds = ds[idx + 1::]
 
     if return_matching:
-        matchidx = [(i, matching["%i" % i]) for i in range(M)]
-        return bdist, (matchidx, D)
+        matchidx = []
+        for i in range(M+N):
+            j = matching["{}".format(i)]
+            d = D[i, j]
+            if i < M:
+                if j >= N:
+                    j = -1 # Diagonal match from first persistence diagram
+            else:
+                if j >= N: # Diagonal to diagonal, so don't include this
+                    continue
+                i = -1
+            matchidx.append([i, j, d])
+        return bdist, np.array(matchidx)
     else:
         return bdist
diff --git a/persim/visuals.py b/persim/visuals.py
@@ -168,20 +168,21 @@ def plot_diagrams(
 def plot_a_bar(p, q, c='b', linestyle='-'):
     plt.plot([p[0], q[0]], [p[1], q[1]], c=c, linestyle=linestyle, linewidth=1)
 
-def bottleneck_matching(I1, I2, matchidx, D, labels=["dgm1", "dgm2"], ax=None):
+def bottleneck_matching(dgm1, dgm2, matching, labels=["dgm1", "dgm2"], ax=None):
     """ Visualize bottleneck matching between two diagrams
 
     Parameters
     ===========
 
-    I1: array
-        A diagram
-    I2: array
-        A diagram
-    matchidx: tuples of matched indices
-        if input `matching=True`, then return matching
-    D: array
-        cross-similarity matrix
+    dgm1: Mx(>=2) 
+        array of birth/death pairs for PD 1
+    dgm2: Nx(>=2) 
+        array of birth/death pairs for PD 2
+    matching: ndarray(Mx+Nx, 3)
+        A list of correspondences in an optimal matching, as well as their distance, where:
+        * First column is index of point in first persistence diagram, or -1 if diagonal
+        * Second column is index of point in second persistence diagram, or -1 if diagonal
+        * Third column is the distance of each matching
     labels: list of strings
         names of diagrams for legend. Default = ["dgm1", "dgm2"], 
     ax: matplotlib Axis object
@@ -191,50 +192,61 @@ def bottleneck_matching(I1, I2, matchidx, D, labels=["dgm1", "dgm2"], ax=None):
     Examples
     ==========
 
-    bn_matching, (matchidx, D) = persim.bottleneck(A_h1, B_h1, matching=True)
-    persim.bottleneck_matching(A_h1, B_h1, matchidx, D)
+    dist, matching = persim.bottleneck(A_h1, B_h1, matching=True)
+    persim.bottleneck_matching(A_h1, B_h1, matching)
 
     """
     ax = ax or plt.gca()
 
-    plot_diagrams([I1, I2], labels=labels, ax=ax)
+    plot_diagrams([dgm1, dgm2], labels=labels, ax=ax)
     cp = np.cos(np.pi / 4)
     sp = np.sin(np.pi / 4)
     R = np.array([[cp, -sp], [sp, cp]])
-    if I1.size == 0:
-        I1 = np.array([[0, 0]])
-    if I2.size == 0:
-        I2 = np.array([[0, 0]])
-    I1Rot = I1.dot(R)
-    I2Rot = I2.dot(R)
-    dists = [D[i, j] for (i, j) in matchidx]
-    (i, j) = matchidx[np.argmax(dists)]
-    if i >= I1.shape[0] and j >= I2.shape[0]:
-        return
-    if i >= I1.shape[0]:
-        diagElem = np.array([I2Rot[j, 0], 0])
-        diagElem = diagElem.dot(R.T)
-        plt.plot([I2[j, 0], diagElem[0]], [I2[j, 1], diagElem[1]], "g")
-    elif j >= I2.shape[0]:
-        diagElem = np.array([I1Rot[i, 0], 0])
-        diagElem = diagElem.dot(R.T)
-        ax.plot([I1[i, 0], diagElem[0]], [I1[i, 1], diagElem[1]], "g")
-    else:
-        ax.plot([I1[i, 0], I2[j, 0]], [I1[i, 1], I2[j, 1]], "g")
-
-
-def wasserstein_matching(I1, I2, matchidx, palette=None, labels=["dgm1", "dgm2"], colors=None, ax=None):
+    if dgm1.size == 0:
+        dgm1 = np.array([[0, 0]])
+    if dgm2.size == 0:
+        dgm2 = np.array([[0, 0]])
+    dgm1Rot = dgm1.dot(R)
+    dgm2Rot = dgm2.dot(R)
+    max_idx = np.argmax(matching[:, 2])
+    for idx, [i, j, d] in enumerate(matching):
+        i = int(i)
+        j = int(j)
+        linestyle = '--'
+        linewidth = 1
+        c = 'C2'
+        if idx == max_idx:
+            linestyle = '-'
+            linewidth = 2
+            c = 'C3'
+        if i != -1 or j != -1: # At least one point is a non-diagonal point
+            if i == -1:
+                diagElem = np.array([dgm2Rot[j, 0], 0])
+                diagElem = diagElem.dot(R.T)
+                plt.plot([dgm2[j, 0], diagElem[0]], [dgm2[j, 1], diagElem[1]], c, linewidth=linewidth, linestyle=linestyle)
+            elif j == -1:
+                diagElem = np.array([dgm1Rot[i, 0], 0])
+                diagElem = diagElem.dot(R.T)
+                ax.plot([dgm1[i, 0], diagElem[0]], [dgm1[i, 1], diagElem[1]], c, linewidth=linewidth, linestyle=linestyle)
+            else:
+                ax.plot([dgm1[i, 0], dgm2[j, 0]], [dgm1[i, 1], dgm2[j, 1]], c, linewidth=linewidth, linestyle=linestyle)
+
+
+def wasserstein_matching(dgm1, dgm2, matching, labels=["dgm1", "dgm2"], ax=None):
     """ Visualize bottleneck matching between two diagrams
 
     Parameters
     ===========
 
-    I1: array
+    dgm1: array
         A diagram
-    I2: array
+    dgm2: array
         A diagram
-    matchidx: tuples of matched indices
-        if input `matching=True`, then return matching
+    matching: ndarray(Mx+Nx, 3)
+        A list of correspondences in an optimal matching, as well as their distance, where:
+        * First column is index of point in first persistence diagram, or -1 if diagonal
+        * Second column is index of point in second persistence diagram, or -1 if diagonal
+        * Third column is the distance of each matching
     labels: list of strings
         names of diagrams for legend. Default = ["dgm1", "dgm2"], 
     ax: matplotlib Axis object
@@ -252,25 +264,25 @@ def wasserstein_matching(I1, I2, matchidx, palette=None, labels=["dgm1", "dgm2"]
     cp = np.cos(np.pi / 4)
     sp = np.sin(np.pi / 4)
     R = np.array([[cp, -sp], [sp, cp]])
-    if I1.size == 0:
-        I1 = np.array([[0, 0]])
-    if I2.size == 0:
-        I2 = np.array([[0, 0]])
-    I1Rot = I1.dot(R)
-    I2Rot = I2.dot(R)
-    for index in matchidx:
-        (i, j) = index
-        if i >= I1.shape[0] and j >= I2.shape[0]:
-            continue
-        if i >= I1.shape[0]:
-            diagElem = np.array([I2Rot[j, 0], 0])
-            diagElem = diagElem.dot(R.T)
-            plt.plot([I2[j, 0], diagElem[0]], [I2[j, 1], diagElem[1]], "g")
-        elif j >= I2.shape[0]:
-            diagElem = np.array([I1Rot[i, 0], 0])
-            diagElem = diagElem.dot(R.T)
-            ax.plot([I1[i, 0], diagElem[0]], [I1[i, 1], diagElem[1]], "g")
-        else:
-            ax.plot([I1[i, 0], I2[j, 0]], [I1[i, 1], I2[j, 1]], "g")
-
-    plot_diagrams([I1, I2], labels=labels, ax=ax)
+    if dgm1.size == 0:
+        dgm1 = np.array([[0, 0]])
+    if dgm2.size == 0:
+        dgm2 = np.array([[0, 0]])
+    dgm1Rot = dgm1.dot(R)
+    dgm2Rot = dgm2.dot(R)
+    for [i, j, d] in matching:
+        i = int(i)
+        j = int(j)
+        if i != -1 or j != -1: # At least one point is a non-diagonal point
+            if i == -1:
+                diagElem = np.array([dgm2Rot[j, 0], 0])
+                diagElem = diagElem.dot(R.T)
+                plt.plot([dgm2[j, 0], diagElem[0]], [dgm2[j, 1], diagElem[1]], "g")
+            elif j == -1:
+                diagElem = np.array([dgm1Rot[i, 0], 0])
+                diagElem = diagElem.dot(R.T)
+                ax.plot([dgm1[i, 0], diagElem[0]], [dgm1[i, 1], diagElem[1]], "g")
+            else:
+                ax.plot([dgm1[i, 0], dgm2[j, 0]], [dgm1[i, 1], dgm2[j, 1]], "g")
+
+    plot_diagrams([dgm1, dgm2], labels=labels, ax=ax)
diff --git a/persim/wasserstein.py b/persim/wasserstein.py
@@ -70,7 +70,7 @@ def wasserstein(dgm1, dgm2, matching=False):
     if N == 0:
         T = np.array([[0, 0]])
         N = 1
-    # Step 1: Compute CSM between S and dgm2, including points on diagonal
+    # Compute CSM between S and dgm2, including points on diagonal
     DUL = metrics.pairwise.pairwise_distances(S, T)
 
     # Put diagonal elements into the matrix
@@ -82,11 +82,12 @@ def wasserstein(dgm1, dgm2, matching=False):
     S = S[:, 0:2].dot(R)
     T = T[:, 0:2].dot(R)
     D = np.zeros((M+N, M+N))
+    np.fill_diagonal(D, 0)
     D[0:M, 0:N] = DUL
-    UR = np.max(D)*np.ones((M, M))
+    UR = np.inf*np.ones((M, M))
     np.fill_diagonal(UR, S[:, 1])
     D[0:M, N:N+M] = UR
-    UL = np.max(D)*np.ones((N, N))
+    UL = np.inf*np.ones((N, N))
     np.fill_diagonal(UL, T[:, 1])
     D[M:N+M, 0:N] = UL
 
@@ -96,6 +97,14 @@ def wasserstein(dgm1, dgm2, matching=False):
 
     if matching:
         matchidx = [(i, j) for i, j in zip(matchi, matchj)]
-        return matchdist, (matchidx, D)
+        ret = np.zeros((len(matchidx), 3))
+        ret[:, 0:2] = np.array(matchidx)
+        ret[:, 2] = D[matchi, matchj]
+        # Indicate diagonally matched points
+        ret[ret[:, 0] >= M, 0] = -1
+        ret[ret[:, 1] >= N, 1] = -1
+        # Exclude diagonal to diagonal
+        ret = ret[ret[:, 0] + ret[:, 1] != -2, :] 
+        return matchdist, ret
 
     return matchdist
diff --git a/test/test_distances.py b/test/test_distances.py
@@ -61,9 +61,7 @@ def test_different_size(self):
                 [0.5, 1.1]
             ])
         )
-
-        # These are very loose bounds
-        assert d == pytest.approx(0.1, 0.001)
+        assert d == 0.25
 
     def test_matching(self):
         dgm1 = np.array([
@@ -77,14 +75,15 @@ def test_matching(self):
             [1.0, 1.1],
         ])
 
-        d, (m, D) = bottleneck(
+        d, m = bottleneck(
             dgm1, dgm2,
             matching=True
         )
-
-        # These are very loose bounds
-        assert len(m) == len(dgm1) + len(dgm2)
-        assert D.shape == (len(dgm1) + len(dgm2), len(dgm1) + len(dgm2))
+        u1 = np.unique(m[:, 0])
+        u1 = u1[u1 >= 0]
+        u2 = np.unique(m[:, 1])
+        u2 = u2[u2 >= 0]
+        assert u1.size == dgm1.shape[0] and u2.size == dgm2.shape[0]
     
     def test_matching_to_self(self):
         # Matching a diagram to itself should yield 0
@@ -122,6 +121,13 @@ def test_inf_deathtime(self):
             dist2 = bottleneck(dgm, empty)
         assert (dist1 == 0.5) and (dist2 == 0.5)
 
+    def test_repeated(self):
+        # Issue #44
+        G = np.array([[ 0, 1], [0,1]])
+        H = np.array([[ 0, 1]])
+        dist = bottleneck(G, H)
+        assert dist == 0.5
+
 class TestWasserstein:
     def test_single(self):
         d = wasserstein(
@@ -175,6 +181,34 @@ def test_inf_deathtime(self):
         with pytest.warns(UserWarning, match="dgm2 has points with non-finite death") as w:
             dist2 = wasserstein(dgm, empty)
         assert (np.allclose(dist1, np.sqrt(2)/2)) and (np.allclose(dist2, np.sqrt(2)/2))
+    
+    def test_repeated(self):
+        dgm1 = np.array([[0, 10], [0,10]])
+        dgm2 = np.array([[0, 10]])
+        dist = wasserstein(dgm1, dgm2)
+        assert dist == 5*np.sqrt(2)
+
+    def test_matching(self):
+        dgm1 = np.array([
+            [0.5, 1],
+            [0.6, 1.1]
+        ])
+        dgm2 = np.array([
+            [0.5, 1.1],
+            [0.6, 1.1],
+            [0.8, 1.1],
+            [1.0, 1.1],
+        ])
+
+        d, m = wasserstein(
+            dgm1, dgm2,
+            matching=True
+        )
+        u1 = np.unique(m[:, 0])
+        u1 = u1[u1 >= 0]
+        u2 = np.unique(m[:, 1])
+        u2 = u2[u2 >= 0]
+        assert u1.size == dgm1.shape[0] and u2.size == dgm2.shape[0]
 
 
 class TestSliced:
diff --git a/test/test_visuals.py b/test/test_visuals.py
@@ -198,8 +198,8 @@ def test_bottleneck_matching(self):
             [0.3, 0.45]
         ])
 
-        d, (matching, D) = persim.bottleneck(dgm1, dgm2, matching=True)
-        persim.bottleneck_matching(dgm1, dgm2, matching, D)
+        d, matching = persim.bottleneck(dgm1, dgm2, matching=True)
+        persim.bottleneck_matching(dgm1, dgm2, matching)
 
     def test_plot_labels(self):
         dgm1 = np.array([
@@ -211,6 +211,6 @@ def test_plot_labels(self):
             [0.3, 0.45]
         ])
 
-        d, (matching, D) = persim.bottleneck(dgm1, dgm2, matching=True)
+        d, matching = persim.bottleneck(dgm1, dgm2, matching=True)
         persim.bottleneck_matching(
-            dgm1, dgm2, matching, D, labels=["X", "Y"])
+            dgm1, dgm2, matching, labels=["X", "Y"])

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.3.0"`
	`1`	`+__version__ = "0.3.1"`