Fixed up bottleneck and came up with better conventions for plotting matching, but wasserstein is still broken

ctralie · ctralie · commit cc5c3511638f · 2021-05-23T18:08:02.000-04:00
diff --git a/docs/notebooks/distances.ipynb b/docs/notebooks/distances.ipynb
diff --git a/persim/bottleneck.py b/persim/bottleneck.py
@@ -39,8 +39,11 @@ def bottleneck(dgm1, dgm2, matching=False):
 
     d: float
         bottleneck distance between dgm1 and dgm2
-    (matching, D): Only returns if `matching=True`
-        (tuples of matched indices, (N+M)x(N+M) cross-similarity matrix)
+    matching: ndarray(Mx+Nx, 3), only returned if `matching=True`
+        A list of correspondences in an optimal matching, as well as their distance, where:
+        * First column is index of point in first persistence diagram, or -1 if diagonal
+        * Second column is index of point in second persistence diagram, or -1 if diagonal
+        * Third column is the distance of each matching
     """
 
     return_matching = matching
@@ -83,19 +86,21 @@ def bottleneck(dgm1, dgm2, matching=False):
 
     # Put diagonal elements into the matrix, being mindful that Linfinity
     # balls meet the diagonal line at a diamond vertex
-    D = np.inf*np.ones((M + N, M + N))
-    np.fill_diagonal(D, 0)
+    D = np.zeros((M + N, M + N))
+    # Upper left is Linfinity cross-similarity between two diagrams
     D[0:M, 0:N] = DUL
-    UR = np.max(D) * np.ones((M, M))
+    # Upper right is diagonal matching of points from S
+    UR = np.inf * np.ones((M, M))
     np.fill_diagonal(UR, 0.5 * (S[:, 1] - S[:, 0]))
     D[0:M, N::] = UR
-    UL = np.max(D) * np.ones((N, N))
+    # Lower left is diagonal matching of points from T
+    UL = np.inf * np.ones((N, N))
     np.fill_diagonal(UL, 0.5 * (T[:, 1] - T[:, 0]))
     D[M::, 0:N] = UL
+    # Lower right is all 0s by default (remaining diagonals match to diagonals)
 
     # Step 2: Perform a binary search + Hopcroft Karp to find the
     # bottleneck distance
-    M = D.shape[0]
     ds = np.sort(np.unique(D.flatten()))[0:-1] # Everything but np.inf
     bdist = ds[-1]
     matching = {}
@@ -105,18 +110,29 @@ def bottleneck(dgm1, dgm2, matching=False):
             idx = bisect_left(range(ds.size), int(ds.size / 2))
         d = ds[idx]
         graph = {}
-        for i in range(M):
-            graph["%s" % i] = {j for j in range(M) if D[i, j] <= d}
+        for i in range(D.shape[0]):
+            graph["{}".format(i)] = {j for j in range(D.shape[1]) if D[i, j] <= d}
         res = HopcroftKarp(graph).maximum_matching()
-        if len(res) == 2 * M and d <= bdist:
+        if len(res) == 2 * D.shape[0] and d <= bdist:
             bdist = d
             matching = res
             ds = ds[0:idx]
         else:
             ds = ds[idx + 1::]
 
     if return_matching:
-        matchidx = [(i, matching["%i" % i]) for i in range(M)]
-        return bdist, (matchidx, D)
+        matchidx = []
+        for i in range(M+N):
+            j = matching["{}".format(i)]
+            d = D[i, j]
+            if i < M:
+                if j >= N:
+                    j = -1 # Diagonal match from first persistence diagram
+            else:
+                if j >= N: # Diagonal to diagonal, so don't include this
+                    continue
+                i = -1
+            matchidx.append([i, j, d])
+        return bdist, np.array(matchidx)
     else:
         return bdist
diff --git a/persim/visuals.py b/persim/visuals.py
@@ -168,20 +168,21 @@ def plot_diagrams(
 def plot_a_bar(p, q, c='b', linestyle='-'):
     plt.plot([p[0], q[0]], [p[1], q[1]], c=c, linestyle=linestyle, linewidth=1)
 
-def bottleneck_matching(I1, I2, matchidx, D, labels=["dgm1", "dgm2"], ax=None):
+def bottleneck_matching(dgm1, dgm2, matching, labels=["dgm1", "dgm2"], ax=None):
     """ Visualize bottleneck matching between two diagrams
 
     Parameters
     ===========
 
-    I1: array
-        A diagram
-    I2: array
-        A diagram
-    matchidx: tuples of matched indices
-        if input `matching=True`, then return matching
-    D: array
-        cross-similarity matrix
+    dgm1: Mx(>=2) 
+        array of birth/death pairs for PD 1
+    dgm2: Nx(>=2) 
+        array of birth/death pairs for PD 2
+    matching: ndarray(Mx+Nx, 3)
+        A list of correspondences in an optimal matching, as well as their distance, where:
+        * First column is index of point in first persistence diagram, or -1 if diagonal
+        * Second column is index of point in second persistence diagram, or -1 if diagonal
+        * Third column is the distance of each matching
     labels: list of strings
         names of diagrams for legend. Default = ["dgm1", "dgm2"], 
     ax: matplotlib Axis object
@@ -191,50 +192,61 @@ def bottleneck_matching(I1, I2, matchidx, D, labels=["dgm1", "dgm2"], ax=None):
     Examples
     ==========
 
-    bn_matching, (matchidx, D) = persim.bottleneck(A_h1, B_h1, matching=True)
-    persim.bottleneck_matching(A_h1, B_h1, matchidx, D)
+    dist, matching = persim.bottleneck(A_h1, B_h1, matching=True)
+    persim.bottleneck_matching(A_h1, B_h1, matching)
 
     """
     ax = ax or plt.gca()
 
-    plot_diagrams([I1, I2], labels=labels, ax=ax)
+    plot_diagrams([dgm1, dgm2], labels=labels, ax=ax)
     cp = np.cos(np.pi / 4)
     sp = np.sin(np.pi / 4)
     R = np.array([[cp, -sp], [sp, cp]])
-    if I1.size == 0:
-        I1 = np.array([[0, 0]])
-    if I2.size == 0:
-        I2 = np.array([[0, 0]])
-    I1Rot = I1.dot(R)
-    I2Rot = I2.dot(R)
-    dists = [D[i, j] for (i, j) in matchidx]
-    (i, j) = matchidx[np.argmax(dists)]
-    if i >= I1.shape[0] and j >= I2.shape[0]:
-        return
-    if i >= I1.shape[0]:
-        diagElem = np.array([I2Rot[j, 0], 0])
-        diagElem = diagElem.dot(R.T)
-        plt.plot([I2[j, 0], diagElem[0]], [I2[j, 1], diagElem[1]], "g")
-    elif j >= I2.shape[0]:
-        diagElem = np.array([I1Rot[i, 0], 0])
-        diagElem = diagElem.dot(R.T)
-        ax.plot([I1[i, 0], diagElem[0]], [I1[i, 1], diagElem[1]], "g")
-    else:
-        ax.plot([I1[i, 0], I2[j, 0]], [I1[i, 1], I2[j, 1]], "g")
-
-
-def wasserstein_matching(I1, I2, matchidx, palette=None, labels=["dgm1", "dgm2"], colors=None, ax=None):
+    if dgm1.size == 0:
+        dgm1 = np.array([[0, 0]])
+    if dgm2.size == 0:
+        dgm2 = np.array([[0, 0]])
+    dgm1Rot = dgm1.dot(R)
+    dgm2Rot = dgm2.dot(R)
+    max_idx = np.argmax(matching[:, 2])
+    for idx, [i, j, d] in enumerate(matching):
+        i = int(i)
+        j = int(j)
+        linestyle = '--'
+        linewidth = 1
+        c = 'C2'
+        if idx == max_idx:
+            linestyle = '-'
+            linewidth = 2
+            c = 'C3'
+        if i != -1 or j != -1: # At least one point is a non-diagonal point
+            if i == -1:
+                diagElem = np.array([dgm2Rot[j, 0], 0])
+                diagElem = diagElem.dot(R.T)
+                plt.plot([dgm2[j, 0], diagElem[0]], [dgm2[j, 1], diagElem[1]], c, linewidth=linewidth, linestyle=linestyle)
+            elif j == -1:
+                diagElem = np.array([dgm1Rot[i, 0], 0])
+                diagElem = diagElem.dot(R.T)
+                ax.plot([dgm1[i, 0], diagElem[0]], [dgm1[i, 1], diagElem[1]], c, linewidth=linewidth, linestyle=linestyle)
+            else:
+                ax.plot([dgm1[i, 0], dgm2[j, 0]], [dgm1[i, 1], dgm2[j, 1]], c, linewidth=linewidth, linestyle=linestyle)
+
+
+def wasserstein_matching(dgm1, dgm2, matching, labels=["dgm1", "dgm2"], ax=None):
     """ Visualize bottleneck matching between two diagrams
 
     Parameters
     ===========
 
-    I1: array
+    dgm1: array
         A diagram
-    I2: array
+    dgm2: array
         A diagram
-    matchidx: tuples of matched indices
-        if input `matching=True`, then return matching
+    matching: ndarray(Mx+Nx, 3)
+        A list of correspondences in an optimal matching, as well as their distance, where:
+        * First column is index of point in first persistence diagram, or -1 if diagonal
+        * Second column is index of point in second persistence diagram, or -1 if diagonal
+        * Third column is the distance of each matching
     labels: list of strings
         names of diagrams for legend. Default = ["dgm1", "dgm2"], 
     ax: matplotlib Axis object
@@ -252,25 +264,25 @@ def wasserstein_matching(I1, I2, matchidx, palette=None, labels=["dgm1", "dgm2"]
     cp = np.cos(np.pi / 4)
     sp = np.sin(np.pi / 4)
     R = np.array([[cp, -sp], [sp, cp]])
-    if I1.size == 0:
-        I1 = np.array([[0, 0]])
-    if I2.size == 0:
-        I2 = np.array([[0, 0]])
-    I1Rot = I1.dot(R)
-    I2Rot = I2.dot(R)
-    for index in matchidx:
-        (i, j) = index
-        if i >= I1.shape[0] and j >= I2.shape[0]:
-            continue
-        if i >= I1.shape[0]:
-            diagElem = np.array([I2Rot[j, 0], 0])
-            diagElem = diagElem.dot(R.T)
-            plt.plot([I2[j, 0], diagElem[0]], [I2[j, 1], diagElem[1]], "g")
-        elif j >= I2.shape[0]:
-            diagElem = np.array([I1Rot[i, 0], 0])
-            diagElem = diagElem.dot(R.T)
-            ax.plot([I1[i, 0], diagElem[0]], [I1[i, 1], diagElem[1]], "g")
-        else:
-            ax.plot([I1[i, 0], I2[j, 0]], [I1[i, 1], I2[j, 1]], "g")
-
-    plot_diagrams([I1, I2], labels=labels, ax=ax)
+    if dgm1.size == 0:
+        dgm1 = np.array([[0, 0]])
+    if dgm2.size == 0:
+        dgm2 = np.array([[0, 0]])
+    dgm1Rot = dgm1.dot(R)
+    dgm2Rot = dgm2.dot(R)
+    for [i, j, d] in matching:
+        i = int(i)
+        j = int(j)
+        if i != -1 or j != -1: # At least one point is a non-diagonal point
+            if i == -1:
+                diagElem = np.array([dgm2Rot[j, 0], 0])
+                diagElem = diagElem.dot(R.T)
+                plt.plot([dgm2[j, 0], diagElem[0]], [dgm2[j, 1], diagElem[1]], "g")
+            elif j == -1:
+                diagElem = np.array([dgm1Rot[i, 0], 0])
+                diagElem = diagElem.dot(R.T)
+                ax.plot([dgm1[i, 0], diagElem[0]], [dgm1[i, 1], diagElem[1]], "g")
+            else:
+                ax.plot([dgm1[i, 0], dgm2[j, 0]], [dgm1[i, 1], dgm2[j, 1]], "g")
+
+    plot_diagrams([dgm1, dgm2], labels=labels, ax=ax)
diff --git a/persim/wasserstein.py b/persim/wasserstein.py
@@ -70,7 +70,7 @@ def wasserstein(dgm1, dgm2, matching=False):
     if N == 0:
         T = np.array([[0, 0]])
         N = 1
-    # Step 1: Compute CSM between S and dgm2, including points on diagonal
+    # Compute CSM between S and dgm2, including points on diagonal
     DUL = metrics.pairwise.pairwise_distances(S, T)
 
     # Put diagonal elements into the matrix
@@ -81,21 +81,29 @@ def wasserstein(dgm1, dgm2, matching=False):
     R = np.array([[cp, -sp], [sp, cp]])
     S = S[:, 0:2].dot(R)
     T = T[:, 0:2].dot(R)
-    D = np.zeros((M+N, M+N))
+    D = np.inf*np.ones((M+N, M+N))
+    np.fill_diagonal(D, 0)
     D[0:M, 0:N] = DUL
-    UR = np.max(D)*np.ones((M, M))
+    UR = np.inf*np.ones((M, M))
     np.fill_diagonal(UR, S[:, 1])
     D[0:M, N:N+M] = UR
-    UL = np.max(D)*np.ones((N, N))
+    UL = np.inf*np.ones((N, N))
     np.fill_diagonal(UL, T[:, 1])
     D[M:N+M, 0:N] = UL
+    print(D)
 
     # Step 2: Run the hungarian algorithm
     matchi, matchj = optimize.linear_sum_assignment(D)
     matchdist = np.sum(D[matchi, matchj])
 
     if matching:
         matchidx = [(i, j) for i, j in zip(matchi, matchj)]
-        return matchdist, (matchidx, D)
+        ret = np.zeros((len(matchidx), 3))
+        ret[:, 0:2] = np.array(matchidx)
+        ret[:, 2] = D[matchi, matchj]
+        # Indicate diagonally matched points
+        ret[ret[:, 0] >= M, 0] = -1
+        ret[ret[:, 1] >= N, 1] = -1
+        return matchdist, ret
 
     return matchdist
diff --git a/test/test_distances.py b/test/test_distances.py
@@ -75,14 +75,15 @@ def test_matching(self):
             [1.0, 1.1],
         ])
 
-        d, (m, D) = bottleneck(
+        d, m = bottleneck(
             dgm1, dgm2,
             matching=True
         )
-
-        # These are very loose bounds
-        assert len(m) == len(dgm1) + len(dgm2)
-        assert D.shape == (len(dgm1) + len(dgm2), len(dgm1) + len(dgm2))
+        u1 = np.unique(m[:, 0])
+        u1 = u1[u1 >= 0]
+        u2 = np.unique(m[:, 1])
+        u2 = u2[u2 >= 0]
+        assert u1.size == dgm1.shape[0] and u2.size == dgm2.shape[0]
     
     def test_matching_to_self(self):
         # Matching a diagram to itself should yield 0
diff --git a/test/test_visuals.py b/test/test_visuals.py
@@ -198,8 +198,8 @@ def test_bottleneck_matching(self):
             [0.3, 0.45]
         ])
 
-        d, (matching, D) = persim.bottleneck(dgm1, dgm2, matching=True)
-        persim.bottleneck_matching(dgm1, dgm2, matching, D)
+        d, matching = persim.bottleneck(dgm1, dgm2, matching=True)
+        persim.bottleneck_matching(dgm1, dgm2, matching)
 
     def test_plot_labels(self):
         dgm1 = np.array([
@@ -211,6 +211,6 @@ def test_plot_labels(self):
             [0.3, 0.45]
         ])
 
-        d, (matching, D) = persim.bottleneck(dgm1, dgm2, matching=True)
+        d, matching = persim.bottleneck(dgm1, dgm2, matching=True)
         persim.bottleneck_matching(
-            dgm1, dgm2, matching, D, labels=["X", "Y"])
+            dgm1, dgm2, matching, labels=["X", "Y"])