Commit 6117e63

add metrics

1 parent 4763293 commit 6117e63
1 file changed: 244 additions & 0 deletions
from typing import Literal, Optional, Tuple, Union
import numpy as np
import scipy.sparse as sp
from scipy import stats
from scipy.sparse.csgraph import shortest_path
from graphconstructor import Graph


EdgeWeightMode = Literal["distance", "similarity", "unweighted"]
CorrKind = Literal["spearman", "pearson"]


def edge_jaccard(
    G1: Graph,
    G2: Graph,
    *,
    weighted: bool = False,
) -> float:
    """
    Jaccard similarity between the edge sets of two graphs built on the same node set.

    Parameters
    ----------
    G1, G2 : Graph
        Graphs to compare. Must have the same shape (same node set).
        If undirected, edges are interpreted as unordered pairs and each is counted once.
        If directed, edges are arcs, so (i, j) and (j, i) are distinct.
    weighted : bool, default False
        - If False: binary Jaccard on edge presence/absence,
              J = |E1 ∩ E2| / |E1 ∪ E2|
        - If True: generalized (Tanimoto) Jaccard on edge weights,
              J = sum_{(i,j) ∈ E1∪E2} min(w1, w2) / sum_{(i,j) ∈ E1∪E2} max(w1, w2)
          Missing weights are treated as 0; weights are assumed nonnegative.

    Returns
    -------
    float in [0, 1]
    """
    if G1.adj.shape != G2.adj.shape:
        raise ValueError("Graphs must have the same number of nodes to compare.")

    A = G1.adj
    B = G2.adj

    # For undirected graphs, operate on upper triangles to avoid double counting.
    if not G1.directed and not G2.directed:
        A_use = sp.triu(A, k=1).tocsr()
        B_use = sp.triu(B, k=1).tocsr()
    else:
        A_use = A.tocsr()
        B_use = B.tocsr()

    if not weighted:
        # Binary Jaccard via sparse structure: nnz of the elementwise product
        # counts edges present in both graphs.
        A_bin = A_use.sign()
        B_bin = B_use.sign()
        inter = A_bin.multiply(B_bin).nnz
        # |union| = nnz(A) + nnz(B) - |intersection|
        union = A_bin.nnz + B_bin.nnz - inter
        if union == 0:
            return 1.0  # both graphs empty => identical
        return inter / union

    # Weighted generalized Jaccard (Tanimoto): align the two weight sets on the
    # union of edge coordinates with a sorted two-pointer merge, accumulating
    # sum(min) and sum(max). (scipy.sparse also offers elementwise .minimum()
    # and .maximum(), which would give the same sums; the explicit merge keeps
    # the accumulation transparent.)
    Acoo = A_use.tocoo()
    Bcoo = B_use.tocoo()

    keys_A = np.stack([Acoo.row, Acoo.col], axis=1)
    keys_B = np.stack([Bcoo.row, Bcoo.col], axis=1)

    # Lexicographic (row, col) order so the two edge lists can be merged.
    def _lex_order(k):
        return np.lexsort((k[:, 1], k[:, 0]))

    oA = _lex_order(keys_A) if keys_A.size else np.array([], dtype=int)
    oB = _lex_order(keys_B) if keys_B.size else np.array([], dtype=int)

    keys_A = keys_A[oA] if keys_A.size else keys_A
    keys_B = keys_B[oB] if keys_B.size else keys_B
    vals_A = Acoo.data[oA] if Acoo.nnz else np.array([], dtype=float)
    vals_B = Bcoo.data[oB] if Bcoo.nnz else np.array([], dtype=float)

    iA = iB = 0
    num = 0.0
    den = 0.0
    nA = keys_A.shape[0]
    nB = keys_B.shape[0]

    while iA < nA or iB < nB:
        if iB >= nB or (iA < nA and (keys_A[iA, 0] < keys_B[iB, 0] or
                                     (keys_A[iA, 0] == keys_B[iB, 0] and keys_A[iA, 1] < keys_B[iB, 1]))):
            # Edge in A only: min(w1, 0) = 0 contributes nothing, max(w1, 0) = w1.
            den += float(vals_A[iA])
            iA += 1
        elif iA >= nA or (keys_B[iB, 0] < keys_A[iA, 0] or
                          (keys_B[iB, 0] == keys_A[iA, 0] and keys_B[iB, 1] < keys_A[iA, 1])):
            # Edge in B only: min(0, w2) = 0 contributes nothing, max(0, w2) = w2.
            den += float(vals_B[iB])
            iB += 1
        else:
            # Edge shared by both graphs.
            w1 = float(vals_A[iA])
            w2 = float(vals_B[iB])
            num += min(w1, w2)
            den += max(w1, w2)
            iA += 1
            iB += 1

    if den == 0.0:
        return 1.0  # both graphs empty => identical
    return num / den
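
# Worked example (hypothetical numbers): if G1 has weighted edges
# {(0, 1): 1.0, (1, 2): 2.0} and G2 has {(1, 2): 1.0, (2, 3): 3.0}, then
#   binary:   |E1 ∩ E2| = 1 and |E1 ∪ E2| = 3, so J = 1/3
#   weighted: num = min(2, 1) = 1 and den = 1 + max(2, 1) + 3 = 6, so J = 1/6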


def shortest_path_metric_correlation(
    G: Graph,
    M: Union[np.ndarray, sp.spmatrix],
    *,
    metric_mode: Literal["distance", "similarity"] = "distance",
    edge_weight_mode: EdgeWeightMode = "distance",
    sample_pairs: Optional[int] = None,
    correlation: CorrKind = "spearman",
    random_state: Optional[int] = None,
    similarity_eps: float = 1e-12,
) -> Tuple[float, float, int]:
    """
    Correlate graph shortest-path distances d_G(i, j) with the original metric M(i, j).

    Parameters
    ----------
    G : Graph
        Backbone graph (weighted or unweighted).
    M : array-like (n x n), dense or sparse
        Original metric between nodes. If metric_mode="similarity", M is a similarity
        (larger = closer); if metric_mode="distance", M is a distance (smaller = closer).
    metric_mode : {"distance", "similarity"}, default "distance"
        How to interpret M.
    edge_weight_mode : {"distance", "similarity", "unweighted"}, default "distance"
        How to interpret G.adj weights as shortest-path costs:
        - "distance": use weights as path costs directly.
        - "similarity": use cost = 1 / (w + similarity_eps) on nonzero entries.
        - "unweighted": every present edge costs 1.0.
    sample_pairs : int, optional
        If provided, randomly sample this many unordered pairs (i < j) with finite
        graph distance (i.e. pairs within the same connected component) to estimate
        the correlation. Otherwise use all pairs (upper triangle), which is O(n^2).
    correlation : {"spearman", "pearson"}, default "spearman"
        Which correlation to compute between the flattened vectors.
    random_state : int, optional
        Seed for reproducible pair sampling.
    similarity_eps : float, default 1e-12
        Stabilizer when inverting similarities (both for M and for edge weights).

    Returns
    -------
    (rho, pval, n_pairs_used)
        rho : correlation coefficient
        pval : two-sided p-value from scipy.stats
        n_pairs_used : number of pairs used in the calculation

    Notes
    -----
    - Infinite shortest-path distances (disconnected pairs) and diagonal entries are excluded.
    - Pairs are taken with i < j, regardless of whether G is directed.
    - If M is a similarity, it is converted to distances via 1 / (M + eps) elementwise.
    """
    n = G.n_nodes
    if M.shape[0] != n or M.shape[1] != n:
        raise ValueError("M must be an n x n matrix matching the graph size.")

    # Build the edge-cost matrix for shortest paths.
    if edge_weight_mode == "unweighted":
        cost = G.adj.copy().astype(float)
        cost.data[:] = 1.0
    elif edge_weight_mode == "distance":
        cost = G.adj.copy().astype(float)
    elif edge_weight_mode == "similarity":
        cost = G.adj.copy().astype(float)
        # cost = 1 / (w + eps) on existing edges
        cost.data = 1.0 / (cost.data + similarity_eps)
    else:
        raise ValueError("edge_weight_mode must be 'distance', 'similarity', or 'unweighted'.")

    # Shortest paths on the (possibly directed) graph.
    dG = shortest_path(
        csgraph=cost,
        directed=G.directed,
        return_predecessors=False,
        unweighted=False,
        overwrite=False,
    )

    # Convert M to a dense distance array.
    if sp.issparse(M):
        M_arr = M.toarray()
    else:
        M_arr = np.asarray(M, dtype=float)

    if metric_mode == "distance":
        D = M_arr
    elif metric_mode == "similarity":
        D = 1.0 / (M_arr + similarity_eps)
    else:
        raise ValueError("metric_mode must be 'distance' or 'similarity'.")

    # Compare only i < j (undirected sense) to avoid duplicate pairs, regardless of G.directed.
    iu, ju = np.triu_indices(n, k=1)

    # Mask out infinite graph distances (disconnected pairs); the diagonal is
    # already excluded by k=1 above.
    mask = np.isfinite(dG[iu, ju])
    iu = iu[mask]
    ju = ju[mask]
    if iu.size == 0:
        raise ValueError("No finite shortest paths between any node pairs; graph may be disconnected.")

    # Optional sampling of pairs.
    if sample_pairs is not None and iu.size > sample_pairs:
        rng = np.random.default_rng(random_state)
        sel = rng.choice(iu.size, size=sample_pairs, replace=False)
        iu = iu[sel]
        ju = ju[sel]

    x = D[iu, ju].ravel()
    y = dG[iu, ju].ravel()

    # Drop pairs where M (or its inverse) is NaN or infinite.
    valid = np.isfinite(x) & np.isfinite(y)
    x = x[valid]
    y = y[valid]
    if x.size < 2:
        raise ValueError("Not enough valid pairs to compute correlation.")

    if correlation == "spearman":
        rho, p = stats.spearmanr(x, y)
    elif correlation == "pearson":
        rho, p = stats.pearsonr(x, y)
    else:
        raise ValueError("correlation must be 'spearman' or 'pearson'.")

    return float(rho), float(p), int(x.size)
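

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the metrics API). It fakes the Graph
# interface with SimpleNamespace, since only the .adj, .directed and .n_nodes
# attributes are used above; the real graphconstructor.Graph may differ.
if __name__ == "__main__":
    from types import SimpleNamespace

    rng = np.random.default_rng(0)
    n = 30

    # Symmetric random "original metric" M (distances, zero diagonal).
    X = rng.random((n, n))
    M = (X + X.T) / 2.0
    np.fill_diagonal(M, 0.0)

    def knn_adj(D: np.ndarray, k: int) -> sp.csr_matrix:
        # k-NN-style sparse adjacency from a distance matrix, symmetrized.
        A = sp.lil_matrix((n, n))
        for i in range(n):
            nbrs = np.argsort(D[i])[1:k + 1]  # skip self (distance 0)
            A[i, nbrs] = D[i, nbrs]
        A = A.tocsr()
        return A.maximum(A.T)  # symmetrize

    G1 = SimpleNamespace(adj=knn_adj(M, 5), directed=False, n_nodes=n)
    G2 = SimpleNamespace(adj=knn_adj(M, 8), directed=False, n_nodes=n)

    print("binary edge Jaccard  :", edge_jaccard(G1, G2))
    print("weighted edge Jaccard:", edge_jaccard(G1, G2, weighted=True))

    rho, p, m = shortest_path_metric_correlation(G1, M, edge_weight_mode="distance")
    print(f"spearman rho = {rho:.3f} (p = {p:.2g}, {m} pairs)")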
