Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 27 additions & 4 deletions src/statistics/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,40 @@
def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
    """Compute row-wise cosine similarity between two matrices.

    Every row of ``X`` is compared with every row of ``Y``.

    Args:
        X: 2-D array-like whose rows are vectors.
        Y: 2-D array-like whose rows are vectors; must have the same number
            of columns as ``X``.

    Returns:
        An array of shape ``(len(X), len(Y))`` where entry ``[i, j]`` is the
        cosine similarity of ``X[i]`` and ``Y[j]``. Pairs involving a
        zero-norm row, or producing a non-finite value, are reported as
        ``0.0``. If either input is empty, an empty 1-D array is returned.

    Raises:
        ValueError: If ``X`` and ``Y`` have a different number of columns.
    """
    if len(X) == 0 or len(Y) == 0:
        return np.array([])

    # asarray avoids copying inputs that are already ndarrays.
    X = np.asarray(X)
    Y = np.asarray(Y)
    if X.shape[1] != Y.shape[1]:
        raise ValueError(
            f"Number of columns in X and Y must be the same. X has shape {X.shape} "
            f"and Y has shape {Y.shape}."
        )

    # Work in float64 so integer inputs can be normalised; no copy is made
    # when the data is already float64.
    X = X.astype(np.float64, copy=False)
    Y = Y.astype(np.float64, copy=False)
    X_norm = np.linalg.norm(X, axis=1)
    Y_norm = np.linalg.norm(Y, axis=1)

    # Normalise rows before multiplying instead of dividing the product by an
    # (n x m) outer product of norms. Rows with zero norm are skipped by the
    # ``where`` mask and keep the zeros from ``out``, so they yield a
    # similarity of 0 without triggering a division by zero.
    X_unit = np.divide(
        X, X_norm[:, None], out=np.zeros_like(X), where=(X_norm != 0)[:, None]
    )
    Y_unit = np.divide(
        Y, Y_norm[:, None], out=np.zeros_like(Y), where=(Y_norm != 0)[:, None]
    )
    similarity = np.dot(X_unit, Y_unit.T)

    # Non-finite inputs can still propagate NaN/inf; map those to 0 in place.
    np.nan_to_num(similarity, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
    return similarity


Expand Down