diff --git a/src/statistics/similarity.py b/src/statistics/similarity.py
index 0371c82..8371caf 100644
--- a/src/statistics/similarity.py
+++ b/src/statistics/similarity.py
@@ -10,17 +10,23 @@
 def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
     if len(X) == 0 or len(Y) == 0:
         return np.array([])
-    X = np.array(X)
-    Y = np.array(Y)
+    # Coerce X and Y to float64 arrays; np.asarray skips the copy when the input is already a float64 ndarray
+    X = np.asarray(X, dtype=np.float64)
+    Y = np.asarray(Y, dtype=np.float64)
     if X.shape[1] != Y.shape[1]:
         raise ValueError(
             f"Number of columns in X and Y must be the same. X has shape {X.shape} "
             f"and Y has shape {Y.shape}."
         )
-    X_norm = np.linalg.norm(X, axis=1)
-    Y_norm = np.linalg.norm(Y, axis=1)
-    similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
-    similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
+    # For 2-D inputs, keepdims=True keeps the row norms as (n, 1) column vectors for the matrix product below
+    X_norm = np.linalg.norm(X, axis=1, keepdims=True)
+    Y_norm = np.linalg.norm(Y, axis=1, keepdims=True)
+    # Denominator: every X-row norm times every Y-row norm (an outer product written as a matmul)
+    denom = X_norm @ Y_norm.T
+    # Zero-norm rows give 0/0: silence the NumPy warnings, then overwrite every non-finite entry with 0.0 in place
+    with np.errstate(divide="ignore", invalid="ignore"):
+        similarity = np.dot(X, Y.T) / denom
+        np.copyto(similarity, 0.0, where=~np.isfinite(similarity))
     return similarity
 
 
@@ -33,11 +39,16 @@ def cosine_similarity_top_k(
     if len(X) == 0 or len(Y) == 0:
         return [], []
     score_array = cosine_similarity(X, Y)
-    sorted_idxs = score_array.flatten().argsort()[::-1]
+    flat_scores = (
+        score_array.flatten()
+    )  # flatten() returns a copy, so flatten once and reuse the result below
+    sorted_idxs = flat_scores.argsort()[
+        ::-1
+    ]  # argsort is ascending; the reversed view puts the highest scores first
     top_k = top_k or len(sorted_idxs)
     top_idxs = sorted_idxs[:top_k]
     score_threshold = score_threshold or -1.0
-    top_idxs = top_idxs[score_array.flatten()[top_idxs] > score_threshold]
+    top_idxs = top_idxs[flat_scores[top_idxs] > score_threshold]
     ret_idxs = [(x // score_array.shape[1], x % score_array.shape[1]) for x in top_idxs]
-    scores = score_array.flatten()[top_idxs].tolist()
+    scores = flat_scores[top_idxs].tolist()
     return ret_idxs, scores