Remove faiss and cuvs dependencies

daniel-unyi-42 · daniel-unyi-42 · commit e7a3034addd0 · 2025-01-11T17:43:28.000+01:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -43,19 +43,13 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-faiss = [
-    "faiss-cpu>=1.7.0",
-    "faiss-gpu>=1.7.0"
-]
 
-rapids11 = [
-    "cupy-cuda11x",
-    "cuvs-cu11==24.4.*",
+cuda11 = [
+    "cupy-cuda11x"
 ]
 
-rapids12 = [
-    "cupy-cuda12x",
-    "cuvs-cu12==24.4.*",
+cuda12 = [
+    "cupy-cuda12x"
 ]
 
 multiprocessing = ["multiprocessing"]
diff --git a/src/segger/data/utils.py b/src/segger/data/utils.py
@@ -32,13 +32,12 @@ def try_import(module_name):
 # Attempt to import specific modules with try_import function
 try_import("multiprocessing")
 try_import("joblib")
-try_import("faiss")
-try_import("cuvs")
-try:
-    import cupy as cp
-    from cuvs.neighbors import cagra
-except ImportError:
-    print(f"Warning: cupy and/or cuvs are not installed. Please install them to use this functionality.")
+# try_import("cuvs")
+# try:
+#     import cupy as cp
+#     from cuvs.neighbors import cagra
+# except ImportError:
+#     print(f"Warning: cupy and/or cuvs are not installed. Please install them to use this functionality.")
 
 import torch.utils.dlpack as dlpack
 from datetime import timedelta
@@ -270,33 +269,28 @@ def get_edge_index(
     k: int = 5,
     dist: int = 10,
     method: str = "kd_tree",
-    gpu: bool = False,
     workers: int = 1,
 ) -> torch.Tensor:
     """
-    Computes edge indices using various methods (KD-Tree, FAISS, RAPIDS::cuvs+cupy (cuda)).
+    Computes edge indices using KD-Tree.
 
     Parameters:
         coords_1 (np.ndarray): First set of coordinates.
         coords_2 (np.ndarray): Second set of coordinates.
         k (int, optional): Number of nearest neighbors.
         dist (int, optional): Distance threshold.
-        method (str, optional): The method to use ('kd_tree', 'faiss', 'cuda').
-        gpu (bool, optional): Whether to use GPU acceleration (applicable for FAISS).
+        method (str, optional): Neighbor-search method. Only 'kd_tree' is currently supported; any other value raises ValueError.
 
     Returns:
         torch.Tensor: Edge indices.
     """
     if method == "kd_tree":
         return get_edge_index_kdtree(coords_1, coords_2, k=k, dist=dist, workers=workers)
-    elif method == "faiss":
-        return get_edge_index_faiss(coords_1, coords_2, k=k, dist=dist, gpu=gpu)
-    elif method == "cuda":
-        # pass
-        return get_edge_index_cuda(coords_1, coords_2, k=k, dist=dist)
+    # elif method == "cuda":
+    #     return get_edge_index_cuda(coords_1, coords_2, k=k, dist=dist)
     else:
-        msg = f"Unknown method {method}. Valid methods include: 'kd_tree', " "'faiss', and 'cuda'."
-        raise ValueError()
+        msg = f"Unknown method {method}. The only supported method is 'kd_tree' now."
+        raise ValueError(msg)
 
 
 def get_edge_index_kdtree(
@@ -332,104 +326,64 @@ def get_edge_index_kdtree(
     return edge_index
 
 
-def get_edge_index_faiss(
-    coords_1: np.ndarray, coords_2: np.ndarray, k: int = 5, dist: int = 10, gpu: bool = False
-) -> torch.Tensor:
-    """
-    Computes edge indices using FAISS.
-
-    Parameters:
-        coords_1 (np.ndarray): First set of coordinates.
-        coords_2 (np.ndarray): Second set of coordinates.
-        k (int, optional): Number of nearest neighbors.
-        dist (int, optional): Distance threshold.
-        gpu (bool, optional): Whether to use GPU acceleration.
-
-    Returns:
-        torch.Tensor: Edge indices.
-    """
-    coords_1 = np.ascontiguousarray(coords_1, dtype=np.float32)
-    coords_2 = np.ascontiguousarray(coords_2, dtype=np.float32)
-    d = coords_1.shape[1]
-    if gpu:
-        res = faiss.StandardGpuResources()
-        index = faiss.GpuIndexFlatL2(res, d)
-    else:
-        index = faiss.IndexFlatL2(d)
-
-    index.add(coords_1.astype("float32"))
-    D, I = index.search(coords_2.astype("float32"), k)
-
-    valid_mask = D < dist**2
-    edges = []
-
-    for idx, valid in enumerate(valid_mask):
-        valid_indices = I[idx][valid]
-        if valid_indices.size > 0:
-            edges.append(np.vstack((np.full(valid_indices.shape, idx), valid_indices)).T)
-
-    edge_index = torch.tensor(np.vstack(edges), dtype=torch.long).contiguous()
-    return edge_index
-
-
-def get_edge_index_cuda(
-    coords_1: torch.Tensor,
-    coords_2: torch.Tensor,
-    k: int = 10,
-    dist: float = 10.0,
-    metric: str = "sqeuclidean",
-    nn_descent_niter: int = 100,
-) -> torch.Tensor:
-    """
-    Computes edge indices using RAPIDS cuVS with cagra for vector similarity search,
-    with input coordinates as PyTorch tensors on CUDA, using DLPack for conversion.
-
-    Parameters:
-        coords_1 (torch.Tensor): First set of coordinates (query vectors) on CUDA.
-        coords_2 (torch.Tensor): Second set of coordinates (index vectors) on CUDA.
-        k (int, optional): Number of nearest neighbors.
-        dist (float, optional): Distance threshold.
-
-    Returns:
-        torch.Tensor: Edge indices as a PyTorch tensor on CUDA.
-    """
-
-    def cupy_to_torch(cupy_array):
-        return torch.from_dlpack((cupy_array.toDlpack()))
-
-    # gg
-    def torch_to_cupy(tensor):
-        return cp.fromDlpack(dlpack.to_dlpack(tensor))
-
-    # Convert PyTorch tensors (CUDA) to CuPy arrays using DLPack
-    cp_coords_1 = torch_to_cupy(coords_1).astype(cp.float32)
-    cp_coords_2 = torch_to_cupy(coords_2).astype(cp.float32)
-    # Define the distance threshold in CuPy
-    cp_dist = cp.float32(dist)
-    # IndexParams and SearchParams for cagra
-    # compression_params = cagra.CompressionParams(pq_bits=pq_bits)
-    index_params = cagra.IndexParams(
-        metric=metric, nn_descent_niter=nn_descent_niter
-    )  # , compression=compression_params)
-    search_params = cagra.SearchParams()
-    # Build index using CuPy coords
-    try:
-        index = cagra.build(index_params, cp_coords_1)
-    except AttributeError:
-        index = cagra.build_index(index_params, cp_coords_1)
-    # Perform search to get distances and indices (still in CuPy)
-    D, I = cagra.search(search_params, index, cp_coords_2, k)
-    # Boolean mask for filtering distances below the squared threshold (all in CuPy)
-    valid_mask = cp.asarray(D < cp_dist**2)
-    # Vectorized operations for row and valid indices (all in CuPy)
-    repeats = valid_mask.sum(axis=1).tolist()
-    row_indices = cp.repeat(cp.arange(len(cp_coords_2)), repeats)
-    valid_indices = cp.asarray(I)[cp.where(valid_mask)]
-    # Stack row indices with valid indices to form edges
-    edges = cp.vstack((row_indices, valid_indices)).T
-    # Convert the result back to a PyTorch tensor using DLPack
-    edge_index = cupy_to_torch(edges).long().contiguous()
-    return edge_index
+# def get_edge_index_cuda(
+#     coords_1: torch.Tensor,
+#     coords_2: torch.Tensor,
+#     k: int = 10,
+#     dist: float = 10.0,
+#     metric: str = "sqeuclidean",
+#     nn_descent_niter: int = 100,
+# ) -> torch.Tensor:
+#     """
+#     Computes edge indices using RAPIDS cuVS with cagra for vector similarity search,
+#     with input coordinates as PyTorch tensors on CUDA, using DLPack for conversion.
+
+#     Parameters:
+#         coords_1 (torch.Tensor): First set of coordinates (query vectors) on CUDA.
+#         coords_2 (torch.Tensor): Second set of coordinates (index vectors) on CUDA.
+#         k (int, optional): Number of nearest neighbors.
+#         dist (float, optional): Distance threshold.
+
+#     Returns:
+#         torch.Tensor: Edge indices as a PyTorch tensor on CUDA.
+#     """
+
+#     def cupy_to_torch(cupy_array):
+#         return torch.from_dlpack((cupy_array.toDlpack()))
+
+#     # gg
+#     def torch_to_cupy(tensor):
+#         return cp.fromDlpack(dlpack.to_dlpack(tensor))
+
+#     # Convert PyTorch tensors (CUDA) to CuPy arrays using DLPack
+#     cp_coords_1 = torch_to_cupy(coords_1).astype(cp.float32)
+#     cp_coords_2 = torch_to_cupy(coords_2).astype(cp.float32)
+#     # Define the distance threshold in CuPy
+#     cp_dist = cp.float32(dist)
+#     # IndexParams and SearchParams for cagra
+#     # compression_params = cagra.CompressionParams(pq_bits=pq_bits)
+#     index_params = cagra.IndexParams(
+#         metric=metric, nn_descent_niter=nn_descent_niter
+#     )  # , compression=compression_params)
+#     search_params = cagra.SearchParams()
+#     # Build index using CuPy coords
+#     try:
+#         index = cagra.build(index_params, cp_coords_1)
+#     except AttributeError:
+#         index = cagra.build_index(index_params, cp_coords_1)
+#     # Perform search to get distances and indices (still in CuPy)
+#     D, I = cagra.search(search_params, index, cp_coords_2, k)
+#     # Boolean mask for filtering distances below the squared threshold (all in CuPy)
+#     valid_mask = cp.asarray(D < cp_dist**2)
+#     # Vectorized operations for row and valid indices (all in CuPy)
+#     repeats = valid_mask.sum(axis=1).tolist()
+#     row_indices = cp.repeat(cp.arange(len(cp_coords_2)), repeats)
+#     valid_indices = cp.asarray(I)[cp.where(valid_mask)]
+#     # Stack row indices with valid indices to form edges
+#     edges = cp.vstack((row_indices, valid_indices)).T
+#     # Convert the result back to a PyTorch tensor using DLPack
+#     edge_index = cupy_to_torch(edges).long().contiguous()
+#     return edge_index
 
 
 class SpatialTranscriptomicsDataset(InMemoryDataset):
diff --git a/src/segger/prediction/predict_multigpu.py b/src/segger/prediction/predict_multigpu.py
@@ -13,7 +13,6 @@
 from torch_geometric.loader import DataLoader
 from torch_geometric.data import Batch
 from segger.data.utils import (
-    get_edge_index_cuda,
     get_edge_index,
     format_time,
     create_anndata,
diff --git a/src/segger/prediction/predict_parquet.py b/src/segger/prediction/predict_parquet.py
@@ -14,7 +14,6 @@
 from torch_geometric.loader import DataLoader
 from torch_geometric.data import Batch
 from segger.data.utils import (
-    get_edge_index_cuda,
     get_edge_index,
     format_time,
     create_anndata,