Add faiss-gpu (#22)

Zethson · scverse-bot · web-flow · commit 8750e794891d · 2025-11-17T22:18:49.000+01:00
Signed-off-by: Lukas Heumos <lukas.heumos@posteo.net>
Co-authored-by: scverse-bot <108668866+scverse-bot@users.noreply.github.com>
diff --git a/.github/workflows/test-cpu.yaml b/.github/workflows/test-cpu.yaml
@@ -26,7 +26,7 @@ jobs:
     outputs:
       envs: ${{ steps.get-envs.outputs.envs }}
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
         with:
           filter: blob:none
           fetch-depth: 0
diff --git a/.github/workflows/test-gpu.yaml b/.github/workflows/test-gpu.yaml
@@ -42,15 +42,15 @@ jobs:
       - name: Install Python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.13"
+          python-version: "3.12"
 
       - name: Install uv
         uses: astral-sh/setup-uv@v7
         with:
           cache-dependency-glob: pyproject.toml
 
       - name: Install fknni
-        run: uv pip install --system -e ".[test]"
+        run: uv pip install --system -e ".[test,faissgpu]"
       - name: Pip list
         run: pip list
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,7 +21,6 @@ classifiers = [
   "Programming Language :: Python :: 3.13",
 ]
 dependencies = [
-  "faiss-cpu",
   "lamin-utils",
   "pandas",
   "scikit-learn",
@@ -45,6 +44,8 @@ optional-dependencies.doc = [
   "sphinxcontrib-bibtex>=1",
   "sphinxext-opengraph",
 ]
+optional-dependencies.faisscpu = [ "faiss-cpu" ]
+optional-dependencies.faissgpu = [ "faiss-gpu-cu12" ]
 optional-dependencies.rapids12 = [
   "cudf-cu12>=25.10",
   "cugraph-cu12>=25.10",
@@ -88,7 +89,7 @@ deps = [ "pre" ]
 python = [ "3.13" ]
 
 [tool.hatch.envs.hatch-test]
-features = [ "dev", "test" ]
+features = [ "dev", "test", "faisscpu" ]
 
 [tool.hatch.envs.hatch-test.overrides]
 # If the matrix variable `deps` is set to "pre",
@@ -142,6 +143,7 @@ testpaths = [ "tests" ]
 xfail_strict = true
 addopts = [
   "--import-mode=importlib", # allow using test files with same name
+  "-m not gpu",
 ]
 markers = [
   "gpu: mark test to run on GPU",
diff --git a/src/fknni/faiss/faiss.py b/src/fknni/faiss/faiss.py
@@ -9,6 +9,13 @@
 from numpy import dtype
 from sklearn.base import BaseEstimator, TransformerMixin
 
+try:
+    import faiss
+
+    HAS_FAISS_GPU = hasattr(faiss, "StandardGpuResources")
+except ImportError:
+    raise ImportError("faiss-cpu or faiss-gpu required") from None
+
 
 class FaissImputer(BaseEstimator, TransformerMixin):
     """Imputer for completing missing values using Faiss, incorporating weighted averages based on distance."""
@@ -23,6 +30,7 @@ def __init__(
         index_factory: str = "Flat",
         min_data_ratio: float = 0.25,
         temporal_mode: Literal["flatten", "per_variable"] = "flatten",
+        use_gpu: bool = False,
     ):
         """Initializes FaissImputer with specified parameters that are used for the imputation.
 
@@ -39,6 +47,7 @@ def __init__(
             temporal_mode: How to handle 3D temporal data. 'flatten' treats all (variable, timestep) pairs as
                        independent features (fast but allows temporal leakage).
                        'per_variable' imputes each variable independently across time (slower but respects temporal causality).
+            use_gpu: Whether to move the Faiss index to the GPU (device 0) before training; requires a GPU-enabled faiss build.
         """
         if n_neighbors < 1:
             raise ValueError("n_neighbors must be at least 1.")
@@ -47,6 +56,10 @@ def __init__(
         if temporal_mode not in {"flatten", "per_variable"}:
             raise ValueError("Unknown temporal_mode. Choose one of 'flatten', 'per_variable'")
 
+        self.use_gpu = use_gpu
+        if use_gpu and not HAS_FAISS_GPU:
+            raise ValueError("use_gpu=True requires faiss-gpu package, install with: pip install faiss-gpu") from None
+
         self.missing_values = missing_values
         self.n_neighbors = n_neighbors
         self.metric = metric
@@ -236,6 +249,11 @@ def _features_indices_sorted_descending_on_nan(self) -> list[int]:
     def _train(self, x_train: np.ndarray) -> faiss.Index:
         index = faiss.index_factory(x_train.shape[1], self.index_factory)
         index.metric_type = faiss.METRIC_L2 if self.metric == "l2" else faiss.METRIC_INNER_PRODUCT
+
+        if self.use_gpu:
+            res = faiss.StandardGpuResources()
+            index = faiss.index_cpu_to_gpu(res, 0, index)
+
         index.train(x_train)
         index.add(x_train)
         return index
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/compare_predictions.py b/tests/compare_predictions.py
@@ -0,0 +1,46 @@
+import numpy as np
+
+
+def _are_ndarrays_equal(arr1: np.ndarray, arr2: np.ndarray) -> np.bool_:
+    """Check if two arrays are equal member-wise.
+
+    Note: Two NaN are considered equal.
+
+    Args:
+        arr1: First array to compare
+        arr2: Second array to compare
+
+    Returns:
+        True if the two arrays are equal member-wise
+    """
+    return np.all(np.equal(arr1, arr2, dtype=object) | ((arr1 != arr1) & (arr2 != arr2)))
+
+
+def _base_check_imputation(
+    data_original: np.ndarray,
+    data_imputed: np.ndarray,
+):
+    """Provides the following base checks:
+    - Imputation doesn't leave any NaN behind
+    - Imputation doesn't modify any data that wasn't NaN
+
+    Args:
+        data_original: Dataset before imputation
+        data_imputed: Dataset after imputation
+
+    Raises:
+        AssertionError: If any of the checks fail.
+    """
+    if data_original.shape != data_imputed.shape:
+        raise AssertionError("The shapes of the two datasets do not match")
+
+    # Ensure no NaN remains in the imputed dataset
+    if np.isnan(data_imputed).any():
+        raise AssertionError("NaN found in imputed columns of layer_after.")
+
+    # Ensure imputation does not alter non-NaN values in the imputed columns
+    imputed_non_nan_mask = ~np.isnan(data_original)
+    if not _are_ndarrays_equal(data_original[imputed_non_nan_mask], data_imputed[imputed_non_nan_mask]):
+        raise AssertionError("Non-NaN values in imputed columns were modified.")
+
+    return
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,19 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+
+@pytest.fixture
+def rng():
+    return np.random.default_rng(0)
+
+
+@pytest.fixture
+def simple_test_df(rng):
+    data = pd.DataFrame(rng.integers(0, 100, size=(10, 5)), columns=list("ABCDE"))
+    data_missing = data.copy()
+    indices = [(i, j) for i in range(data.shape[0]) for j in range(data.shape[1])]
+    rng.shuffle(indices)
+    for i, j in indices[:5]:
+        data_missing.iat[i, j] = np.nan
+    return data.to_numpy(), data_missing.to_numpy()
diff --git a/tests/cpu/conftest.py b/tests/cpu/conftest.py
diff --git a/tests/cpu/test_faiss_imputation.py b/tests/cpu/test_faiss_imputation.py
@@ -1,22 +1,11 @@
 import numpy as np
-import pandas as pd
 import pytest
 from sklearn.datasets import make_regression
+from tests.compare_predictions import _base_check_imputation
 
 from fknni.faiss.faiss import FaissImputer
 
 
-@pytest.fixture
-def simple_test_df(rng):
-    data = pd.DataFrame(rng.integers(0, 100, size=(10, 5)), columns=list("ABCDE"))
-    data_missing = data.copy()
-    indices = [(i, j) for i in range(data.shape[0]) for j in range(data.shape[1])]
-    rng.shuffle(indices)
-    for i, j in indices[:5]:
-        data_missing.iat[i, j] = np.nan
-    return data.to_numpy(), data_missing.to_numpy()
-
-
 @pytest.fixture
 def regression_dataset(rng):
     X, y = make_regression(n_samples=100, n_features=20, random_state=42)
@@ -28,36 +17,6 @@ def regression_dataset(rng):
     return X, X_missing, y
 
 
-def _base_check_imputation(
-    data_original: np.ndarray,
-    data_imputed: np.ndarray,
-):
-    """Provides the following base checks:
-    - Imputation doesn't leave any NaN behind
-    - Imputation doesn't modify any data that wasn't NaN
-
-    Args:
-        data_before_imputation: Dataset before imputation
-        data_after_imputation: Dataset after imputation
-
-    Raises:
-        AssertionError: If any of the checks fail.
-    """
-    if data_original.shape != data_imputed.shape:
-        raise AssertionError("The shapes of the two datasets do not match")
-
-    # Ensure no NaN remains in the imputed dataset
-    if np.isnan(data_imputed).any():
-        raise AssertionError("NaN found in imputed columns of layer_after.")
-
-    # Ensure imputation does not alter non-NaN values in the imputed columns
-    imputed_non_nan_mask = ~np.isnan(data_original)
-    if not _are_ndarrays_equal(data_original[imputed_non_nan_mask], data_imputed[imputed_non_nan_mask]):
-        raise AssertionError("Non-NaN values in imputed columns were modified.")
-
-    return
-
-
 def test_median_imputation(simple_test_df):
     """Tests if median imputation successfully fills all NaN values"""
     data, data_missing = simple_test_df
@@ -222,18 +181,3 @@ def test_invalid_temporal_mode():
     """Tests if imputer raises error for invalid temporal_mode"""
     with pytest.raises(ValueError):
         FaissImputer(temporal_mode="invalid")
-
-
-def _are_ndarrays_equal(arr1: np.ndarray, arr2: np.ndarray) -> np.bool_:
-    """Check if two arrays are equal member-wise.
-
-    Note: Two NaN are considered equal.
-
-    Args:
-        arr1: First array to compare
-        arr2: Second array to compare
-
-    Returns:
-        True if the two arrays are equal member-wise
-    """
-    return np.all(np.equal(arr1, arr2, dtype=object) | ((arr1 != arr1) & (arr2 != arr2)))
diff --git a/tests/gpu/test_gpu.py b/tests/gpu/test_gpu.py
@@ -1,6 +1,13 @@
 import pytest
+from tests.compare_predictions import _base_check_imputation
+
+from fknni.faiss.faiss import FaissImputer
 
 
 @pytest.mark.gpu
-def test_gpu():
-    assert 1 + 1 == 2
+def test_median_imputation(simple_test_df):
+    """Tests if median imputation successfully fills all NaN values"""
+    data, data_missing = simple_test_df
+    data_original = data_missing.copy()
+    FaissImputer(n_neighbors=5, strategy="median", use_gpu=True).fit_transform(data_missing)
+    _base_check_imputation(data_original, data_missing)