Add support for multiple selected (reference) groups in create_contrasts

Intron7 · Intron7 · commit c9540c90566b · 2026-03-09T22:14:34.000+01:00
diff --git a/src/rapids_singlecell/pertpy_gpu/_distance.py b/src/rapids_singlecell/pertpy_gpu/_distance.py
@@ -355,7 +355,7 @@ def bootstrap(
     def create_contrasts(
         adata: AnnData,
         groupby: str,
-        selected_group: str,
+        selected_group: str | Sequence[str],
         *,
         groups: Sequence[str] | None = None,
         split_by: str | Sequence[str] | None = None,
@@ -382,7 +382,10 @@ def create_contrasts(
             Column in ``adata.obs`` whose levels are compared against
             ``selected_group``
         selected_group
-            The reference (control) value in the ``groupby`` column
+            The reference (control) value(s) in the ``groupby`` column.
+            When a sequence is passed, each target is compared against
+            every reference, producing one row per (target, reference)
+            combination.
         groups
             Specific groups to include. If None, all non-reference groups
             are included.
@@ -405,6 +408,12 @@ def create_contrasts(
         ...     adata, groupby="target_gene", selected_group="Non_target"
         ... )
 
+        >>> # Multiple references
+        >>> contrasts = Distance.create_contrasts(
+        ...     adata, groupby="target_gene",
+        ...     selected_group=["Non_target", "Scramble"],
+        ... )
+
         >>> # Stratified by celltype
         >>> contrasts = Distance.create_contrasts(
         ...     adata, groupby="target_gene", selected_group="Non_target",
@@ -425,10 +434,16 @@ def create_contrasts(
         """
         import pandas as pd
 
-        if selected_group not in adata.obs[groupby].values:
-            raise ValueError(
-                f"Reference '{selected_group}' not found in column '{groupby}'"
-            )
+        # Normalize to list
+        if isinstance(selected_group, str):
+            selected_groups = [selected_group]
+        else:
+            selected_groups = list(selected_group)
+
+        obs_values = set(adata.obs[groupby].values)
+        for sg in selected_groups:
+            if sg not in obs_values:
+                raise ValueError(f"Reference '{sg}' not found in column '{groupby}'")
 
         if split_by is None:
             split_cols: list[str] = []
@@ -438,41 +453,46 @@ def create_contrasts(
             split_cols = list(split_by)
 
         allowed_groups = set(groups) if groups is not None else None
+        selected_set = set(selected_groups)
         all_cols = [groupby, *split_cols]
 
-        if split_cols:
-            # Get all existing (groupby, *split) combinations in one pass
-            existing = adata.obs[all_cols].drop_duplicates().reset_index(drop=True)
+        parts: list[pd.DataFrame] = []
+        for sg in selected_groups:
+            if split_cols:
+                existing = adata.obs[all_cols].drop_duplicates().reset_index(drop=True)
 
-            # Find which splits have the reference
-            ref_rows = existing[existing[groupby] == selected_group]
-            if len(ref_rows) == 0:
-                df = pd.DataFrame(columns=all_cols)
-            else:
-                # Inner join: keep only targets in splits that have reference
+                ref_rows = existing[existing[groupby] == sg]
+                if len(ref_rows) == 0:
+                    continue
                 ref_splits = ref_rows[split_cols]
-                targets = existing[existing[groupby] != selected_group]
+                targets = existing[~existing[groupby].isin(selected_set)]
                 if allowed_groups is not None:
                     targets = targets[targets[groupby].isin(allowed_groups)]
-                df = targets.merge(ref_splits, on=split_cols, how="inner")
-                df = (
-                    df[all_cols]
-                    .sort_values([*split_cols, groupby])
-                    .reset_index(drop=True)
-                )
-        else:
-            # No split — just all non-reference levels of groupby
-            targets = adata.obs[groupby].unique()
-            targets = [
-                t
-                for t in targets
-                if t != selected_group
-                and (allowed_groups is None or t in allowed_groups)
-            ]
-            df = pd.DataFrame({groupby: targets})
-
-        # Insert reference column right after groupby
-        df.insert(1, "reference", selected_group)
+                matched = targets.merge(ref_splits, on=split_cols, how="inner")
+                if len(matched) == 0:
+                    continue
+                matched = matched[all_cols].copy()
+            else:
+                target_vals = [
+                    t
+                    for t in adata.obs[groupby].unique()
+                    if t not in selected_set
+                    and (allowed_groups is None or t in allowed_groups)
+                ]
+                if not target_vals:
+                    continue
+                matched = pd.DataFrame({groupby: target_vals})
+
+            matched.insert(1, "reference", sg)
+            parts.append(matched)
+
+        if not parts:
+            cols = [groupby, "reference", *split_cols]
+            return pd.DataFrame(columns=cols)
+
+        df = pd.concat(parts, ignore_index=True)
+        sort_cols = ["reference", *split_cols, groupby]
+        df = df.sort_values(sort_cols).reset_index(drop=True)
 
         return df
 
diff --git a/tests/pertpy/test_distances.py b/tests/pertpy/test_distances.py
@@ -457,6 +457,101 @@ def test_contrast_distances_no_split(contrast_adata: AnnData) -> None:
     assert np.all(np.isfinite(result["edistance"].values))
 
 
+def test_contrast_distances_multiple_references() -> None:
+    """Check multi-reference contrasts, split by celltype, against direct distances."""
+    rng = np.random.default_rng(42)
+    n = 10
+    cpu_emb = rng.normal(size=(n * 6, 5)).astype(np.float32)
+    obs = pd.DataFrame(
+        {
+            "treatment": pd.Categorical(
+                ["ref1"] * n  # celltype T
+                + ["ref2"] * n  # celltype T
+                + ["drugA"] * n  # celltype T
+                + ["drugB"] * n  # celltype T
+                + ["ref1"] * n  # celltype B
+                + ["drugA"] * n  # celltype B
+            ),
+            "celltype": pd.Categorical(["T"] * n * 4 + ["B"] * n * 2),
+        }
+    )
+    adata = AnnData(cpu_emb.copy(), obs=obs)
+    adata.obsm["X_pca"] = cp.asarray(cpu_emb, dtype=cp.float32)
+
+    from rapids_singlecell.pertpy_gpu._metrics._edistance import EDistanceMetric
+
+    d = EDistanceMetric(obsm_key="X_pca")
+    distance = Distance(metric="edistance")
+
+    # Two references; ref2 occurs only in celltype "T", ref1 in both "T" and "B"
+    contrasts = Distance.create_contrasts(
+        adata,
+        groupby="treatment",
+        selected_group=["ref1", "ref2"],
+        split_by="celltype",
+    )
+
+    # Reference levels must never be listed as targets
+    assert "ref1" not in contrasts["treatment"].values
+    assert "ref2" not in contrasts["treatment"].values
+
+    # Both references should appear in the reference column
+    assert "ref1" in contrasts["reference"].values
+    assert "ref2" in contrasts["reference"].values
+
+    result = distance.contrast_distances(adata, contrasts=contrasts)
+    assert "edistance" in result.columns
+
+    # Recompute each row's distance directly with EDistanceMetric.compute_distance
+    for _, row in result.iterrows():
+        mask_target = (adata.obs["treatment"].values == row["treatment"]) & (
+            adata.obs["celltype"].values == row["celltype"]
+        )
+        mask_ref = (adata.obs["treatment"].values == row["reference"]) & (
+            adata.obs["celltype"].values == row["celltype"]
+        )
+        X = adata.obsm["X_pca"][mask_target]
+        Y = adata.obsm["X_pca"][mask_ref]
+
+        if len(X) == 0 or len(Y) == 0:  # nothing to compare when either side is empty
+            continue
+        expected = d.compute_distance(X, Y)
+        np.testing.assert_allclose(row["edistance"], expected, rtol=1e-5, atol=1e-5)
+
+
+def test_contrast_distances_multiple_references_no_split() -> None:
+    """Without split_by, every target is paired with every reference."""
+    rng = np.random.default_rng(42)
+    n = 15
+    cpu_emb = rng.normal(size=(n * 4, 5)).astype(np.float32)
+    obs = pd.DataFrame(
+        {
+            "treatment": pd.Categorical(
+                ["ref1"] * n + ["ref2"] * n + ["drugA"] * n + ["drugB"] * n
+            ),
+        }
+    )
+    adata = AnnData(cpu_emb.copy(), obs=obs)
+    adata.obsm["X_pca"] = cp.asarray(cpu_emb, dtype=cp.float32)
+
+    distance = Distance(metric="edistance")
+
+    contrasts = Distance.create_contrasts(
+        adata,
+        groupby="treatment",
+        selected_group=["ref1", "ref2"],
+    )
+
+    # 2 targets (drugA, drugB) x 2 references (ref1, ref2) = 4 rows
+    assert len(contrasts) == 4
+    assert set(contrasts["treatment"].values) == {"drugA", "drugB"}
+    assert set(contrasts["reference"].values) == {"ref1", "ref2"}
+
+    result = distance.contrast_distances(adata, contrasts=contrasts)
+    assert len(result) == 4  # one result row per contrast row
+    assert np.all(np.isfinite(result["edistance"].values))
+
+
 def test_contrast_distances_filtered(contrast_adata: AnnData) -> None:
     """Test that filtering a contrasts DataFrame before computing works."""
     from rapids_singlecell.pertpy_gpu._metrics._edistance import EDistanceMetric