Wrap consensus-map protein inference (#19)

timosachsenberg · web-flow · commit 802f92694198 · 2025-11-18T17:02:17.000+01:00
* Wrap consensus-map protein inference

* Add protein inference examples to README
diff --git a/README.md b/README.md
@@ -145,6 +145,38 @@ print(f"Consensus contains {len(consensus)} features")
 The helper returns a fresh `Py_ConsensusMap` instance that can be exported,
 converted to a pandas DataFrame, or iterated for downstream analysis.
 
+## Protein inference and rollups
+
+Recent wrappers expose multiple entry points for inferring proteins directly
+from the Python API: you can start from identification files, feature maps,
+or full consensus maps.
+
+```python
+from openms_python import Identifications, Py_FeatureMap, Py_ConsensusMap
+
+# 1) Run inference straight from an idXML file
+ids = Identifications.from_idxml("search_results.idXML")
+protein_summary = ids.infer_proteins(algorithm="bayesian")
+print(protein_summary.summary())
+
+# 2) Trigger inference on a feature map (assigned + unassigned peptides)
+fmap = Py_FeatureMap().load("sample.featureXML")
+proteins = fmap.infer_proteins(include_unassigned=True)
+proteins.to_idxml("sample_proteins.idXML")
+
+# 3) Operate directly on a consensus map
+consensus = Py_ConsensusMap().load("merged.consensusXML")
+consensus.infer_proteins(algorithm="basic")
+
+# Optionally compute quantitative protein ratios in place
+consensus.infer_protein_quantities(reference_map=1)
+consensus.store("merged_with_proteins.consensusXML")
+```
+
+The `infer_proteins` helpers share the same keyword-only parameters, accept a
+native `oms.Param` or a plain dictionary via `params`, and return `Identifications`;
+`infer_protein_quantities` returns the map instance itself for easy method chaining.
+
 ## Identification performance showcase
 
 Looking for a larger end-to-end example? `tests/test_idperformance.py` ships with
diff --git a/openms_python/py_consensusmap.py b/openms_python/py_consensusmap.py
@@ -9,6 +9,7 @@
 
 from ._io_utils import ensure_allowed_suffix, CONSENSUS_MAP_EXTENSIONS
 from .py_featuremap import Py_FeatureMap
+from .py_identifications import Identifications
 
 
 class Py_ConsensusMap:
@@ -112,6 +113,48 @@ def store(self, filepath: Union[str, Path]) -> 'Py_ConsensusMap':
         oms.ConsensusXMLFile().store(str(filepath), self._consensus_map)
         return self
 
+    # ==================== Protein inference ====================
+
+    def infer_proteins(
+        self,
+        *,
+        algorithm: str = "basic",
+        params: Optional[Union[oms.Param, Dict[str, Union[int, float, str]]]] = None,
+        include_unassigned: bool = False,
+        greedy_group_resolution: bool = True,
+        experimental_design: Optional[oms.ExperimentalDesign] = None,
+    ) -> Identifications:
+        """Run protein inference on the identifications attached to this map."""
+        # Every keyword argument is forwarded unchanged to Identifications.infer_proteins.
+        identifications = self._collect_identifications()
+        return identifications.infer_proteins(
+            algorithm=algorithm,
+            params=params,
+            consensus_map=self._consensus_map,  # makes the callee run on this map's own data
+            include_unassigned=include_unassigned,
+            greedy_group_resolution=greedy_group_resolution,
+            experimental_design=experimental_design,
+        )
+
+    def infer_protein_quantities(self, reference_map: int = 0) -> "Py_ConsensusMap":
+        """Infer protein-level quantities directly on this consensus map.
+
+        Wraps :class:`pyopenms.ProteinInference`, which attaches quantitative results
+        to the map's stored :class:`pyopenms.ProteinIdentification` entries based on
+        their associated peptide identifications.
+
+        Parameters
+        ----------
+        reference_map:
+            Index of the reference (e.g. iTRAQ) channel used as denominator
+            when calculating protein ratios; coerced with ``int()`` before use.
+
+        Returns this same instance so calls can be chained.
+        """
+        runner = oms.ProteinInference()
+        runner.infer(self._consensus_map, int(reference_map))
+        return self
+
     # ==================== Alignment helpers ====================
 
     @classmethod
@@ -362,3 +405,10 @@ def _link_feature_maps(
             consensus_map.setColumnHeaders(headers)
 
         return consensus_map
+
+    def _collect_identifications(self) -> Identifications:  # gather this map's protein and peptide IDs
+        proteins = self._consensus_map.getProteinIdentifications()
+        peptides = list(self._consensus_map.getUnassignedPeptideIdentifications())  # unassigned IDs come first
+        for feature in self._consensus_map:  # then the IDs attached to each consensus feature
+            peptides.extend(feature.getPeptideIdentifications())
+        return Identifications(proteins, peptides)
diff --git a/openms_python/py_featuremap.py b/openms_python/py_featuremap.py
@@ -2,12 +2,13 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Iterable, Iterator, Optional, Union
+from typing import Dict, Iterable, Iterator, Optional, Union
 
 import pyopenms as oms
 import pandas as pd
 from ._io_utils import ensure_allowed_suffix, FEATURE_MAP_EXTENSIONS
 from .py_feature import Py_Feature
+from .py_identifications import Identifications
 
 
 class Py_FeatureMap:
@@ -111,6 +112,28 @@ def store(self, filepath: Union[str, Path]) -> 'Py_FeatureMap':
         oms.FeatureXMLFile().store(str(filepath), self._feature_map)
         return self
 
+    # ==================== Protein inference ====================
+
+    def infer_proteins(
+        self,
+        *,
+        algorithm: str = "basic",
+        params: Optional[Union[oms.Param, Dict[str, Union[int, float, str]]]] = None,
+        include_unassigned: bool = False,
+        greedy_group_resolution: bool = True,
+        experimental_design: Optional[oms.ExperimentalDesign] = None,
+    ) -> Identifications:
+        """Run protein inference on the identifications stored in this map."""
+        # No consensus_map is forwarded, so inference runs on the collected ID lists.
+        identifications = self._collect_identifications()
+        return identifications.infer_proteins(
+            algorithm=algorithm,
+            params=params,
+            include_unassigned=include_unassigned,  # NOTE(review): callee reads this only with a consensus_map
+            greedy_group_resolution=greedy_group_resolution,
+            experimental_design=experimental_design,
+        )
+
     # ==================== pandas integration ====================
 
     def to_dataframe(self) -> pd.DataFrame:
@@ -251,7 +274,7 @@ def _normalize_meta_value(value: object):
             except Exception:
                 return value
         return value
-      
+
     def _as_native_feature(self, feature: Union[oms.Feature, Py_Feature]) -> oms.Feature:
         if isinstance(feature, Py_Feature):
             return feature.native
@@ -261,3 +284,9 @@ def _as_native_feature(self, feature: Union[oms.Feature, Py_Feature]) -> oms.Fea
             "Features must be pyopenms.Feature or Py_Feature instances, "
             f"got {type(feature).__name__}"
         )
+
+    def _collect_identifications(self) -> Identifications:  # gather this map's protein and peptide IDs
+        proteins = self._feature_map.getProteinIdentifications()
+        peptides = list(self._feature_map.getUnassignedPeptideIdentifications())  # unassigned IDs are always included
+        peptides.extend(self._feature_map.get_assigned_peptide_identifications())  # then the feature-assigned IDs
+        return Identifications(proteins, peptides)
diff --git a/openms_python/py_identifications.py b/openms_python/py_identifications.py
@@ -2,12 +2,15 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Iterable, Iterator, List, Optional, Sequence, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, Union, TYPE_CHECKING
 
 import pyopenms as oms
 
 from ._io_utils import ensure_allowed_suffix, IDENTIFICATION_EXTENSIONS
 
+if TYPE_CHECKING:  # pragma: no cover - imported for type checking only
+    from .py_consensusmap import Py_ConsensusMap
+
 
 class ProteinIdentifications:
     """Sequence-like container for :class:`pyopenms.ProteinIdentification`."""
@@ -246,6 +249,82 @@ def filter_by_fdr(
         kept_proteins = [entry for entry in proteins if _extract_q_value(entry, qvalue_keys, False) <= threshold]
         return Identifications(kept_proteins, self.peptides.copy())
 
+    def infer_proteins(
+        self,
+        *,
+        algorithm: str = "basic",
+        params: Optional[Union[oms.Param, Dict[str, Union[int, float, str]]]] = None,
+        consensus_map: Optional[Union["Py_ConsensusMap", oms.ConsensusMap]] = None,
+        include_unassigned: bool = False,
+        greedy_group_resolution: bool = True,
+        experimental_design: Optional[oms.ExperimentalDesign] = None,
+    ) -> "Identifications":
+        """Run protein inference on copies of the stored identifications and return them.
+
+        Parameters
+        ----------
+        algorithm:
+            Algorithm name, matched case-insensitively: ``"basic"`` or ``"bayesian"``.
+        params:
+            Optional dict or :class:`pyopenms.Param` applied to the OpenMS algorithm.
+        consensus_map:
+            When provided, inference is performed on identifications attached to
+            this :class:`Py_ConsensusMap` / :class:`pyopenms.ConsensusMap`
+            instead of the peptide list.
+        include_unassigned:
+            Only forwarded to the basic algorithm's consensus-map run; presumably
+            toggles use of unassigned peptide identifications (TODO confirm).
+        greedy_group_resolution:
+            Passed to Epifany (the Bayesian implementation) to control how
+            indistinguishable protein groups are resolved.
+        experimental_design:
+            Optional :class:`pyopenms.ExperimentalDesign` forwarded to the
+            Bayesian algorithm for replicate-aware inference.
+
+        Raises ``ValueError`` for an unknown algorithm name, or when a path that
+        needs a protein identification finds none.
+        """
+        algorithm = algorithm.lower()
+        peptides = [oms.PeptideIdentification(entry) for entry in self.peptides.native]  # copy-construct each entry
+        proteins = [oms.ProteinIdentification(entry) for entry in self.proteins.native]  # copy-construct each entry
+
+        if algorithm == "basic":
+            runner = oms.BasicProteinInferenceAlgorithm()
+            _apply_algorithm_params(runner, params)
+            if consensus_map is not None:
+                if not proteins:
+                    raise ValueError("Protein inference requires at least one protein identification")
+                native_map = _coerce_consensus_map(consensus_map)
+                runner.run(native_map, proteins[0], bool(include_unassigned))  # only the first protein run is passed
+            else:
+                runner.run(peptides, proteins)
+            return Identifications(proteins, peptides)
+
+        if algorithm == "bayesian":
+            runner = oms.BayesianProteinInferenceAlgorithm()
+            _apply_algorithm_params(runner, params)
+            if consensus_map is not None:
+                native_map = _coerce_consensus_map(consensus_map)
+                if experimental_design is not None:
+                    runner.inferPosteriorProbabilities(native_map, bool(greedy_group_resolution), experimental_design)
+                else:
+                    runner.inferPosteriorProbabilities(native_map, bool(greedy_group_resolution))
+                return Identifications(proteins, peptides)  # NOTE(review): these copies were never passed to the runner on this path — confirm intended
+
+            if not proteins:
+                raise ValueError("Protein inference requires at least one protein identification")
+            if experimental_design is not None:
+                runner.inferPosteriorProbabilities(proteins, peptides, bool(greedy_group_resolution), experimental_design)
+            else:
+                runner.inferPosteriorProbabilities(proteins, peptides, bool(greedy_group_resolution))
+            return Identifications(proteins, peptides)
+
+        raise ValueError(
+            "Unknown protein inference algorithm '{algorithm}'. Supported values are 'basic' and 'bayesian'.".format(
+                algorithm=algorithm
+            )
+        )
+
     def summary(self) -> dict:
         """Return basic counts about the contained identifications."""
 
@@ -345,3 +424,29 @@ def _target_decoy_label(entry, hit, is_peptide: bool) -> Optional[str]:
     if entry.metaValueExists("target_decoy"):
         return str(entry.getMetaValue("target_decoy")).lower()
     return None
+
+
+def _apply_algorithm_params(algorithm, params: Optional[Union[oms.Param, Dict[str, Union[int, float, str]]]]) -> None:
+    if params is None:  # nothing to configure; the algorithm keeps its current parameters
+        return
+    if isinstance(params, oms.Param):
+        algorithm.setParameters(oms.Param(params))  # hand over a copy-constructed Param
+        return
+    param_obj = algorithm.getParameters()  # otherwise overlay the mapping on the current parameters
+    for key, value in params.items():
+        param_obj.setValue(key, value)
+    algorithm.setParameters(param_obj)
+
+
+def _coerce_consensus_map(
+    consensus_map: Union["Py_ConsensusMap", oms.ConsensusMap]
+) -> oms.ConsensusMap:
+    if isinstance(consensus_map, oms.ConsensusMap):  # already native: return it as-is
+        return consensus_map
+    from .py_consensusmap import Py_ConsensusMap  # Local import to avoid circular dependency
+    # Wrappers are unwrapped via ``.native``; any other type is rejected below.
+    if isinstance(consensus_map, Py_ConsensusMap):
+        return consensus_map.native
+    raise TypeError(
+        "consensus_map must be a Py_ConsensusMap or pyopenms.ConsensusMap instance"
+    )
diff --git a/tests/test_py_consensusmap.py b/tests/test_py_consensusmap.py
@@ -4,6 +4,7 @@
 oms = pytest.importorskip("pyopenms")
 
 from openms_python.py_consensusmap import Py_ConsensusMap
+from openms_python.py_identifications import Identifications
 
 
 def build_consensus_map(count: int = 3) -> Py_ConsensusMap:
@@ -17,6 +18,28 @@ def build_consensus_map(count: int = 3) -> Py_ConsensusMap:
     return Py_ConsensusMap(cmap)
 
 
+def _protein(identifier: str, accession: str) -> oms.ProteinIdentification:  # test fixture builder
+    entry = oms.ProteinIdentification()
+    entry.setIdentifier(identifier)
+    hit = oms.ProteinHit()  # a single hit on which only the accession is set
+    hit.setAccession(accession)
+    entry.setHits([hit])
+    return entry
+
+
+def _peptide(identifier: str, sequence: str, score: float, accession: str) -> oms.PeptideIdentification:  # test fixture builder
+    entry = oms.PeptideIdentification()
+    entry.setIdentifier(identifier)
+    hit = oms.PeptideHit()  # a single hit with sequence, score and one protein evidence
+    hit.setSequence(oms.AASequence.fromString(sequence))
+    hit.setScore(score)
+    evidence = oms.PeptideEvidence()  # links the hit to the given protein accession
+    evidence.setProteinAccession(accession)
+    hit.setPeptideEvidences([evidence])
+    entry.setHits([hit])
+    return entry
+
+
 def test_py_consensusmap_len_and_indexing():
     cmap = build_consensus_map()
 
@@ -149,3 +172,44 @@ def test_py_consensusmap_from_dataframe_requires_columns():
 
     with pytest.raises(ValueError):
         Py_ConsensusMap.from_dataframe(df)
+
+
+def test_py_consensusmap_infer_proteins_uses_consensus_map():
+    protein = _protein("run", "P10")
+    pep_a = _peptide("run", "PEPA", 5.0, "P10")
+    pep_b = _peptide("run", "PEPB", 30.0, "P10")
+    # Both peptide IDs are attached to one consensus feature; none are left unassigned.
+    feature = oms.ConsensusFeature()
+    feature.setPeptideIdentifications([pep_a, pep_b])
+
+    cmap = oms.ConsensusMap()
+    cmap.setProteinIdentifications([protein])
+    cmap.push_back(feature)
+
+    wrapper = Py_ConsensusMap(cmap)
+    inferred = wrapper.infer_proteins(algorithm="basic", include_unassigned=True)
+    # The protein hit is expected to carry the higher of the two peptide scores (30.0).
+    assert isinstance(inferred, Identifications)
+    assert inferred.protein_identifications[0].getHits()[0].getScore() == pytest.approx(30.0)
+
+
+def test_py_consensusmap_infer_protein_quantities(monkeypatch):
+    captured = {}
+    # Stand-in for oms.ProteinInference that records how it is constructed and called.
+    class DummyInference:
+        def __init__(self):
+            captured["instantiated"] = True
+
+        def infer(self, consensus_map, reference_map):
+            captured["consensus_map"] = consensus_map
+            captured["reference_map"] = reference_map
+
+    monkeypatch.setattr(oms, "ProteinInference", lambda: DummyInference())
+    # With the stand-in patched in, the wrapper's call lands on DummyInference.infer.
+    wrapper = Py_ConsensusMap()
+    result = wrapper.infer_protein_quantities(reference_map=3)
+    # The wrapper must return itself and pass its native map and the reference index through.
+    assert result is wrapper
+    assert captured["instantiated"] is True
+    assert captured["consensus_map"] is wrapper.native
+    assert captured["reference_map"] == 3
diff --git a/tests/test_py_featuremap.py b/tests/test_py_featuremap.py
diff --git a/tests/test_py_identifications.py b/tests/test_py_identifications.py