feat!: get_genomic_mane_genes should return mane status (#398)

korikuzma · web-flow · commit de3d5c0750cd · 2025-02-12T10:08:55.000-05:00
diff --git a/src/cool_seq_tool/schemas.py b/src/cool_seq_tool/schemas.py
@@ -43,11 +43,18 @@ def values(cls) -> list[str]:
         return [item.value for item in cls]
 
 
+class ManeStatus(str, Enum):
+    """Define constraints for mane status"""
+
+    SELECT = "mane_select"
+    PLUS_CLINICAL = "mane_plus_clinical"
+
+
 class TranscriptPriority(str, Enum):
     """Create Enum for Transcript Priority labels"""
 
-    MANE_SELECT = "mane_select"
-    MANE_PLUS_CLINICAL = "mane_plus_clinical"
+    MANE_SELECT = ManeStatus.SELECT.value
+    MANE_PLUS_CLINICAL = ManeStatus.PLUS_CLINICAL.value
     LONGEST_COMPATIBLE_REMAINING = "longest_compatible_remaining"
     GRCH38 = "grch38"
 
@@ -137,6 +144,7 @@ class ManeGeneData(BaseModel, extra="forbid"):
     ncbi_gene_id: StrictInt
     hgnc_id: StrictInt | None
     symbol: StrictStr
+    status: list[ManeStatus]
 
 
 class ServiceMeta(BaseModelForbidExtra):
diff --git a/src/cool_seq_tool/sources/mane_transcript_mappings.py b/src/cool_seq_tool/sources/mane_transcript_mappings.py
@@ -117,26 +117,58 @@ def get_genomic_mane_genes(
         :param end: Genomic end position. Assumes residue coordinates.
         :return: Unique MANE gene(s) found for a genomic location
         """
+        # Keep only rows on `ac` whose chr_start..chr_end span contains start..end
         mane_rows = self.df.filter(
             (start >= pl.col("chr_start"))
             & (end <= pl.col("chr_end"))
             & (pl.col("GRCh38_chr") == ac)
-        ).unique(subset=["#NCBI_GeneID"])
+        )
 
-        if len(mane_rows) == 0:
+        if mane_rows.is_empty():
             return []
 
-        mane_rows = mane_rows.with_columns(
-            pl.col("#NCBI_GeneID")
-            .str.split_exact(":", 1)
-            .struct.field("field_1")
-            .cast(pl.Int32)
-            .alias("ncbi_gene_id"),
-            pl.col("HGNC_ID")
-            .str.split_exact(":", 1)
-            .struct.field("field_1")
-            .cast(pl.Int32)
-            .alias("hgnc_id"),
+        # Group rows by NCBI gene ID and parse numeric IDs; each gene's unique MANE
+        # statuses are normalized to snake_case and sorted DESC (select first)
+        mane_rows = mane_rows.group_by("#NCBI_GeneID").agg(
+            [
+                pl.col("#NCBI_GeneID")
+                .first()
+                .str.split_exact(":", 1)
+                .struct.field("field_1")
+                .cast(pl.Int32)
+                .alias("ncbi_gene_id"),
+                pl.col("HGNC_ID")
+                .first()
+                .str.split_exact(":", 1)
+                .struct.field("field_1")
+                .cast(pl.Int32)
+                .alias("hgnc_id"),
+                pl.col("MANE_status")
+                .unique()
+                .str.to_lowercase()
+                .str.replace_all(" ", "_")
+                .alias("status")
+                .sort(descending=True),
+                pl.col("symbol").first(),
+            ]
+        )
+
+        # Sort final rows based on MANE status:
+        # 1. Number of statuses, DESC (genes with both select and plus clinical first)
+        # 2. Joined status string, DESC (select-only before plus-clinical-only)
+        # 3. NCBI gene ID, ASC (tie-breaker)
+        mane_rows = (
+            mane_rows.with_columns(
+                [
+                    pl.col("status").list.len().alias("status_count"),
+                    pl.col("status").list.join("_").alias("status_str"),
+                    pl.col("ncbi_gene_id"),
+                ]
+            )
+            .sort(
+                ["status_count", "status_str", "ncbi_gene_id"],
+                descending=[True, True, False],
+            )
+            .drop(["status_count", "status_str", "#NCBI_GeneID"])
         )
-        mane_rows = mane_rows.select(["ncbi_gene_id", "hgnc_id", "symbol"])
         return [ManeGeneData(**mane_gene) for mane_gene in mane_rows.to_dicts()]
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -340,10 +340,19 @@ def genomic_tx_data():
 @pytest.fixture(scope="session")
 def egfr_mane_gene():
     """Create test fixture for EGFR MANE gene"""
-    return ManeGeneData(ncbi_gene_id=1956, hgnc_id=3236, symbol="EGFR")
+    return ManeGeneData(
+        ncbi_gene_id=1956, hgnc_id=3236, symbol="EGFR", status=["mane_select"]
+    )
 
 
 @pytest.fixture(scope="session")
-def braf_mane_gene():
+def braf_mane_genes():
     """Create test fixture for BRAF MANE gene"""
-    return ManeGeneData(ncbi_gene_id=673, hgnc_id=1097, symbol="BRAF")
+    return [
+        ManeGeneData(
+            ncbi_gene_id=673,
+            hgnc_id=1097,
+            symbol="BRAF",
+            status=["mane_select", "mane_plus_clinical"],
+        ),
+    ]
diff --git a/tests/mappers/test_mane_transcript.py b/tests/mappers/test_mane_transcript.py
@@ -143,13 +143,13 @@ def grch38_egfr(egfr_mane_gene):
 
 
 @pytest.fixture(scope="module")
-def grch38_braf(braf_mane_gene):
+def grch38_braf(braf_mane_genes):
     """Create a test fixture for grch38 responses BRAF V600E (genomic)."""
     params = {
         "pos": (140753335, 140753336),
         "status": TranscriptPriority.GRCH38.value,
         "ac": "NC_000007.14",
-        "mane_genes": [braf_mane_gene],
+        "mane_genes": braf_mane_genes,
     }
     return GenomicRepresentation(**params)
 
diff --git a/tests/sources/test_mane_transcript_mappings.py b/tests/sources/test_mane_transcript_mappings.py
@@ -209,43 +209,105 @@ def test_get_mane_data_from_chr_pos(
 
 
 def test_get_genomic_mane_genes(
-    test_mane_transcript_mappings, braf_mane_gene, egfr_mane_gene
+    test_mane_transcript_mappings, braf_mane_genes, egfr_mane_gene
 ):
     """Test that get_genomic_mane_genes method works correctly"""
     new_df = pl.DataFrame(
         {
-            "#NCBI_GeneID": ["GeneID:673", "GeneID:673", "GeneID:1956", "GeneID:1"],
+            "#NCBI_GeneID": [
+                "GeneID:673",
+                "GeneID:673",
+                "GeneID:1956",
+                "GeneID:1",
+                "GeneID:2",
+                "GeneID:2",
+                "GeneID:3",
+            ],
             "Ensembl_Gene": [
                 "ENSG00000157764.14",
                 "ENSG00000157764.14",
                 "ENSG00000146648.21",
                 "ENSG1.1",
+                "ENSG1.1",
+                "ENSG1.1",
+                "ENSG1.1",
+            ],
+            "HGNC_ID": [
+                "HGNC:1097",
+                "HGNC:1097",
+                "HGNC:3236",
+                "HGNC:1",
+                "HGNC:2",
+                "HGNC:2",
+                "HGNC:3",
             ],
-            "HGNC_ID": ["HGNC:1097", "HGNC:1097", "HGNC:3236", "HGNC:2"],
-            "symbol": ["BRAF", "BRAF", "EGFR", "Dummy"],
+            "symbol": ["BRAF", "BRAF", "EGFR", "Dummy1", "Dummy2", "Dummy2", "Dummy3"],
             "GRCh38_chr": [
                 "NC_000007.14",
                 "NC_000007.14",
                 "NC_000007.14",
                 "NC_000007.14",
+                "NC_000007.14",
+                "NC_000007.14",
+                "NC_000007.14",
+            ],
+            "chr_start": [
+                140719337,
+                140730665,
+                55019017,
+                55019017,
+                55019017,
+                55019017,
+                55019017,
+            ],
+            "chr_end": [
+                140924929,
+                140924929,
+                55211628,
+                55211628,
+                55211628,
+                55211628,
+                55211628,
+            ],
+            "MANE_status": [
+                "MANE Plus Clinical",
+                "MANE Select",
+                "MANE Select",
+                "MANE Plus Clinical",
+                "MANE Select",
+                "MANE Plus Clinical",
+                "MANE Select",
             ],
-            "chr_start": [140719337, 140730665, 55019017, 55019017],
-            "chr_end": [140924929, 140924929, 55211628, 55211628],
         }
     )
 
     with patch.object(test_mane_transcript_mappings, "df", new_df):
         mane_genes = test_mane_transcript_mappings.get_genomic_mane_genes(
             "NC_000007.14", 140753336, 140753336
         )
-        assert mane_genes == [braf_mane_gene]
+        assert mane_genes == braf_mane_genes
 
         mane_genes = test_mane_transcript_mappings.get_genomic_mane_genes(
             "NC_000007.14", 55191822, 55191822
         )
-        assert len(mane_genes) == 2
-        assert egfr_mane_gene in mane_genes
-        assert ManeGeneData(ncbi_gene_id=1, hgnc_id=2, symbol="Dummy") in mane_genes
+        assert mane_genes == [
+            ManeGeneData(
+                ncbi_gene_id=2,
+                hgnc_id=2,
+                symbol="Dummy2",
+                status=["mane_select", "mane_plus_clinical"],
+            ),
+            ManeGeneData(
+                ncbi_gene_id=3, hgnc_id=3, symbol="Dummy3", status=["mane_select"]
+            ),
+            egfr_mane_gene,
+            ManeGeneData(
+                ncbi_gene_id=1,
+                hgnc_id=1,
+                symbol="Dummy1",
+                status=["mane_plus_clinical"],
+            ),
+        ]
 
         # No MANE genes found for given genomic location
         mane_genes = test_mane_transcript_mappings.get_genomic_mane_genes(