Skip to content

Commit de3d5c0

Browse files
authored
feat!: get_genomic_mane_genes should return mane status (#398)
1 parent ef342d4 commit de3d5c0

File tree

5 files changed

+142
-31
lines changed

5 files changed

+142
-31
lines changed

src/cool_seq_tool/schemas.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,18 @@ def values(cls) -> list[str]:
4343
return [item.value for item in cls]
4444

4545

46+
class ManeStatus(str, Enum):
47+
"""Define constraints for MANE status"""
48+
49+
SELECT = "mane_select"
50+
PLUS_CLINICAL = "mane_plus_clinical"
51+
52+
4653
class TranscriptPriority(str, Enum):
4754
"""Create Enum for Transcript Priority labels"""
4855

49-
MANE_SELECT = "mane_select"
50-
MANE_PLUS_CLINICAL = "mane_plus_clinical"
56+
MANE_SELECT = ManeStatus.SELECT.value
57+
MANE_PLUS_CLINICAL = ManeStatus.PLUS_CLINICAL.value
5158
LONGEST_COMPATIBLE_REMAINING = "longest_compatible_remaining"
5259
GRCH38 = "grch38"
5360

@@ -137,6 +144,7 @@ class ManeGeneData(BaseModel, extra="forbid"):
137144
ncbi_gene_id: StrictInt
138145
hgnc_id: StrictInt | None
139146
symbol: StrictStr
147+
status: list[ManeStatus]
140148

141149

142150
class ServiceMeta(BaseModelForbidExtra):

src/cool_seq_tool/sources/mane_transcript_mappings.py

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -117,26 +117,58 @@ def get_genomic_mane_genes(
117117
:param end: Genomic end position. Assumes residue coordinates.
118118
:return: Unique MANE gene(s) found for a genomic location
119119
"""
120+
# Keep only rows on accession `ac` whose [chr_start, chr_end] range contains the queried start/end
120121
mane_rows = self.df.filter(
121122
(start >= pl.col("chr_start"))
122123
& (end <= pl.col("chr_end"))
123124
& (pl.col("GRCh38_chr") == ac)
124-
).unique(subset=["#NCBI_GeneID"])
125+
)
125126

126-
if len(mane_rows) == 0:
127+
if mane_rows.is_empty():
127128
return []
128129

129-
mane_rows = mane_rows.with_columns(
130-
pl.col("#NCBI_GeneID")
131-
.str.split_exact(":", 1)
132-
.struct.field("field_1")
133-
.cast(pl.Int32)
134-
.alias("ncbi_gene_id"),
135-
pl.col("HGNC_ID")
136-
.str.split_exact(":", 1)
137-
.struct.field("field_1")
138-
.cast(pl.Int32)
139-
.alias("hgnc_id"),
130+
# Group rows by NCBI gene ID and transform values to the representation we want.
131+
# The unique MANE statuses for each gene are collected into a list sorted in descending order
132+
mane_rows = mane_rows.group_by("#NCBI_GeneID").agg(
133+
[
134+
pl.col("#NCBI_GeneID")
135+
.first()
136+
.str.split_exact(":", 1)
137+
.struct.field("field_1")
138+
.cast(pl.Int32)
139+
.alias("ncbi_gene_id"),
140+
pl.col("HGNC_ID")
141+
.first()
142+
.str.split_exact(":", 1)
143+
.struct.field("field_1")
144+
.cast(pl.Int32)
145+
.alias("hgnc_id"),
146+
pl.col("MANE_status")
147+
.unique()
148+
.str.to_lowercase()
149+
.str.replace_all(" ", "_")
150+
.alias("status")
151+
.sort(descending=True),
152+
pl.col("symbol").first(),
153+
]
154+
)
155+
156+
# Sort final rows based on MANE status
157+
# First by length (which means gene has both select and plus clinical)
158+
# Then by DESC order
159+
# Then by NCBI ID ASC order
160+
mane_rows = (
161+
mane_rows.with_columns(
162+
[
163+
pl.col("status").list.len().alias("status_count"),
164+
pl.col("status").list.join("_").alias("status_str"),
165+
pl.col("ncbi_gene_id"),
166+
]
167+
)
168+
.sort(
169+
["status_count", "status_str", "ncbi_gene_id"],
170+
descending=[True, True, False],
171+
)
172+
.drop(["status_count", "status_str", "#NCBI_GeneID"])
140173
)
141-
mane_rows = mane_rows.select(["ncbi_gene_id", "hgnc_id", "symbol"])
142174
return [ManeGeneData(**mane_gene) for mane_gene in mane_rows.to_dicts()]

tests/conftest.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -340,10 +340,19 @@ def genomic_tx_data():
340340
@pytest.fixture(scope="session")
341341
def egfr_mane_gene():
342342
"""Create test fixture for EGFR MANE gene"""
343-
return ManeGeneData(ncbi_gene_id=1956, hgnc_id=3236, symbol="EGFR")
343+
return ManeGeneData(
344+
ncbi_gene_id=1956, hgnc_id=3236, symbol="EGFR", status=["mane_select"]
345+
)
344346

345347

346348
@pytest.fixture(scope="session")
347-
def braf_mane_gene():
349+
def braf_mane_genes():
348350
"""Create test fixture for BRAF MANE genes"""
349-
return ManeGeneData(ncbi_gene_id=673, hgnc_id=1097, symbol="BRAF")
351+
return [
352+
ManeGeneData(
353+
ncbi_gene_id=673,
354+
hgnc_id=1097,
355+
symbol="BRAF",
356+
status=["mane_select", "mane_plus_clinical"],
357+
),
358+
]

tests/mappers/test_mane_transcript.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,13 +143,13 @@ def grch38_egfr(egfr_mane_gene):
143143

144144

145145
@pytest.fixture(scope="module")
146-
def grch38_braf(braf_mane_gene):
146+
def grch38_braf(braf_mane_genes):
147147
"""Create a test fixture for grch38 responses BRAF V600E (genomic)."""
148148
params = {
149149
"pos": (140753335, 140753336),
150150
"status": TranscriptPriority.GRCH38.value,
151151
"ac": "NC_000007.14",
152-
"mane_genes": [braf_mane_gene],
152+
"mane_genes": braf_mane_genes,
153153
}
154154
return GenomicRepresentation(**params)
155155

tests/sources/test_mane_transcript_mappings.py

Lines changed: 72 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -209,43 +209,105 @@ def test_get_mane_data_from_chr_pos(
209209

210210

211211
def test_get_genomic_mane_genes(
212-
test_mane_transcript_mappings, braf_mane_gene, egfr_mane_gene
212+
test_mane_transcript_mappings, braf_mane_genes, egfr_mane_gene
213213
):
214214
"""Test that get_genomic_mane_genes method works correctly"""
215215
new_df = pl.DataFrame(
216216
{
217-
"#NCBI_GeneID": ["GeneID:673", "GeneID:673", "GeneID:1956", "GeneID:1"],
217+
"#NCBI_GeneID": [
218+
"GeneID:673",
219+
"GeneID:673",
220+
"GeneID:1956",
221+
"GeneID:1",
222+
"GeneID:2",
223+
"GeneID:2",
224+
"GeneID:3",
225+
],
218226
"Ensembl_Gene": [
219227
"ENSG00000157764.14",
220228
"ENSG00000157764.14",
221229
"ENSG00000146648.21",
222230
"ENSG1.1",
231+
"ENSG1.1",
232+
"ENSG1.1",
233+
"ENSG1.1",
234+
],
235+
"HGNC_ID": [
236+
"HGNC:1097",
237+
"HGNC:1097",
238+
"HGNC:3236",
239+
"HGNC:1",
240+
"HGNC:2",
241+
"HGNC:2",
242+
"HGNC:3",
223243
],
224-
"HGNC_ID": ["HGNC:1097", "HGNC:1097", "HGNC:3236", "HGNC:2"],
225-
"symbol": ["BRAF", "BRAF", "EGFR", "Dummy"],
244+
"symbol": ["BRAF", "BRAF", "EGFR", "Dummy1", "Dummy2", "Dummy2", "Dummy3"],
226245
"GRCh38_chr": [
227246
"NC_000007.14",
228247
"NC_000007.14",
229248
"NC_000007.14",
230249
"NC_000007.14",
250+
"NC_000007.14",
251+
"NC_000007.14",
252+
"NC_000007.14",
253+
],
254+
"chr_start": [
255+
140719337,
256+
140730665,
257+
55019017,
258+
55019017,
259+
55019017,
260+
55019017,
261+
55019017,
262+
],
263+
"chr_end": [
264+
140924929,
265+
140924929,
266+
55211628,
267+
55211628,
268+
55211628,
269+
55211628,
270+
55211628,
271+
],
272+
"MANE_status": [
273+
"MANE Plus Clinical",
274+
"MANE Select",
275+
"MANE Select",
276+
"MANE Plus Clinical",
277+
"MANE Select",
278+
"MANE Plus Clinical",
279+
"MANE Select",
231280
],
232-
"chr_start": [140719337, 140730665, 55019017, 55019017],
233-
"chr_end": [140924929, 140924929, 55211628, 55211628],
234281
}
235282
)
236283

237284
with patch.object(test_mane_transcript_mappings, "df", new_df):
238285
mane_genes = test_mane_transcript_mappings.get_genomic_mane_genes(
239286
"NC_000007.14", 140753336, 140753336
240287
)
241-
assert mane_genes == [braf_mane_gene]
288+
assert mane_genes == braf_mane_genes
242289

243290
mane_genes = test_mane_transcript_mappings.get_genomic_mane_genes(
244291
"NC_000007.14", 55191822, 55191822
245292
)
246-
assert len(mane_genes) == 2
247-
assert egfr_mane_gene in mane_genes
248-
assert ManeGeneData(ncbi_gene_id=1, hgnc_id=2, symbol="Dummy") in mane_genes
293+
assert mane_genes == [
294+
ManeGeneData(
295+
ncbi_gene_id=2,
296+
hgnc_id=2,
297+
symbol="Dummy2",
298+
status=["mane_select", "mane_plus_clinical"],
299+
),
300+
ManeGeneData(
301+
ncbi_gene_id=3, hgnc_id=3, symbol="Dummy3", status=["mane_select"]
302+
),
303+
egfr_mane_gene,
304+
ManeGeneData(
305+
ncbi_gene_id=1,
306+
hgnc_id=1,
307+
symbol="Dummy1",
308+
status=["mane_plus_clinical"],
309+
),
310+
]
249311

250312
# No MANE genes found for given genomic location
251313
mane_genes = test_mane_transcript_mappings.get_genomic_mane_genes(

0 commit comments

Comments
 (0)