Skip to content
This repository was archived by the owner on Dec 22, 2025. It is now read-only.

Commit b1ccfce

Browse files
authored
test: add transcript selection checks (#13)
1 parent b855c5d commit b1ccfce

File tree

7 files changed

+189
-59
lines changed

7 files changed

+189
-59
lines changed

misc/bug_hunting/check_align.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
with open("notebooks/analysis/results/mave_blat.pickle", "rb") as f:
1919
mave_blat_dict = pickle.load(f)
2020

21-
with open("human_urns.txt", "r") as f:
21+
with open("misc/bug_hunting/human_urns.txt", "r") as f:
2222
urns = [line.strip() for line in f.readlines()]
2323

2424
strand_reformat = {1: Strand.POSITIVE, -1: Strand.NEGATIVE}
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import logging
2+
import asyncio
3+
import pickle
4+
5+
from cool_seq_tool.schemas import Strand, TranscriptPriority
6+
7+
from dcd_mapping.align import align
8+
from dcd_mapping.resources import get_scoreset_metadata, get_scoreset_records
9+
from dcd_mapping.transcripts import select_transcript
10+
11+
logging.basicConfig(
12+
filename="dcd-check-transcript.log",
13+
format="%(asctime)s %(levelname)s:%(name)s:%(message)s",
14+
level=logging.DEBUG,
15+
force=True,
16+
)
17+
_logger = logging.getLogger(__name__)
18+
19+
with open("misc/bug_hunting/human_urns.txt", "r") as f:
20+
urns = [line.strip() for line in f.readlines()]
21+
22+
with open("notebooks/analysis/results/mave_blat.pickle", "rb") as f:
23+
mave_blat_dict = pickle.load(f)
24+
25+
with open("notebooks/analysis/results/mappings.pickle", "rb") as f:
26+
expected_mappings = pickle.load(f)
27+
28+
async def check_tx_results():
29+
for urn in urns:
30+
try:
31+
# prereqs
32+
metadata = get_scoreset_metadata(urn)
33+
records = get_scoreset_records(urn)
34+
alignment = align(metadata, use_cached=True)
35+
except Exception as e:
36+
_logger.error("%s error before transcript selection: %s", urn, e)
37+
continue
38+
try:
39+
tx_result = await select_transcript(metadata, records, alignment)
40+
except Exception as e:
41+
if urn in expected_mappings:
42+
_logger.error("%s error during transcript selection: %s", urn, e)
43+
else:
44+
_logger.error("%s error during transcript selection (not in expected_mappings): %s", urn, e)
45+
continue
46+
47+
if urn not in mave_blat_dict:
48+
continue # not in original experiment
49+
if urn not in expected_mappings:
50+
if tx_result is not None:
51+
_logger.error("%s performed transcript selection when it shouldn't have", urn)
52+
else:
53+
_logger.info("Skipping %s as expected", urn)
54+
continue
55+
if urn in expected_mappings and tx_result is None:
56+
_logger.error("%s skipped transcript selection when it shouldn't have", urn)
57+
continue
58+
59+
def reformat_mane(status: str):
60+
return TranscriptPriority[status.replace(" ", "_").upper()]
61+
62+
try:
63+
expected = expected_mappings[urn]
64+
for (name, actual, expected) in [
65+
("NP accession", tx_result.np, expected[0]),
66+
("start position", tx_result.start, expected[1]),
67+
("is full match", tx_result.is_full_match, expected[3]),
68+
("NM accession", tx_result.nm, expected[4]),
69+
("Tx priority", tx_result.transcript_mode, reformat_mane(expected[5]))
70+
]:
71+
if actual != expected:
72+
_logger.error(
73+
"%s %s mismatch: %s (actual) vs %s (expected)",
74+
urn,
75+
name,
76+
actual,
77+
expected,
78+
)
79+
except Exception as e:
80+
_logger.error("%s exception: %s", urn, e)
81+
82+
_logger.info("%s completed successfully", urn) # not a warning but w/e
83+
84+
asyncio.run(check_tx_results())

src/dcd_mapping/transcripts.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
Outstanding questions/confusion
44
-------------------------------
55
* ``urn:mavedb:00000097-n-1``: unable to find any matching transcripts
6-
* Lots of scoresets (2023-) giving index errors in offset calculation
6+
* Lots of scoresets (esp. 2023-) giving index errors in offset calculation
77
* Remove MANE sorting once upstream sorting is confirmed
88
"""
99
import logging
@@ -274,7 +274,8 @@ def _offset_target_sequence(metadata: ScoresetMetadata, records: List[ScoreRow])
274274
protein_sequence[i + p1 - p0] == amino_acids_by_position[p1],
275275
protein_sequence[i + p2 - p0] == amino_acids_by_position[p2],
276276
protein_sequence[i + p3 - p0] == amino_acids_by_position[p3],
277-
protein_sequence[i + p4 - p0] == amino_acids_by_position[p4],
277+
protein_sequence[i + p4 - p0]
278+
== amino_acids_by_position[p4], # TODO problem here-ish
278279
]
279280
):
280281
if i + 1 == min(amino_acids_by_position.keys()) or i + 2 == min(

tests/conftest.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,13 @@
44
Notes on test cases:
55
-------------------
66
7-
* urn:mavedb:00000068-a-1: TP53, protein-coding, DNA.
7+
* urn:mavedb:00000041-a-1: SRC, protein-coding, dna, uniprot ref
8+
* urn:mavedb:00000018-a-1: HBB promoter, regulatory, DNA
9+
* urn:mavedb:00000001-a-4: UBE2I, protein-coding, dna, uniprot ref
10+
* urn:mavedb:00000113-a-2: APP, protein-coding, protein sequence, uniprot ref. Not in original notebooks.
11+
* urn:mavedb:00000098-a-1: SCN5A, protein-coding, protein sequence, uniprot ref with offset
12+
* urn:mavedb:00000061-h-1: RAF, protein coding, DNA, uniprot ref with offset
13+
* urn:mavedb:00000068-a-1: TP53, protein-coding, DNA
814
"""
915
import json
1016
import os

tests/fixtures/align_result.json

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,6 @@
2323
{ "start": 37403170, "end": 37403325 }
2424
]
2525
},
26-
"urn:mavedb:00000098-a-1": {
27-
"chrom": "chr3",
28-
"strand": -1,
29-
"coverage": 100.0,
30-
"ident_pct": 100.0,
31-
"query_range": { "start": 0, "end": 12 },
32-
"query_subranges": [{ "start": 0, "end": 12 }],
33-
"hit_range": { "start": 38551475, "end": 38551511 },
34-
"hit_subranges": [{ "start": 38551475, "end": 38551511 }]
35-
},
3626
"urn:mavedb:00000018-a-1": {
3727
"chrom": "chr11",
3828
"strand": 1,
@@ -43,6 +33,40 @@
4333
"hit_range": { "start": 5227021, "end": 5227208 },
4434
"hit_subranges": [{ "start": 5227021, "end": 5227208 }]
4535
},
36+
"urn:mavedb:00000001-a-4": {
37+
"chrom": "chr16",
38+
"strand": 1,
39+
"coverage": 100.0,
40+
"ident_pct": 100.0,
41+
"query_range": { "start": 0, "end": 477 },
42+
"query_subranges": [
43+
{ "start": 0, "end": 66 },
44+
{ "start": 66, "end": 150 },
45+
{ "start": 150, "end": 223 },
46+
{ "start": 223, "end": 333 },
47+
{ "start": 333, "end": 413 },
48+
{ "start": 413, "end": 477 }
49+
],
50+
"hit_range": { "start": 1314030, "end": 1324793 },
51+
"hit_subranges": [
52+
{ "start": 1314030, "end": 1314096 },
53+
{ "start": 1314292, "end": 1314376 },
54+
{ "start": 1315653, "end": 1315726 },
55+
{ "start": 1320173, "end": 1320283 },
56+
{ "start": 1320437, "end": 1320517 },
57+
{ "start": 1324729, "end": 1324793 }
58+
]
59+
},
60+
"urn:mavedb:00000098-a-1": {
61+
"chrom": "chr3",
62+
"strand": -1,
63+
"coverage": 100.0,
64+
"ident_pct": 100.0,
65+
"query_range": { "start": 0, "end": 12 },
66+
"query_subranges": [{ "start": 0, "end": 12 }],
67+
"hit_range": { "start": 38551475, "end": 38551511 },
68+
"hit_subranges": [{ "start": 38551475, "end": 38551511 }]
69+
},
4670
"urn:mavedb:00000061-h-1": {
4771
"chrom": "chr3",
4872
"strand": -1,

tests/fixtures/transcript_result.json

Lines changed: 31 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,28 +7,35 @@
77
"transcript_mode": "mane_select",
88
"sequence": "TODO"
99
},
10-
"urn:mavedb:00000001-a-4": {
11-
"nm": "NM_003345.5",
12-
"np": "NP_003336.1",
13-
"start": 0,
14-
"is_full_match": true,
15-
"transcript_mode": "mane_select",
16-
"sequence": "TODO"
17-
},
18-
"urn:mavedb:00000098-a-1": {
19-
"nm": "NM_000335.5",
20-
"np": "NP_000326.2",
21-
"start": 1619,
22-
"is_full_match": true,
23-
"transcript_mode": "mane_select",
24-
"sequence": "TODO"
25-
},
26-
"urn:mavedb:00000061-h-1": {
27-
"nm": "NM_002880.4",
28-
"np": "NP_002871.1",
29-
"start": 51,
30-
"is_full_match": true,
31-
"transcript_mode": "mane_select",
32-
"sequence": "TODO"
33-
}
10+
"urn:mavedb:00000001-a-4": {
11+
"nm": "NM_003345.5",
12+
"np": "NP_003336.1",
13+
"start": 0,
14+
"is_full_match": true,
15+
"transcript_mode": "mane_select",
16+
"sequence": "TODO"
17+
},
18+
"urn:mavedb:00000098-a-1": {
19+
"nm": "NM_000335.5",
20+
"np": "NP_000326.2",
21+
"start": 1619,
22+
"is_full_match": true,
23+
"transcript_mode": "mane_select",
24+
"sequence": "TODO"
25+
},
26+
"urn:mavedb:00000061-h-1": {
27+
"nm": "NM_002880.4",
28+
"np": "NP_002871.1",
29+
"start": 51,
30+
"is_full_match": true,
31+
"transcript_mode": "mane_select",
32+
"sequence": "TODO"
33+
},
34+
"urn:mavedb:00000068-a-1": {
35+
"nm": "NM_000546.6",
36+
"np": "NP_000537.3",
37+
"start": 0,
38+
"is_full_match": false,
39+
"transcript_mode": "mane_select"
40+
}
3441
}

tests/unit/test_transcript.py

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,15 @@
88
from dcd_mapping.transcripts import select_transcript
99

1010

11+
def check_transcript_results_equality(actual: TxSelectResult, expected: TxSelectResult):
12+
"""Check equality of transcript selection result vs fixture"""
13+
assert actual.np == expected.np
14+
assert actual.start == expected.start
15+
assert actual.is_full_match is expected.is_full_match
16+
assert actual.nm == expected.nm
17+
assert actual.transcript_mode == expected.transcript_mode
18+
19+
1120
@pytest.mark.asyncio(scope="module")
1221
async def test_tx_src(
1322
scoreset_metadata_fixture: Dict[str, ScoresetMetadata],
@@ -17,18 +26,12 @@ async def test_tx_src(
1726
"""Test transcript selection for urn:mavedb:00000041-a-1"""
1827
urn = "urn:mavedb:00000041-a-1"
1928
metadata = scoreset_metadata_fixture[urn]
20-
records = get_scoreset_records(urn) # TODO real fixture
29+
records = get_scoreset_records(urn)
2130
alignment_result = align_result_fixture[urn]
2231
expected = transcript_results_fixture[urn]
23-
2432
actual = await select_transcript(metadata, records, alignment_result)
25-
2633
assert actual
27-
assert actual.np == expected.np
28-
assert actual.start == expected.start
29-
assert actual.is_full_match is expected.is_full_match
30-
assert actual.nm == expected.nm
31-
assert actual.transcript_mode == expected.transcript_mode
34+
check_transcript_results_equality(actual, expected)
3235

3336

3437
@pytest.mark.asyncio(scope="module")
@@ -43,15 +46,9 @@ async def test_tx_scn5a(
4346
records = get_scoreset_records(urn)
4447
alignment_result = align_result_fixture[urn]
4548
expected = transcript_results_fixture[urn]
46-
4749
actual = await select_transcript(metadata, records, alignment_result)
48-
4950
assert actual
50-
assert actual.np == expected.np
51-
assert actual.start == expected.start
52-
assert actual.is_full_match is expected.is_full_match
53-
assert actual.nm == expected.nm
54-
assert actual.transcript_mode == expected.transcript_mode
51+
check_transcript_results_equality(actual, expected)
5552

5653

5754
@pytest.mark.asyncio(scope="module")
@@ -64,7 +61,6 @@ async def test_tx_hbb(
6461
metadata = scoreset_metadata_fixture[urn]
6562
records = get_scoreset_records(urn)
6663
alignment_result = align_result_fixture[urn]
67-
6864
actual = await select_transcript(metadata, records, alignment_result)
6965
assert actual is None
7066

@@ -81,11 +77,23 @@ async def test_tx_raf(
8177
records = get_scoreset_records(urn)
8278
alignment_result = align_result_fixture[urn]
8379
expected = transcript_results_fixture[urn]
80+
actual = await select_transcript(metadata, records, alignment_result)
81+
assert actual
82+
check_transcript_results_equality(actual, expected)
8483

84+
85+
@pytest.mark.asyncio(scope="module")
86+
async def test_tx_tp53(
87+
scoreset_metadata_fixture: Dict[str, ScoresetMetadata],
88+
align_result_fixture: Dict[str, AlignmentResult],
89+
transcript_results_fixture: Dict[str, TxSelectResult],
90+
):
91+
"""Test transcript selection for urn:mavedb:00000068-a-1"""
92+
urn = "urn:mavedb:00000068-a-1"
93+
metadata = scoreset_metadata_fixture[urn]
94+
records = get_scoreset_records(urn)
95+
alignment_result = align_result_fixture[urn]
96+
expected = transcript_results_fixture[urn]
8597
actual = await select_transcript(metadata, records, alignment_result)
8698
assert actual
87-
assert actual.np == expected.np
88-
assert actual.start == expected.start
89-
assert actual.is_full_match is expected.is_full_match
90-
assert actual.nm == expected.nm
91-
assert actual.transcript_mode == expected.transcript_mode
99+
check_transcript_results_equality(actual, expected)

0 commit comments

Comments
 (0)