test: add transcript selection checks (#13)

jsstevenson · web-flow · commit b1ccfce4e56b · 2024-01-25T16:49:39.000-05:00
diff --git a/misc/bug_hunting/check_align.py b/misc/bug_hunting/check_align.py
@@ -18,7 +18,7 @@
 with open("notebooks/analysis/results/mave_blat.pickle", "rb") as f:
     mave_blat_dict = pickle.load(f)
 
-with open("human_urns.txt", "r") as f:
+with open("misc/bug_hunting/human_urns.txt", "r") as f:
     urns = [line.strip() for line in f.readlines()]
 
 strand_reformat = {1: Strand.POSITIVE, -1: Strand.NEGATIVE}
diff --git a/misc/bug_hunting/check_transcript.py b/misc/bug_hunting/check_transcript.py
@@ -0,0 +1,84 @@
+import logging
+import asyncio
+import pickle
+
+from cool_seq_tool.schemas import Strand, TranscriptPriority
+
+from dcd_mapping.align import align
+from dcd_mapping.resources import get_scoreset_metadata, get_scoreset_records
+from dcd_mapping.transcripts import select_transcript
+
+logging.basicConfig(
+    filename="dcd-check-transcript.log",
+    format="%(asctime)s %(levelname)s:%(name)s:%(message)s",
+    level=logging.DEBUG,
+    force=True,
+)
+_logger = logging.getLogger(__name__)
+
+with open("misc/bug_hunting/human_urns.txt", "r") as f:
+    urns = [line.strip() for line in f.readlines()]
+
+with open("notebooks/analysis/results/mave_blat.pickle", "rb") as f:
+    mave_blat_dict = pickle.load(f)
+
+with open("notebooks/analysis/results/mappings.pickle", "rb") as f:
+    expected_mappings = pickle.load(f)
+
+async def check_tx_results():
+    for urn in urns:
+        try:
+            # prereqs
+            metadata = get_scoreset_metadata(urn)
+            records = get_scoreset_records(urn)
+            alignment = align(metadata, use_cached=True)
+        except Exception as e:
+            _logger.error("%s error before transcript selection: %s", urn, e)
+            continue
+        try:
+            tx_result = await select_transcript(metadata, records, alignment)
+        except Exception as e:
+            if urn in expected_mappings:
+                _logger.error("%s error during transcript selection: %s", urn, e)
+            else:
+                _logger.error("%s error during transcript selection (not in expected_mappings): %s", urn, e)
+            continue
+
+        if urn not in mave_blat_dict:
+            continue  # not in original experiment
+        if urn not in expected_mappings:
+            if tx_result is not None:
+                _logger.error("%s performed transcript selection when it shouldn't have", urn)
+            else:
+                _logger.info("Skipping %s as expected", urn)
+            continue
+        if urn in expected_mappings and tx_result is None:
+            _logger.error("%s skipped transcript selection when it shouldn't have", urn)
+            continue
+
+        def reformat_mane(status: str):
+            return TranscriptPriority[status.replace(" ", "_").upper()]
+
+        try:
+            expected = expected_mappings[urn]
+            for (name, actual, expected) in [
+                ("NP accession", tx_result.np, expected[0]),
+                ("start position", tx_result.start, expected[1]),
+                ("is full match", tx_result.is_full_match, expected[3]),
+                ("NM accession", tx_result.nm, expected[4]),
+                ("Tx priority", tx_result.transcript_mode, reformat_mane(expected[5]))
+            ]:
+                if actual != expected:
+                    _logger.error(
+                        "%s %s mismatch: %s (actual) vs %s (expected)",
+                        urn,
+                        name,
+                        actual,
+                        expected,
+                    )
+        except Exception as e:
+            _logger.error("%s exception: %s", urn, e)
+
+        _logger.info("%s completed successfully", urn)  # not a warning but w/e
+
+asyncio.run(check_tx_results())
diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py
@@ -3,7 +3,7 @@
 Outstanding questions/confusion
 -------------------------------
 * ``urn:mavedb:00000097-n-1``: unable to find any matching transcripts
-* Lots of scoresets (2023-) giving index errors in offset calculation
+* Lots of scoresets (esp. 2023-) giving index errors in offset calculation
 * Remove MANE sorting once upstream sorting is confirmed
 """
 import logging
@@ -274,7 +274,8 @@ def _offset_target_sequence(metadata: ScoresetMetadata, records: List[ScoreRow])
                 protein_sequence[i + p1 - p0] == amino_acids_by_position[p1],
                 protein_sequence[i + p2 - p0] == amino_acids_by_position[p2],
                 protein_sequence[i + p3 - p0] == amino_acids_by_position[p3],
-                protein_sequence[i + p4 - p0] == amino_acids_by_position[p4],
+                protein_sequence[i + p4 - p0]
+                == amino_acids_by_position[p4],  # TODO problem here-ish
             ]
         ):
             if i + 1 == min(amino_acids_by_position.keys()) or i + 2 == min(
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -4,7 +4,13 @@
 Notes on test cases:
 -------------------
 
-* urn:mavedb:00000068-a-1: TP53, protein-coding, DNA.
+* urn:mavedb:00000041-a-1: SRC, protein-coding, dna, uniprot ref
+* urn:mavedb:00000018-a-1: HBB promoter, regulatory, DNA
+* urn:mavedb:00000001-a-4: UBE2I, protein-coding, dna, uniprot ref
+* urn:mavedb:00000113-a-2: APP, protein-coding, protein sequence, uniprot ref. Not in original notebooks.
+* urn:mavedb:00000098-a-1: SCN5A, protein-coding, protein sequence, uniprot ref with offset
+* urn:mavedb:00000061-h-1: RAF, protein coding, DNA, uniprot ref with offset
+* urn:mavedb:00000068-a-1: TP53, protein-coding, DNA
 """
 import json
 import os
diff --git a/tests/fixtures/align_result.json b/tests/fixtures/align_result.json
@@ -23,16 +23,6 @@
       { "start": 37403170, "end": 37403325 }
     ]
   },
-  "urn:mavedb:00000098-a-1": {
-    "chrom": "chr3",
-    "strand": -1,
-    "coverage": 100.0,
-    "ident_pct": 100.0,
-    "query_range": { "start": 0, "end": 12 },
-    "query_subranges": [{ "start": 0, "end": 12 }],
-    "hit_range": { "start": 38551475, "end": 38551511 },
-    "hit_subranges": [{ "start": 38551475, "end": 38551511 }]
-  },
   "urn:mavedb:00000018-a-1": {
     "chrom": "chr11",
     "strand": 1,
@@ -43,6 +33,40 @@
     "hit_range": { "start": 5227021, "end": 5227208 },
     "hit_subranges": [{ "start": 5227021, "end": 5227208 }]
   },
+  "urn:mavedb:00000001-a-4": {
+    "chrom": "chr16",
+    "strand": 1,
+    "coverage": 100.0,
+    "ident_pct": 100.0,
+    "query_range": { "start": 0, "end": 477 },
+    "query_subranges": [
+      { "start": 0, "end": 66 },
+      { "start": 66, "end": 150 },
+      { "start": 150, "end": 223 },
+      { "start": 223, "end": 333 },
+      { "start": 333, "end": 413 },
+      { "start": 413, "end": 477 }
+    ],
+    "hit_range": { "start": 1314030, "end": 1324793 },
+    "hit_subranges": [
+      { "start": 1314030, "end": 1314096 },
+      { "start": 1314292, "end": 1314376 },
+      { "start": 1315653, "end": 1315726 },
+      { "start": 1320173, "end": 1320283 },
+      { "start": 1320437, "end": 1320517 },
+      { "start": 1324729, "end": 1324793 }
+    ]
+  },
+  "urn:mavedb:00000098-a-1": {
+    "chrom": "chr3",
+    "strand": -1,
+    "coverage": 100.0,
+    "ident_pct": 100.0,
+    "query_range": { "start": 0, "end": 12 },
+    "query_subranges": [{ "start": 0, "end": 12 }],
+    "hit_range": { "start": 38551475, "end": 38551511 },
+    "hit_subranges": [{ "start": 38551475, "end": 38551511 }]
+  },
   "urn:mavedb:00000061-h-1": {
     "chrom": "chr3",
     "strand": -1,
diff --git a/tests/fixtures/transcript_result.json b/tests/fixtures/transcript_result.json
@@ -7,28 +7,35 @@
     "transcript_mode": "mane_select",
     "sequence": "TODO"
   },
-    "urn:mavedb:00000001-a-4": {
-        "nm": "NM_003345.5",
-        "np": "NP_003336.1",
-        "start": 0,
-        "is_full_match": true,
-        "transcript_mode": "mane_select",
-        "sequence": "TODO"
-    },
-    "urn:mavedb:00000098-a-1": {
-        "nm": "NM_000335.5",
-        "np": "NP_000326.2",
-        "start": 1619,
-        "is_full_match": true,
-        "transcript_mode": "mane_select",
-        "sequence": "TODO"
-    },
-    "urn:mavedb:00000061-h-1": {
-        "nm": "NM_002880.4",
-        "np": "NP_002871.1",
-        "start": 51,
-        "is_full_match": true,
-        "transcript_mode": "mane_select",
-        "sequence": "TODO"
-    }
+  "urn:mavedb:00000001-a-4": {
+    "nm": "NM_003345.5",
+    "np": "NP_003336.1",
+    "start": 0,
+    "is_full_match": true,
+    "transcript_mode": "mane_select",
+    "sequence": "TODO"
+  },
+  "urn:mavedb:00000098-a-1": {
+    "nm": "NM_000335.5",
+    "np": "NP_000326.2",
+    "start": 1619,
+    "is_full_match": true,
+    "transcript_mode": "mane_select",
+    "sequence": "TODO"
+  },
+  "urn:mavedb:00000061-h-1": {
+    "nm": "NM_002880.4",
+    "np": "NP_002871.1",
+    "start": 51,
+    "is_full_match": true,
+    "transcript_mode": "mane_select",
+    "sequence": "TODO"
+  },
+  "urn:mavedb:00000068-a-1": {
+    "nm": "NM_000546.6",
+    "np": "NP_000537.3",
+    "start": 0,
+    "is_full_match": false,
+    "transcript_mode": "mane_select"
+  }
 }
diff --git a/tests/unit/test_transcript.py b/tests/unit/test_transcript.py
@@ -8,6 +8,15 @@
 from dcd_mapping.transcripts import select_transcript
 
 
+def check_transcript_results_equality(actual: TxSelectResult, expected: TxSelectResult):
+    """Check equality of transcript selection result vs fixture"""
+    assert actual.np == expected.np
+    assert actual.start == expected.start
+    assert actual.is_full_match is expected.is_full_match
+    assert actual.nm == expected.nm
+    assert actual.transcript_mode == expected.transcript_mode
+
+
 @pytest.mark.asyncio(scope="module")
 async def test_tx_src(
     scoreset_metadata_fixture: Dict[str, ScoresetMetadata],
@@ -17,18 +26,12 @@ async def test_tx_src(
     """Test transcript selection for urn:mavedb:00000041-a-1"""
     urn = "urn:mavedb:00000041-a-1"
     metadata = scoreset_metadata_fixture[urn]
-    records = get_scoreset_records(urn)  # TODO real fixture
+    records = get_scoreset_records(urn)
     alignment_result = align_result_fixture[urn]
     expected = transcript_results_fixture[urn]
-
     actual = await select_transcript(metadata, records, alignment_result)
-
     assert actual
-    assert actual.np == expected.np
-    assert actual.start == expected.start
-    assert actual.is_full_match is expected.is_full_match
-    assert actual.nm == expected.nm
-    assert actual.transcript_mode == expected.transcript_mode
+    check_transcript_results_equality(actual, expected)
 
 
 @pytest.mark.asyncio(scope="module")
@@ -43,15 +46,9 @@ async def test_tx_scn5a(
     records = get_scoreset_records(urn)
     alignment_result = align_result_fixture[urn]
     expected = transcript_results_fixture[urn]
-
     actual = await select_transcript(metadata, records, alignment_result)
-
     assert actual
-    assert actual.np == expected.np
-    assert actual.start == expected.start
-    assert actual.is_full_match is expected.is_full_match
-    assert actual.nm == expected.nm
-    assert actual.transcript_mode == expected.transcript_mode
+    check_transcript_results_equality(actual, expected)
 
 
 @pytest.mark.asyncio(scope="module")
@@ -64,7 +61,6 @@ async def test_tx_hbb(
     metadata = scoreset_metadata_fixture[urn]
     records = get_scoreset_records(urn)
     alignment_result = align_result_fixture[urn]
-
     actual = await select_transcript(metadata, records, alignment_result)
     assert actual is None
 
@@ -81,11 +77,23 @@ async def test_tx_raf(
     records = get_scoreset_records(urn)
     alignment_result = align_result_fixture[urn]
     expected = transcript_results_fixture[urn]
+    actual = await select_transcript(metadata, records, alignment_result)
+    assert actual
+    check_transcript_results_equality(actual, expected)
 
+
+@pytest.mark.asyncio(scope="module")
+async def test_tx_tp53(
+    scoreset_metadata_fixture: Dict[str, ScoresetMetadata],
+    align_result_fixture: Dict[str, AlignmentResult],
+    transcript_results_fixture: Dict[str, TxSelectResult],
+):
+    """Test transcript selection for urn:mavedb:00000068-a-1"""
+    urn = "urn:mavedb:00000068-a-1"
+    metadata = scoreset_metadata_fixture[urn]
+    records = get_scoreset_records(urn)
+    alignment_result = align_result_fixture[urn]
+    expected = transcript_results_fixture[urn]
     actual = await select_transcript(metadata, records, alignment_result)
     assert actual
-    assert actual.np == expected.np
-    assert actual.start == expected.start
-    assert actual.is_full_match is expected.is_full_match
-    assert actual.nm == expected.nm
-    assert actual.transcript_mode == expected.transcript_mode
+    check_transcript_results_equality(actual, expected)