Skip to content

Commit 12525e1

Browse files
author
Dana Elizabeth Wyman
committed
Added test case to cover user-reported bug in which a read that was supposed to be Genomic was misclassified as Intergenic
1 parent 59567e9 commit 12525e1

File tree

5 files changed

+78
-0
lines changed

5 files changed

+78
-0
lines changed

testing_suite/build_test_databases.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,23 @@
8989
print(e)
9090
sys.exit("Database initialization failed on chr22 annotation")
9191

92+
try:
93+
os.system("mkdir -p scratch/multiexon_read_overlapping_monoexon_transcript")
94+
subprocess.check_output(
95+
["talon_initialize_database",
96+
"--f", "input_files/multiexon_read_overlapping_monoexon_transcript/HMGB1P1.gtf",
97+
"--a", "gencode_v29",
98+
"--5p", "500",
99+
"--3p", "300",
100+
"--idprefix", "ENCODEH",
101+
"--l", "300",
102+
"--g", "hg38", "--o",
103+
"scratch/multiexon_read_overlapping_monoexon_transcript/talon"])
104+
105+
except Exception as e:
106+
print(e)
107+
sys.exit("Database initialization failed on HMGB1P1 annotation")
108+
92109
# Actually perform the toy_mod run
93110
try:
94111
subprocess.check_output(
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
chr20 HAVANA gene 57488392 57489027 . - . gene_id "ENSG00000124097.7"; gene_type "processed_pseudogene"; gene_name "HMGB1P1"; level 1; tag "pseudo_consens"; havana_gene "OTTHUMG00000032823.3";
2+
chr20 HAVANA transcript 57488392 57489027 . - . gene_id "ENSG00000124097.7"; transcript_id "ENST00000522557.1"; gene_type "processed_pseudogene"; gene_name "HMGB1P1"; transcript_type "processed_pseudogene"; transcript_name "HMGB1P1-201"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTHUMG00000032823.3"; havana_transcript "OTTHUMT00000079848.3";
3+
chr20 HAVANA exon 57488392 57489027 . - . gene_id "ENSG00000124097.7"; transcript_id "ENST00000522557.1"; gene_type "processed_pseudogene"; gene_name "HMGB1P1"; transcript_type "processed_pseudogene"; transcript_name "HMGB1P1-201"; exon_number 1; exon_id "ENSE00002091063.1"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTHUMG00000032823.3"; havana_transcript "OTTHUMT00000079848.3";
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
test,test,test,input_files/multiexon_read_overlapping_monoexon_transcript/read.sam
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
@HD VN:1.6 SO:coordinate
2+
@SQ SN:chr20 LN:64444167
3+
m54119_190202_095143/62849336/ccs 16 chr20 57487908 60 1997M5298N74M1S * 0 0 GCAAAAACAGTACGATTTATTTTTTTATTTTTTTAATAAAAAAATTGTATTTACTTAGAAGCATTCAGAATGTCAACAAAACAGCTGCAACTTTTTTTTTTTGCAATTACAGAGTGGTATTCAGTTAACAGAACAATTATTTCGTATAAGCTGCATCAGAGACAACTGAAGATGAAAAAACTACCATCCCCATATATAACTAATTTGTGCTGCGCACCAACAAGAACCTGCTTTAAATTTCCATGCCAATTTACAACCCCCATACTCTACCAGGCAAGGTTAGTGACTATTGAAAATACCACCAGGACAGGGCCATCTAAAGACACATTTGGTAGTGTATTAACCATACAAAAAACGACACTGTACAGTTTAAAAACAAATCTTACACAGCCTTACATTTCAATTTTTTTCTTTAAAAGGAGTGAGTTGTGTACAGGGGCGTTAAATGCTTTAAGACAAGAAAAAAAACTGCGCTAGAGCCAACTTATTCATCATCATCATCTTCTTCATCTTCTTCATCTTCCTCCTCCTCATCCTCTTCATCTTCCTCATCTTCCTCCTCTTCCTTCTTTTTCTTGCTTTTTTCAGCTTTGACAACTCCCTTTTTTGCTGCCTCAGGCTTTCCTTTAGCTTGATATGCAGCAATATCCTTTTCGTATTTTTCCTTCAGCTTCGCAGCCTTCTTTTCACCAGGCTGCTTGTCATCTGCAGCAGTGTTATTCCACATCTCTCCCAGTTTCTTCGCAACATCACCAATGGACAGGCCAGGATGTTCTCCTTTGATTTTTGGGTGATACTCAGAGCAGAACAGGAAGAAGGCCGAAGGAGGCCTCTTGGGTGCATTGGGATCCTTGAACTTCTTTTTTGTCTCCCCTTTGGGAGGGATATAGGTTTTCATTTGTCTTTCATAATGGGTCTTGTCCGCCTTTGCCATATCCTCAAATTTTCCTTTCTCTTTAGCAGACATGGTCTTCCACCTCTCTGAGCACTTGTTAGAAAACTCTGAGAAGTTGACTGAAGCATCTGAGTGCTTCTTCTTATGCTCCTCCCGACAAGTTTGCACAAAAAATGCATATGATGACATTTTGCCTCTCGGCTTCTTAGGATCTCCTTTGCCCATGTTTAGTTATTTTTCTTCAGCGAGGCACAGAGTCGCCCAGTGCCCGTACGGCTCTCACTTGCCCCGGCGCTGTCTCTATGGAGCTCAATGTACTGCAGTGGCTAACAGTACGATTTATAATAGCACTCTCCCCAAAGGAAAATCCATGACAAACTTTGATACAGATCCAACATCTGTGCAAGATGTCTACGATAGATACAAAACAATGATGAAAGAAATCAAAGAAGGCCTAAATGATTGGAGGGATACACTCTTCTGTTCACGGATTGGGAGACTCAGTATTGTTAAAGTGTAAAATTCTTTCCAAATTGATCTATACATTCATTCCCCCAAAGATCTCAGCAACATTTTCTTTTTGCATAAATTCACAAGTGGATTTGAAAATTTAGATGGAAAGGCAAAGGATTCAGAATCACCAAAAACAACTTTAGAAAAGAAGAACAAAGCTGGAGGACTTGCATTATCTGAGTTTGAGGCTGACTATACGCTGTCAAGACAGCCTGGTTCTGATGAAGGACTAGCTTGGGAAAGAATGGAGCTACAACACCCACACATGTGGCCAATACTTTTTTTTTTTTTTTTGAGACGAAGTCTCACTGTAGCCCAGGCTGGAGTGCAGTGGTGTGATCTTGGCTCACGACAACCTCCGCCTCCATGGCTCAAGCCATTCTTGTGCCTCAGCCTCTGAGGAGCAGGGACTACAGGTGTGCGCCACCACATCCAGATAATATTTTGTATTTTAGTAGAGATGAGGTTTCACCATGTTGCCCAGCGTGGTCTCAAACTCCTGAGCTCAGGCGATCTGTCCTCCTCGGCTTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGTGCCTGTCAATACATTTTAATACAGCAAGCATTCACGAACTCAGAGCCAACTGGTGTCCTCACTCCCAGGTTTTGCAAAGGTCCCTTCTCTTCCTTGCCC * ms:i:2008 AS:i:1984 nn:i:0 ts:A:+ tp:A:P cm:i:613 s1:i:1944 s2:i:988 de:f:0.0053 rl:i:291 NM:i:0 MD:Z:2071 jM:B:c,2 jI:B:i,57489905,57495202 RG:Z:test
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import pytest
2+
from talon import talon, init_refs
3+
import sqlite3
4+
import pysam
5+
@pytest.mark.integration
6+
7+
class TestMultExonReadOverlapsMonoGene(object):
8+
9+
def test_transcript_assigned_intergenic(self):
10+
""" This test covers a case reported by a user where a read overlaps
11+
the ~600bp mono-exonic pseudogene HMGB1P1. The read itself has
12+
2 exons, the second of which contains the small pseudogene inside.
13+
earlier versions of TALON classified the read as intergenic,
14+
when it was actually supposed to be genomic """
15+
16+
# Set up references
17+
database = "scratch/multiexon_read_overlapping_monoexon_transcript/talon.db"
18+
conn = sqlite3.connect(database)
19+
conn.row_factory = sqlite3.Row
20+
cursor = conn.cursor()
21+
22+
build = "hg38"
23+
talon.get_counters(database)
24+
run_info = talon.init_run_info(database, build)
25+
struct_collection = talon.prepare_data_structures(cursor, run_info)
26+
27+
# Use pysam to get the read from the SAM file
28+
sam_file = "input_files/multiexon_read_overlapping_monoexon_transcript/read.sam"
29+
with pysam.AlignmentFile(sam_file) as sam:
30+
for entry in sam:
31+
sam_record = entry
32+
break
33+
34+
# Get read attributes
35+
chrom = sam_record.reference_name
36+
strand = "-" if sam_record.is_reverse else "+"
37+
sam_start = sam_record.reference_start
38+
sam_end = sam_record.reference_end
39+
40+
# Do we get any overlap with the reference gene?
41+
best_gene, match_strand = talon.search_for_overlap_with_gene(chrom, min(sam_start, sam_end),
42+
max(sam_start, sam_end), strand,
43+
cursor, run_info,
44+
struct_collection.tmp_gene)
45+
assert best_gene == 1
46+
assert match_strand == "-"
47+
48+
annotation_info = talon.annotate_read(sam_record, cursor, run_info,
49+
struct_collection, mode = 0)
50+
51+
assert annotation_info['gene_ID'] == 1
52+
assert annotation_info['transcript_ID'] == 2
53+
assert 'genomic_transcript' in annotation_info['transcript_novelty'][0]
54+

0 commit comments

Comments
 (0)