Skip to content

Commit 0b903f9

Browse files
committed
#72 - Correctly handle ncRNA_gene GFF data
1 parent 280a9d6 commit 0b903f9

File tree

5 files changed

+17
-6
lines changed

5 files changed

+17
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
- #60 - Fix for missing protein IDs due to Genbank / GenBank (thanks holtgrewe)
1515
- #64 - Split code/data versions. json.gz are now labelled according to data schema version (thanks holtgrewe)
1616
- Renamed 'CHM13v2.0' to 'T2T-CHM13v2.0' so it could work with biocommons bioutils
17+
- #72 - Correctly handle ncRNA_gene genes (thanks holtgrewe for reporting)
1718

1819
## [0.2.21] - 2023-08-14
1920

generate_transcript_data/gff_parser.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def _create_gene(feature, gene_accession):
112112
description = None
113113

114114
# Non mandatory - Ensembl doesn't have some stuff on some RNAs
115-
if feature.type in {"gene", "pseudogene"}:
115+
if feature.type in {"gene", "pseudogene", "ncRNA_gene"}:
116116
gene_name = feature.attr.get("Name")
117117
description = feature.attr.get("description")
118118
else:
@@ -178,7 +178,7 @@ def _add_transcript_data(self, transcript_accession, transcript, feature):
178178

179179
def _finish_process_features(self):
180180
for transcript_accession, transcript_data in self.transcript_data_by_accession.items():
181-
features_by_type = self.transcript_features_by_type.get(transcript_accession)
181+
features_by_type = self.transcript_features_by_type.get(transcript_accession, {})
182182

183183
# Store coding start/stop transcript positions
184184
# For RefSeq, we need to deal with alignment gaps, so easiest is to convert exons w/o gaps
@@ -194,7 +194,7 @@ def _finish_process_features(self):
194194
exons_stranded_order = self._create_cdna_exons(cdna_matches_stranded_order)
195195

196196
else:
197-
raw_exon_stranded_order = features_by_type["exon"]
197+
raw_exon_stranded_order = features_by_type.get("exon", [])
198198
raw_exon_stranded_order.sort(key=operator.itemgetter(0))
199199
if not forward_strand:
200200
raw_exon_stranded_order.reverse()
@@ -412,7 +412,7 @@ class GFF3Parser(GFFParser):
412412
413413
"""
414414

415-
GFF3_GENES = {"gene", "pseudogene"}
415+
GFF3_GENES = {"gene", "pseudogene", "ncRNA_gene"}
416416
GFF3_TRANSCRIPTS_DATA = {"exon", "CDS", "cDNA_match", "five_prime_UTR", "three_prime_UTR"}
417417

418418
def __init__(self, *args, **kwargs):
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# After 0.2.22 we split version into separate code (pip) and data schema versions
22
# The cdot client will use its own major/minor to determine whether it can read these data files
3-
JSON_SCHEMA_VERSION = "0.2.23"
3+
JSON_SCHEMA_VERSION = "0.2.24"
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
MT insdc ncRNA_gene 8295 8364 . + . ID=gene:ENSG00000210156;Name=MT-TK;biotype=Mt_tRNA;description=mitochondrially encoded tRNA-Lys (AAA/G) [Source:HGNC Symbol%3BAcc:HGNC:7489];gene_id=ENSG00000210156;logic_name=mt_genbank_import_homo_sapiens;version=1
2+
MT insdc tRNA 8295 8364 . + . ID=transcript:ENST00000387421;Parent=gene:ENSG00000210156;Name=MT-TK-201;biotype=Mt_tRNA;tag=basic,Ensembl_canonical;transcript_id=ENST00000387421;transcript_support_level=NA;version=1

tests/test_gff_parsers.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ class Test(unittest.TestCase):
1010
test_data_dir = os.path.join(this_file_dir, "test_data")
1111
ENSEMBL_104_GTF_FILENAME = os.path.join(test_data_dir, "ensembl_test.GRCh38.104.gtf")
1212
ENSEMBL_108_GFF3_FILENAME = os.path.join(test_data_dir, "ensembl_test.GRCh38.108.gff3")
13+
ENSEMBL_110_GFF3_MT_TG_FILENAME = os.path.join(test_data_dir, "ensembl_test.GRCh38.mt_tg.110.gff3")
1314
# Older RefSeq, before Genbank => GenBank changed
1415
REFSEQ_GFF3_FILENAME_2021 = os.path.join(test_data_dir, "refseq_test.GRCh38.p13_genomic.109.20210514.gff")
1516
# Newer RefSeq, after Genbank => GenBank changed
@@ -113,4 +114,11 @@ def test_chrom_contig_conversion(self):
113114
contig = transcript["genome_builds"][genome_build].get("contig")
114115
self.assertEqual(contig, "NC_000001.11")
115116

116-
117+
def test_ncrna_gene(self):
118+
""" We were incorrectly missing ncRNA gene info @see https://github.com/SACGF/cdot/issues/72 """
119+
genome_build = "GRCh38"
120+
parser = GFF3Parser(self.ENSEMBL_110_GFF3_MT_TG_FILENAME, genome_build, self.FAKE_URL)
121+
genes, transcripts = parser.get_genes_and_transcripts()
122+
gene = genes["ENSG00000210156"]
123+
gene_symbol = gene["gene_symbol"]
124+
self.assertEqual(gene_symbol, "MT-TK")

0 commit comments

Comments
 (0)