Skip to content

Commit ba5c68c

Browse files
committed
#99 - Start storing GFF/GTF source
1 parent 934f5ca commit ba5c68c

File tree

2 files changed

+17
-3
lines changed

2 files changed

+17
-3
lines changed

generate_transcript_data/gff_parser.py

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -117,9 +117,14 @@ def _create_gene(feature, gene_accession):
117117
if biotype:
118118
biotype_set.add(biotype)
119119

120+
source_set = set()
121+
if feature.source:
122+
source_set.add(feature.source)
123+
120124
return {
121125
"gene_symbol": gene_name,
122126
"biotype": biotype_set,
127+
"source": source_set,
123128
"id": gene_accession,
124129
"description": description
125130
}
@@ -132,6 +137,7 @@ def _create_transcript(feature, transcript_accession, gene_data):
132137
"gene_version": gene_data.get("id"),
133138
"exons": [],
134139
"biotype": set(),
140+
"source": set(),
135141
CONTIG: feature.iv.chrom,
136142
STRAND: feature.iv.strand,
137143
}
@@ -169,7 +175,6 @@ def _add_transcript_data(self, transcript_accession, transcript, feature):
169175
if note := feature.attr.get("Note"):
170176
transcript["note"] = note
171177

172-
173178
def _finish_process_features(self):
174179
for transcript_accession, transcript_data in self.transcript_data_by_accession.items():
175180
features_by_type = self.transcript_features_by_type.get(transcript_accession, {})
@@ -458,6 +463,10 @@ def handle_feature(self, feature):
458463
gene_data["biotype"].add(biotype)
459464
transcript["biotype"].add(biotype)
460465

466+
if feature.source:
467+
gene_data["source"].add(feature.source)
468+
transcript["source"].add(feature.source)
469+
461470
self._handle_protein_version(transcript_accession, feature)
462471
self._add_tags_to_transcript_data(transcript, feature)
463472

@@ -511,6 +520,9 @@ def handle_feature(self, feature):
511520
if m := self.hgnc_pattern.match(description):
512521
gene_data["hgnc"] = m.group(2)
513522

523+
if feature.source:
524+
gene_data["source"].add(feature.source)
525+
514526
self.gene_accession_by_feature_id[feature.attr["ID"]] = gene_accession
515527
else:
516528
transcript_accession = None
@@ -564,6 +576,8 @@ def handle_feature(self, feature):
564576
elif feature.type not in EXCLUDE_BIOTYPES:
565577
transcript["biotype"].add(feature.type)
566578

579+
if feature.source:
580+
transcript["source"].add(feature.source)
567581

568582

569583
@staticmethod

generate_transcript_data/json_schema_version.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,5 +6,5 @@
66
# 0.2.29 - Ensembl now has HGNC added from outside GTFs
77
# 0.2.30 - Ensembl GRCh37 has canonical transcripts added from outside GTFs
88
# 0.2.31 - Add 'metadata' - method/urls
9-
10-
JSON_SCHEMA_VERSION = "0.2.31"
9+
# 0.2.32 - Add 'source' (GTF column #2) to build data
10+
JSON_SCHEMA_VERSION = "0.2.32"

0 commit comments

Comments
 (0)