Skip to content

Commit a7be41b

Browse files
committed
#36 - canonical transcripts - need to add tags to genome build specific data
1 parent 8792491 commit a7be41b

File tree

1 file changed

+10
-10
lines changed

1 file changed

+10
-10
lines changed

generate_transcript_data/cdot_json.py

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -144,7 +144,7 @@ def add_gencode_hgnc(gencode_hgnc_filename: str, genes, transcripts):
144144
num_gencode_transcripts, num_gtf_transcripts)
145145

146146

147-
def add_canonical_transcripts(gene_canonical_transcripts_csv: str, transcripts):
147+
def add_canonical_transcripts(gene_canonical_transcripts_csv: str, genome_build: str, transcripts):
148148
""" Ensembl GRCh37 GTFs do not contain canonical transcripts info, so manually add.
149149
@see https://github.com/SACGF/cdot/issues/36 and ensembl_grch37_canonical_transcripts py / csv
150150
"""
@@ -165,13 +165,13 @@ def add_canonical_transcripts(gene_canonical_transcripts_csv: str, transcripts):
165165
if canonical_transcript := gene_canonical_transcripts.get(gene_version):
166166
if transcript_accession == canonical_transcript:
167167
# Add to tag, which is optional comma separated list at this point (made in gff_parser)
168-
if tag := tdata.get("tag"):
169-
tag_list = tag.split(",")
170-
else:
171-
tag_list = []
172-
tag_list.append("Ensembl_canonical")
173-
tdata["tag"] = ",".join(tag_list)
174-
168+
if build_data := tdata["genome_builds"].get(genome_build):
169+
if tag := build_data.get("tag"):
170+
tag_list = tag.split(",")
171+
else:
172+
tag_list = []
173+
tag_list.append("Ensembl_canonical")
174+
build_data["tag"] = ",".join(tag_list)
175175

176176

177177
def _gff_arg_check(args):
@@ -188,7 +188,7 @@ def gtf_to_json(args):
188188
genes, transcripts = parser.get_genes_and_transcripts()
189189
refseq_gene_summary_api_retrieval_date = add_gene_info(args.gene_info_json, genes)
190190
add_gencode_hgnc(args.gencode_hgnc_metadata, genes, transcripts)
191-
add_canonical_transcripts(args.gene_canonical_transcripts_csv, transcripts)
191+
add_canonical_transcripts(args.gene_canonical_transcripts_csv, args.genome_build, transcripts)
192192
write_cdot_json(args.output, genes, transcripts, [args.genome_build],
193193
refseq_gene_summary_api_retrieval_date=refseq_gene_summary_api_retrieval_date)
194194

@@ -202,7 +202,7 @@ def gff3_to_json(args):
202202
genes, transcripts = parser.get_genes_and_transcripts()
203203
refseq_gene_summary_api_retrieval_date = add_gene_info(args.gene_info_json, genes)
204204
add_gencode_hgnc(args.gencode_hgnc_metadata, genes, transcripts)
205-
add_canonical_transcripts(args.gene_canonical_transcripts_csv, transcripts)
205+
add_canonical_transcripts(args.gene_canonical_transcripts_csv, args.genome_build, transcripts)
206206
write_cdot_json(args.output, genes, transcripts, [args.genome_build],
207207
refseq_gene_summary_api_retrieval_date=refseq_gene_summary_api_retrieval_date)
208208

0 commit comments

Comments
 (0)