@@ -144,7 +144,7 @@ def add_gencode_hgnc(gencode_hgnc_filename: str, genes, transcripts):
144144 num_gencode_transcripts , num_gtf_transcripts )
145145
146146
147- def add_canonical_transcripts (gene_canonical_transcripts_csv : str , transcripts ):
147+ def add_canonical_transcripts (gene_canonical_transcripts_csv : str , genome_build : str , transcripts ):
148148 """ Ensembl GRCh37 GTFs do not contain canonical transcripts info, so manually add.
149149 @see https://github.com/SACGF/cdot/issues/36 and ensembl_grch37_canonical_transcripts py / csv
150150 """
@@ -165,13 +165,13 @@ def add_canonical_transcripts(gene_canonical_transcripts_csv: str, transcripts):
165165 if canonical_transcript := gene_canonical_transcripts .get (gene_version ):
166166 if transcript_accession == canonical_transcript :
167167 # Add to tag, which is optional comma separated list at this point (made in gff_parser)
168- if tag := tdata .get ("tag" ):
169- tag_list = tag . split ( "," )
170- else :
171- tag_list = []
172- tag_list . append ( "Ensembl_canonical" )
173- tdata [ "tag" ] = "," . join ( tag_list )
174-
168+ if build_data := tdata [ "genome_builds" ] .get (genome_build ):
169+ if tag := build_data . get ( "tag" ):
170+ tag_list = tag . split ( "," )
171+ else :
172+ tag_list = []
173+ tag_list . append ( "Ensembl_canonical" )
174+ build_data [ "tag" ] = "," . join ( tag_list )
175175
176176
177177def _gff_arg_check (args ):
@@ -188,7 +188,7 @@ def gtf_to_json(args):
188188 genes , transcripts = parser .get_genes_and_transcripts ()
189189 refseq_gene_summary_api_retrieval_date = add_gene_info (args .gene_info_json , genes )
190190 add_gencode_hgnc (args .gencode_hgnc_metadata , genes , transcripts )
191- add_canonical_transcripts (args .gene_canonical_transcripts_csv , transcripts )
191+ add_canonical_transcripts (args .gene_canonical_transcripts_csv , args . genome_build , transcripts )
192192 write_cdot_json (args .output , genes , transcripts , [args .genome_build ],
193193 refseq_gene_summary_api_retrieval_date = refseq_gene_summary_api_retrieval_date )
194194
@@ -202,7 +202,7 @@ def gff3_to_json(args):
202202 genes , transcripts = parser .get_genes_and_transcripts ()
203203 refseq_gene_summary_api_retrieval_date = add_gene_info (args .gene_info_json , genes )
204204 add_gencode_hgnc (args .gencode_hgnc_metadata , genes , transcripts )
205- add_canonical_transcripts (args .gene_canonical_transcripts_csv , transcripts )
205+ add_canonical_transcripts (args .gene_canonical_transcripts_csv , args . genome_build , transcripts )
206206 write_cdot_json (args .output , genes , transcripts , [args .genome_build ],
207207 refseq_gene_summary_api_retrieval_date = refseq_gene_summary_api_retrieval_date )
208208
0 commit comments