@@ -38,6 +38,7 @@ def _setup_arg_parser():
3838 p .add_argument ('--genome-build' , required = True , help = "'GRCh37' or 'GRCh38'" )
3939 p .add_argument ('--gene-info-json' , required = True , help = "'JSON of gene info, produced by cdot_gene_info.py" )
4040 p .add_argument ('--gencode-hgnc-metadata' , required = False , help = "GENCODE HGNC metadata for adding HGNC to Ensembl" )
41+ p .add_argument ('--gene-canonical-transcripts-csv' , required = False , help = "Manually provide canonical transcripts (for Ensembl GRCh37)" )
4142
4243 parser_uta = subparsers .add_parser ("uta_to_json" , help = "Convert UTA to JSON" )
4344 parser_uta .add_argument ("uta_csv_filename" , help = "UTA SQL CSV to convert to JSON" )
@@ -143,6 +144,36 @@ def add_gencode_hgnc(gencode_hgnc_filename: str, genes, transcripts):
143144 num_gencode_transcripts , num_gtf_transcripts )
144145
145146
def add_canonical_transcripts(gene_canonical_transcripts_csv: str, transcripts):
    """ Ensembl GRCh37 GTFs do not contain canonical transcripts info, so manually add.
        @see https://github.com/SACGF/cdot/issues/36 and ensembl_grch37_canonical_transcripts py / csv

        The CSV must begin with the header line "gene_id,canonical_transcript".
        Each following row maps a gene ID (compared against each transcript's
        "gene_version" value) to the accession of its canonical transcript.
        Blank lines are ignored.

        Matching transcripts get "Ensembl_canonical" added to their
        comma-separated "tag" string. Transcript dicts are modified in place;
        a transcript that already carries the tag is left unchanged, so calling
        this more than once is harmless.

        Does nothing when either argument is empty/None.

        :param gene_canonical_transcripts_csv: path to the CSV file (may be None)
        :param transcripts: dict of transcript accession -> transcript data dict
        :raises ValueError: on an unexpected header, or a row that does not have
                            exactly 2 comma-separated columns
    """
    canonical_tag = "Ensembl_canonical"

    if not (gene_canonical_transcripts_csv and transcripts):
        return

    gene_canonical_transcripts = {}
    with open(gene_canonical_transcripts_csv) as f:
        header = f.readline().strip()
        if header != "gene_id,canonical_transcript":
            raise ValueError(f"file: '{gene_canonical_transcripts_csv}' had unexpected header line: '{header}'")

        # Line numbers start at 2 as the header was line 1
        for line_number, line in enumerate(f, start=2):
            line = line.strip()
            if not line:
                continue  # Tolerate blank lines (eg trailing newline at end of file)
            columns = line.split(",")
            if len(columns) != 2:
                raise ValueError(f"file: '{gene_canonical_transcripts_csv}' line {line_number}: "
                                 f"expected 2 columns, got {len(columns)}: '{line}'")
            gene_id, canonical_transcript = columns
            gene_canonical_transcripts[gene_id] = canonical_transcript

    if not gene_canonical_transcripts:
        return

    for transcript_accession, tdata in transcripts.items():
        gene_version = tdata.get("gene_version")
        if not gene_version:
            continue
        if gene_canonical_transcripts.get(gene_version) != transcript_accession:
            continue

        # Add to tag, which is optional comma separated list at this point (made in gff_parser)
        tag = tdata.get("tag")
        tag_list = tag.split(",") if tag else []
        if canonical_tag not in tag_list:  # Don't duplicate if already tagged
            tag_list.append(canonical_tag)
            tdata["tag"] = ",".join(tag_list)
175+
176+
146177def _gff_arg_check (args ):
147178 if args .no_contig_conversion :
148179 logging .warning (f"Skipping chrom/contig conversion. File won't work with Biocommons HGVS" )
@@ -157,6 +188,7 @@ def gtf_to_json(args):
157188 genes , transcripts = parser .get_genes_and_transcripts ()
158189 refseq_gene_summary_api_retrieval_date = add_gene_info (args .gene_info_json , genes )
159190 add_gencode_hgnc (args .gencode_hgnc_metadata , genes , transcripts )
191+ add_canonical_transcripts (args .gene_canonical_transcripts_csv , transcripts )
160192 write_cdot_json (args .output , genes , transcripts , [args .genome_build ],
161193 refseq_gene_summary_api_retrieval_date = refseq_gene_summary_api_retrieval_date )
162194
@@ -170,6 +202,7 @@ def gff3_to_json(args):
170202 genes , transcripts = parser .get_genes_and_transcripts ()
171203 refseq_gene_summary_api_retrieval_date = add_gene_info (args .gene_info_json , genes )
172204 add_gencode_hgnc (args .gencode_hgnc_metadata , genes , transcripts )
205+ add_canonical_transcripts (args .gene_canonical_transcripts_csv , transcripts )
173206 write_cdot_json (args .output , genes , transcripts , [args .genome_build ],
174207 refseq_gene_summary_api_retrieval_date = refseq_gene_summary_api_retrieval_date )
175208
0 commit comments