Skip to content

Commit 8792491

Browse files
committed
#36 - canonical transcripts - add Ensembl_canonical to Ensembl/37
1 parent 233a5d5 commit 8792491

File tree

5 files changed

+58059
-2
lines changed

5 files changed

+58059
-2
lines changed

generate_transcript_data/Snakefile

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ with open(gencode_csv) as _fh:
2222
for _r in _reader:
2323
_gencode_map[_r["Ensembl release"].strip()] = _r["GENCODE release"].strip()
2424

25+
# For manually adding canonical transcripts tags to Ensembl/GRCh37
26+
ensembl_grch37_canonical_transcripts_csv = os.path.join(workflow.basedir, "ensembl_grch37_canonical_transcripts.csv")
27+
2528
# Regex to pull Ensembl release number from a URL (handles release_XX or release-XX)
2629
_re_ensembl_rel = re.compile(r"release[_-](\d+)")
2730

@@ -175,6 +178,11 @@ rule cdot_gff_json:
175178
(f'--gencode-hgnc-metadata "{_gencode_metadata_path(wc)}"')
176179
if (wc.annotation_consortium.lower() == "ensembl" and _gencode_release_from_url(all_urls[wc.name]) is not None)
177180
else ""
181+
),
182+
gene_canonical_transcripts_params=lambda wc: (
183+
(f'--gene-canonical-transcripts-csv "{ensembl_grch37_canonical_transcripts_csv}"')
184+
if (wc.annotation_consortium.lower() == "ensembl" and wc.genome_build == "GRCh37")
185+
else ""
178186
)
179187
shell:
180188
"""
@@ -186,7 +194,8 @@ rule cdot_gff_json:
186194
--genome-build="{wildcards.genome_build}" \
187195
--output "{output}" \
188196
--gene-info-json="{input.gene_info_json}" \
189-
{params.gencode_param}
197+
{params.gencode_param} \
198+
{params.gene_canonical_transcripts_params}
190199
"""
191200

192201

generate_transcript_data/cdot_json.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def _setup_arg_parser():
3838
p.add_argument('--genome-build', required=True, help="'GRCh37' or 'GRCh38'")
3939
p.add_argument('--gene-info-json', required=True, help="'JSON of gene info, produced by cdot_gene_info.py")
4040
p.add_argument('--gencode-hgnc-metadata', required=False, help="GENCODE HGNC metadata for adding HGNC to Ensembl")
41+
p.add_argument('--gene-canonical-transcripts-csv', required=False, help="Manually provide canonical transcripts (for Ensembl GRCh37)")
4142

4243
parser_uta = subparsers.add_parser("uta_to_json", help="Convert UTA to JSON")
4344
parser_uta.add_argument("uta_csv_filename", help="UTA SQL CSV to convert to JSON")
@@ -143,6 +144,36 @@ def add_gencode_hgnc(gencode_hgnc_filename: str, genes, transcripts):
143144
num_gencode_transcripts, num_gtf_transcripts)
144145

145146

147+
def add_canonical_transcripts(gene_canonical_transcripts_csv: str, transcripts):
148+
""" Ensembl GRCh37 GTFs do not contain canonical transcripts info, so manually add.
149+
@see https://github.com/SACGF/cdot/issues/36 and ensembl_grch37_canonical_transcripts py / csv
150+
"""
151+
152+
if gene_canonical_transcripts_csv and transcripts:
153+
gene_canonical_transcripts = {}
154+
with open(gene_canonical_transcripts_csv) as f:
155+
header = f.readline().strip()
156+
if header != "gene_id,canonical_transcript":
157+
raise ValueError(f"file: '{gene_canonical_transcripts_csv}' had unexpected header line: '{header}'")
158+
for line in f.readlines():
159+
gene_id, canonical_transcript = line.strip().split(",")
160+
gene_canonical_transcripts[gene_id] = canonical_transcript
161+
162+
if gene_canonical_transcripts:
163+
for transcript_accession, tdata in transcripts.items():
164+
if gene_version := tdata.get("gene_version"):
165+
if canonical_transcript := gene_canonical_transcripts.get(gene_version):
166+
if transcript_accession == canonical_transcript:
167+
# Add to tag, which is optional comma separated list at this point (made in gff_parser)
168+
if tag := tdata.get("tag"):
169+
tag_list = tag.split(",")
170+
else:
171+
tag_list = []
172+
tag_list.append("Ensembl_canonical")
173+
tdata["tag"] = ",".join(tag_list)
174+
175+
176+
146177
def _gff_arg_check(args):
147178
if args.no_contig_conversion:
148179
logging.warning(f"Skipping chrom/contig conversion. File won't work with Biocommons HGVS")
@@ -157,6 +188,7 @@ def gtf_to_json(args):
157188
genes, transcripts = parser.get_genes_and_transcripts()
158189
refseq_gene_summary_api_retrieval_date = add_gene_info(args.gene_info_json, genes)
159190
add_gencode_hgnc(args.gencode_hgnc_metadata, genes, transcripts)
191+
add_canonical_transcripts(args.gene_canonical_transcripts_csv, transcripts)
160192
write_cdot_json(args.output, genes, transcripts, [args.genome_build],
161193
refseq_gene_summary_api_retrieval_date=refseq_gene_summary_api_retrieval_date)
162194

@@ -170,6 +202,7 @@ def gff3_to_json(args):
170202
genes, transcripts = parser.get_genes_and_transcripts()
171203
refseq_gene_summary_api_retrieval_date = add_gene_info(args.gene_info_json, genes)
172204
add_gencode_hgnc(args.gencode_hgnc_metadata, genes, transcripts)
205+
add_canonical_transcripts(args.gene_canonical_transcripts_csv, transcripts)
173206
write_cdot_json(args.output, genes, transcripts, [args.genome_build],
174207
refseq_gene_summary_api_retrieval_date=refseq_gene_summary_api_retrieval_date)
175208

0 commit comments

Comments
 (0)