Skip to content

Commit 08211e2

Browse files
committed
#98 - Provide methods in JSON, and build for release
1 parent e04e93d commit 08211e2

File tree

3 files changed

+139
-79
lines changed

3 files changed

+139
-79
lines changed

generate_transcript_data/cdot_json.py

Lines changed: 94 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import ijson
55
import json
66
import logging
7+
import os
78
import re
89
import sys
910
from argparse import ArgumentParser
@@ -59,6 +60,10 @@ def _setup_arg_parser():
5960
parser_builds.add_argument('--grch38', required=True, help='cdot JSON.gz for GRCh38')
6061
parser_builds.add_argument('--t2t_chm13v2', required=True, help='cdot JSON.gz for t2t_chm13v2')
6162

63+
parser_release_notes = subparsers.add_parser("release_notes", help="JSON file to retrieve release notes")
64+
parser_release_notes.add_argument("json_filename", help="cdot JSON file")
65+
parser_release_notes.add_argument("--show-urls", action='store_true', default=False)
66+
6267
# I want this to be subcommands rather than global (would need to be listed before subcommand)
6368
for p in [parser_gtf, parser_gff3, parser_uta, parser_historical, parser_builds]:
6469
p.add_argument('--output', required=True, help='Output filename')
@@ -189,7 +194,9 @@ def gtf_to_json(args):
189194
refseq_gene_summary_api_retrieval_date = add_gene_info(args.gene_info_json, genes)
190195
add_gencode_hgnc(args.gencode_hgnc_metadata, genes, transcripts)
191196
add_canonical_transcripts(args.gene_canonical_transcripts_csv, args.genome_build, transcripts)
192-
write_cdot_json(args.output, genes, transcripts, [args.genome_build],
197+
method = "Single GTF file"
198+
write_cdot_json(args.output, method, [args.gtf_filename],
199+
genes, transcripts, [args.genome_build],
193200
refseq_gene_summary_api_retrieval_date=refseq_gene_summary_api_retrieval_date)
194201

195202

@@ -203,7 +210,9 @@ def gff3_to_json(args):
203210
refseq_gene_summary_api_retrieval_date = add_gene_info(args.gene_info_json, genes)
204211
add_gencode_hgnc(args.gencode_hgnc_metadata, genes, transcripts)
205212
add_canonical_transcripts(args.gene_canonical_transcripts_csv, args.genome_build, transcripts)
206-
write_cdot_json(args.output, genes, transcripts, [args.genome_build],
213+
method = "Single GFF3 file"
214+
write_cdot_json(args.output, method, [args.gff3_filename],
215+
genes, transcripts, [args.genome_build],
207216
refseq_gene_summary_api_retrieval_date=refseq_gene_summary_api_retrieval_date)
208217

209218

@@ -284,7 +293,9 @@ def uta_to_json(args):
284293
transcripts_by_id[transcript_accession] = transcript_data
285294

286295
print("Writing UTA to cdot JSON.gz")
287-
write_cdot_json(args.output, genes_by_id, transcripts_by_id, [args.genome_build])
296+
method = "Conversion of Universal Transcript Archive dump"
297+
write_cdot_json(args.output, method, [args.uta_csv_filename],
298+
genes_by_id, transcripts_by_id, [args.genome_build])
288299

289300

290301
def _convert_uta_exons(exon_starts, exon_ends, cigars):
@@ -356,9 +367,11 @@ def _cigar_to_gap_and_length(cigar):
356367
return gap, exon_length
357368

358369

359-
def write_cdot_json(filename, genes, transcript_versions, genome_builds, refseq_gene_summary_api_retrieval_date=None):
370+
def write_cdot_json(filename: str, method: str, input_files, genes, transcript_versions, genome_builds,
371+
refseq_gene_summary_api_retrieval_date=None):
360372
print(f"Writing cdot file: '{filename}'")
361373
data = {
374+
# We also write these in metadata now, but keep for legacy compatibility
362375
"cdot_version": JSON_SCHEMA_VERSION,
363376
"genome_builds": genome_builds,
364377
"transcripts": transcript_versions,
@@ -368,6 +381,20 @@ def write_cdot_json(filename, genes, transcript_versions, genome_builds, refseq_
368381
if refseq_gene_summary_api_retrieval_date:
369382
data["refseq_gene_summary_api_retrieval_date"] = refseq_gene_summary_api_retrieval_date
370383

384+
url_counts = Counter()
385+
for tv in transcript_versions.values():
386+
for build_coordinates in tv["genome_builds"].values():
387+
url_counts[build_coordinates["url"]] += 1
388+
389+
data["metadata"] = {
390+
"method": method,
391+
"input_files": input_files,
392+
"sys.argv": " ".join(sys.argv),
393+
"url_counts": dict(url_counts.most_common()),
394+
"cdot_version": JSON_SCHEMA_VERSION,
395+
"genome_builds": genome_builds
396+
}
397+
371398
with gzip.open(filename, 'wt') as outfile:
372399
json.dump(data, outfile, cls=SortedSetEncoder, sort_keys=True) # Sort so diffs work
373400

@@ -405,76 +432,87 @@ def merge_historical(args):
405432
historical_transcript_version["gene_version"] = gene_accession
406433
transcript_versions[transcript_accession] = historical_transcript_version
407434

408-
genes = {} # Only keep those that are used in transcript versions
409-
# Summarise where it's from
410-
transcript_urls = Counter()
435+
genes = {} # Only keep those that are used in kept transcript versions
411436
for tv in transcript_versions.values():
412437
if not args.no_genes:
413438
if gene_accession := tv.get("gene_version"):
414439
genes[gene_accession] = gene_versions[gene_accession]
415440

416-
for build_coordinates in tv["genome_builds"].values():
417-
transcript_urls[build_coordinates["url"]] += 1
418-
419-
total = sum(transcript_urls.values())
420-
print(f"{total} transcript versions from:")
421-
for url, count in transcript_urls.most_common():
422-
print(f"{url}: {count} ({count*100 / total:.1f}%)")
423-
424-
write_cdot_json(args.output, genes, transcript_versions, [args.genome_build])
441+
method = "Merge historical"
442+
write_cdot_json(args.output, method, args.json_filenames,
443+
genes, transcript_versions, [args.genome_build])
425444

426445

427446
def combine_builds(args):
428447
print("combine_builds")
429-
genome_build_file = {
430-
"GRCh37": gzip.open(args.grch37),
431-
"GRCh38": gzip.open(args.grch38),
432-
"T2T-CHM13v2.0": gzip.open(args.t2t_chm13v2),
448+
genome_build_filename = {
449+
"GRCh37": args.grch37,
450+
"GRCh38": args.grch38,
451+
"T2T-CHM13v2.0": args.t2t_chm13v2,
433452
}
434-
435453
urls_different_coding = defaultdict(list)
436454
genes = {}
437455
transcript_versions = {}
438-
for genome_build, f in genome_build_file.items():
439-
# TODO: Check cdot versions
440-
json_builds = next(ijson.items(f, "genome_builds"))
441-
if json_builds != [genome_build]:
442-
raise ValueError(f"JSON file provided for {genome_build} needs to have only {genome_build} data (has {json_builds})")
443-
444-
f.seek(0) # Reset for next ijson call
445-
for transcript_id, build_transcript in ijson.kvitems(f, "transcripts"):
446-
genome_builds = {}
447-
existing_transcript = transcript_versions.get(transcript_id)
448-
if existing_transcript:
449-
genome_builds = existing_transcript["genome_builds"]
450-
# Latest always used, but check existing - if codons are different old versions are wrong so remove
451-
for field in ["start_codon", "stop_codon"]:
452-
old = existing_transcript.get(field)
453-
new = build_transcript.get(field)
454-
if old != new: # Old relied on different codons so is obsolete
455-
for build_coordinates in genome_builds.values():
456-
url = build_coordinates["url"]
457-
urls_different_coding[url].append(transcript_id)
458-
genome_builds = {}
459-
460-
genome_builds[genome_build] = build_transcript["genome_builds"][genome_build]
461-
# Use latest (with merged genome builds)
462-
build_transcript["genome_builds"] = genome_builds
463-
transcript_versions[transcript_id] = build_transcript
464-
465-
f.seek(0) # Reset for next ijson call
466-
for gene_id, gene_data in ijson.kvitems(f, "genes"):
467-
genes[gene_id] = gene_data
468-
469-
f.close()
470-
471-
write_cdot_json(args.output, genes, transcript_versions, list(genome_build_file.keys()))
456+
for genome_build, filename in genome_build_filename.items():
457+
with gzip.open(filename) as f:
458+
# TODO: Check cdot versions
459+
json_builds = next(ijson.items(f, "genome_builds"))
460+
if json_builds != [genome_build]:
461+
raise ValueError(f"JSON file provided for {genome_build} needs to have only {genome_build} "
462+
f"data (has {json_builds})")
463+
464+
f.seek(0) # Reset for next ijson call
465+
for transcript_id, build_transcript in ijson.kvitems(f, "transcripts"):
466+
genome_builds = {}
467+
existing_transcript = transcript_versions.get(transcript_id)
468+
if existing_transcript:
469+
genome_builds = existing_transcript["genome_builds"]
470+
# Latest always used, but check existing - if codons are different old versions are wrong so remove
471+
for field in ["start_codon", "stop_codon"]:
472+
old = existing_transcript.get(field)
473+
new = build_transcript.get(field)
474+
if old != new: # Old relied on different codons so is obsolete
475+
for build_coordinates in genome_builds.values():
476+
url = build_coordinates["url"]
477+
urls_different_coding[url].append(transcript_id)
478+
genome_builds = {}
479+
480+
genome_builds[genome_build] = build_transcript["genome_builds"][genome_build]
481+
# Use latest (with merged genome builds)
482+
build_transcript["genome_builds"] = genome_builds
483+
transcript_versions[transcript_id] = build_transcript
484+
485+
f.seek(0) # Reset for next ijson call
486+
for gene_id, gene_data in ijson.kvitems(f, "genes"):
487+
genes[gene_id] = gene_data
488+
489+
method = "Combine multiple genome builds"
490+
write_cdot_json(args.output, method, list(genome_build_filename.values()),
491+
genes, transcript_versions, list(genome_build_filename.keys()))
472492

473493
if urls_different_coding:
474494
print("Some transcripts were removed as they had different coding coordinates from latest")
475495
for url, transcript_ids in urls_different_coding.items():
476496
print(f"{url}: {','.join(transcript_ids)}")
477497

498+
def release_notes(args):
499+
with gzip.open(args.json_filename) as f:
500+
metadata = next(ijson.items(f, "metadata"))
501+
if metadata is None:
502+
raise ValueError("No metadata in JSON (requires schema version >=0.2.31)")
503+
print(f"### {os.path.basename(args.json_filename)}")
504+
print(f"Method: {metadata['method']}")
505+
print("Input files:")
506+
for input_file in metadata["input_files"]:
507+
print(f"- {input_file}")
508+
509+
if args.show_urls:
510+
print("Urls:")
511+
# Put in descending order
512+
url_counts = Counter(metadata["url_counts"])
513+
for url, count in url_counts.most_common():
514+
print(f"- {url}: {count}")
515+
478516

479517
def main():
480518
parser = _setup_arg_parser()
@@ -493,6 +531,7 @@ def main():
493531
"merge_historical": merge_historical,
494532
"combine_builds": combine_builds,
495533
"uta_to_json": uta_to_json,
534+
"release_notes": release_notes,
496535
}
497536
subcommands[args.subcommand](args)
498537

generate_transcript_data/github_release_upload.sh

Lines changed: 43 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,28 +14,48 @@ BASE_DIR=$(dirname ${FULL_PATH_TO_SCRIPT})
1414
export PYTHONPATH=${BASE_DIR}/..
1515

1616
CDOT_DATA_VERSION=$(${BASE_DIR}/cdot_json.py --version)
17-
1817
CDOT_RELEASE_NAME=data_v${CDOT_DATA_VERSION}
19-
echo "For the rest of the script to work, it assumes you have tagged + pushed a data release of ${CDOT_DATA_VERSION}"
20-
echo "then run: gh release create ${CDOT_RELEASE_NAME} --title=${CDOT_RELEASE_NAME} --notes 'release notes...'"
21-
22-
gh release upload ${CDOT_RELEASE_NAME} \
23-
${CDOT_DATA_DIR}/ensembl/cdot-${CDOT_DATA_VERSION}.all-builds-ensembl-grch37_grch38_t2t-chm13v2.0.json.gz \
24-
${CDOT_DATA_DIR}/ensembl/cdot-${CDOT_DATA_VERSION}.ensembl.GRCh37.json.gz \
25-
${CDOT_DATA_DIR}/ensembl/GRCh37/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh37_Ensembl_87.gtf.json.gz \
26-
${CDOT_DATA_DIR}/ensembl/cdot-${CDOT_DATA_VERSION}.ensembl.GRCh38.json.gz \
27-
${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_Ensembl_110.gtf.json.gz \
28-
${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_Ensembl_111.gtf.json.gz \
29-
${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_Ensembl_112.gtf.json.gz \
30-
${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_Ensembl_113.gtf.json.gz \
31-
${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_Ensembl_114.gtf.json.gz \
32-
${CDOT_DATA_DIR}/ensembl/cdot-${CDOT_DATA_VERSION}.ensembl.T2T-CHM13v2.0.json.gz \
33-
${CDOT_DATA_DIR}/refseq/cdot-${CDOT_DATA_VERSION}.all-builds-refseq-grch37_grch38_t2t-chm13v2.0.json.gz \
34-
${CDOT_DATA_DIR}/refseq/cdot-${CDOT_DATA_VERSION}.refseq.GRCh37.json.gz \
35-
${CDOT_DATA_DIR}/refseq/GRCh37/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh37_RefSeq_105.20201022.gff.json.gz \
36-
${CDOT_DATA_DIR}/refseq/GRCh37/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh37_RefSeq_105.20220307.gff.json.gz \
37-
${CDOT_DATA_DIR}/refseq/cdot-${CDOT_DATA_VERSION}.refseq.GRCh38.json.gz \
38-
${CDOT_DATA_DIR}/refseq/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_RefSeq_110.gff.json.gz \
39-
${CDOT_DATA_DIR}/refseq/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_RefSeq_RS_2023_10.gff.json.gz \
40-
${CDOT_DATA_DIR}/refseq/cdot-${CDOT_DATA_VERSION}.refseq.T2T-CHM13v2.0.json.gz
4118

19+
files=(
20+
"${CDOT_DATA_DIR}/ensembl/cdot-${CDOT_DATA_VERSION}.all-builds-ensembl-grch37_grch38_t2t-chm13v2.0.json.gz"
"${CDOT_DATA_DIR}/ensembl/cdot-${CDOT_DATA_VERSION}.ensembl.GRCh37.json.gz"
21+
"${CDOT_DATA_DIR}/ensembl/GRCh37/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh37_Ensembl_87.gtf.json.gz"
22+
"${CDOT_DATA_DIR}/ensembl/cdot-${CDOT_DATA_VERSION}.ensembl.GRCh38.json.gz"
23+
"${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_Ensembl_110.gtf.json.gz"
24+
"${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_Ensembl_111.gtf.json.gz"
25+
"${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_Ensembl_112.gtf.json.gz"
26+
"${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_Ensembl_113.gtf.json.gz"
27+
"${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_Ensembl_114.gtf.json.gz"
28+
"${CDOT_DATA_DIR}/ensembl/cdot-${CDOT_DATA_VERSION}.ensembl.T2T-CHM13v2.0.json.gz"
29+
"${CDOT_DATA_DIR}/refseq/cdot-${CDOT_DATA_VERSION}.all-builds-refseq-grch37_grch38_t2t-chm13v2.0.json.gz"
30+
"${CDOT_DATA_DIR}/refseq/cdot-${CDOT_DATA_VERSION}.refseq.GRCh37.json.gz"
31+
"${CDOT_DATA_DIR}/refseq/GRCh37/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh37_RefSeq_105.20201022.gff.json.gz"
32+
"${CDOT_DATA_DIR}/refseq/GRCh37/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh37_RefSeq_105.20220307.gff.json.gz"
33+
"${CDOT_DATA_DIR}/refseq/cdot-${CDOT_DATA_VERSION}.refseq.GRCh38.json.gz"
34+
"${CDOT_DATA_DIR}/refseq/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_RefSeq_110.gff.json.gz"
35+
"${CDOT_DATA_DIR}/refseq/GRCh38/cdot-${CDOT_DATA_VERSION}.Homo_sapiens_GRCh38_RefSeq_RS_2023_10.gff.json.gz"
36+
"${CDOT_DATA_DIR}/refseq/cdot-${CDOT_DATA_VERSION}.refseq.T2T-CHM13v2.0.json.gz"
37+
)
38+
39+
git fetch --tags
40+
if ! git tag -l "${CDOT_RELEASE_NAME}" | grep -q .; then
41+
echo "Git repo has no tag of '${CDOT_RELEASE_NAME}'"
42+
exit 1
43+
fi
44+
45+
46+
if gh release view ${CDOT_RELEASE_NAME}; then
47+
echo "Release ${CDOT_RELEASE_NAME} exists"
48+
else
49+
echo "Creating release ${CDOT_RELEASE_NAME}"
50+
echo "Generating notes..."
51+
RELEASE_NOTES_FILENAME="/tmp/${CDOT_RELEASE_NAME}.txt"
52+
echo > ${RELEASE_NOTES_FILENAME} # Clear
53+
for f in "${files[@]}"; do
54+
${BASE_DIR}/cdot_json.py release_notes "$f" >> ${RELEASE_NOTES_FILENAME}
55+
done
56+
RELEASE_NOTES=$(cat ${RELEASE_NOTES_FILENAME})
57+
echo "Creating on GitHub"
58+
gh release create ${CDOT_RELEASE_NAME} --title=${CDOT_RELEASE_NAME} --notes "${RELEASE_NOTES}"
59+
fi
60+
61+
gh release upload "${CDOT_RELEASE_NAME}" "${files[@]}"

generate_transcript_data/json_schema_version.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@
55

66
# 0.2.29 - Ensembl now has HGNC added from outside GTFs
77
# 0.2.30 - Ensembl GRCh37 has canonical transcripts added from outside GTFs
8+
# 0.2.31 - Add 'metadata' - method/urls
89

9-
JSON_SCHEMA_VERSION = "0.2.30"
10+
JSON_SCHEMA_VERSION = "0.2.31"

0 commit comments

Comments
 (0)