44import ijson
55import json
66import logging
7+ import os
78import re
89import sys
910from argparse import ArgumentParser
@@ -59,6 +60,10 @@ def _setup_arg_parser():
5960 parser_builds .add_argument ('--grch38' , required = True , help = 'cdot JSON.gz for GRCh38' )
6061 parser_builds .add_argument ('--t2t_chm13v2' , required = True , help = 'cdot JSON.gz for t2t_chm13v2' )
6162
63+ parser_release_notes = subparsers .add_parser ("release_notes" , help = "JSON file to retrieve release notes" )
64+ parser_release_notes .add_argument ("json_filename" , help = "cdot JSON file" )
65+ parser_release_notes .add_argument ("--show-urls" , action = 'store_true' , default = False )
66+
6267 # I want this to be subcommands rather than global (would need to be listed before subcommand)
6368 for p in [parser_gtf , parser_gff3 , parser_uta , parser_historical , parser_builds ]:
6469 p .add_argument ('--output' , required = True , help = 'Output filename' )
@@ -189,7 +194,9 @@ def gtf_to_json(args):
189194 refseq_gene_summary_api_retrieval_date = add_gene_info (args .gene_info_json , genes )
190195 add_gencode_hgnc (args .gencode_hgnc_metadata , genes , transcripts )
191196 add_canonical_transcripts (args .gene_canonical_transcripts_csv , args .genome_build , transcripts )
192- write_cdot_json (args .output , genes , transcripts , [args .genome_build ],
197+ method = "Single GFF3 file"
198+ write_cdot_json (args .output , method , [args .gtf_filename ],
199+ genes , transcripts , [args .genome_build ],
193200 refseq_gene_summary_api_retrieval_date = refseq_gene_summary_api_retrieval_date )
194201
195202
@@ -203,7 +210,9 @@ def gff3_to_json(args):
203210 refseq_gene_summary_api_retrieval_date = add_gene_info (args .gene_info_json , genes )
204211 add_gencode_hgnc (args .gencode_hgnc_metadata , genes , transcripts )
205212 add_canonical_transcripts (args .gene_canonical_transcripts_csv , args .genome_build , transcripts )
206- write_cdot_json (args .output , genes , transcripts , [args .genome_build ],
213+ method = "Single GFF3 file"
214+ write_cdot_json (args .output , method , [args .gff3_filename ],
215+ genes , transcripts , [args .genome_build ],
207216 refseq_gene_summary_api_retrieval_date = refseq_gene_summary_api_retrieval_date )
208217
209218
@@ -284,7 +293,9 @@ def uta_to_json(args):
284293 transcripts_by_id [transcript_accession ] = transcript_data
285294
286295 print ("Writing UTA to cdot JSON.gz" )
287- write_cdot_json (args .output , genes_by_id , transcripts_by_id , [args .genome_build ])
296+ method = "Conversion of Universal Transcript Archive dump"
297+ write_cdot_json (args .output , method , [args .uta_csv_filename ],
298+ genes_by_id , transcripts_by_id , [args .genome_build ])
288299
289300
290301def _convert_uta_exons (exon_starts , exon_ends , cigars ):
@@ -356,9 +367,11 @@ def _cigar_to_gap_and_length(cigar):
356367 return gap , exon_length
357368
358369
359- def write_cdot_json (filename , genes , transcript_versions , genome_builds , refseq_gene_summary_api_retrieval_date = None ):
370+ def write_cdot_json (filename : str , method : str , input_files , genes , transcript_versions , genome_builds ,
371+ refseq_gene_summary_api_retrieval_date = None ):
360372 print (f"Writing cdot file: '{ filename } '" )
361373 data = {
374+ # We also write these in metadata now, but keep for legacy compatibility
362375 "cdot_version" : JSON_SCHEMA_VERSION ,
363376 "genome_builds" : genome_builds ,
364377 "transcripts" : transcript_versions ,
@@ -368,6 +381,20 @@ def write_cdot_json(filename, genes, transcript_versions, genome_builds, refseq_
368381 if refseq_gene_summary_api_retrieval_date :
369382 data ["refseq_gene_summary_api_retrieval_date" ] = refseq_gene_summary_api_retrieval_date
370383
384+ url_counts = Counter ()
385+ for tv in transcript_versions .values ():
386+ for build_coordinates in tv ["genome_builds" ].values ():
387+ url_counts [build_coordinates ["url" ]] += 1
388+
389+ data ["metadata" ] = {
390+ "method" : method ,
391+ "input_files" : input_files ,
392+ "sys.argv" : " " .join (sys .argv ),
393+ "url_counts" : dict (url_counts .most_common ()),
394+ "cdot_version" : JSON_SCHEMA_VERSION ,
395+ "genome_builds" : genome_builds
396+ }
397+
371398 with gzip .open (filename , 'wt' ) as outfile :
372399 json .dump (data , outfile , cls = SortedSetEncoder , sort_keys = True ) # Sort so diffs work
373400
@@ -405,76 +432,87 @@ def merge_historical(args):
405432 historical_transcript_version ["gene_version" ] = gene_accession
406433 transcript_versions [transcript_accession ] = historical_transcript_version
407434
408- genes = {} # Only keep those that are used in transcript versions
409- # Summarise where it's from
410- transcript_urls = Counter ()
435+ genes = {} # Only keep those that are used in kept transcript versions
411436 for tv in transcript_versions .values ():
412437 if not args .no_genes :
413438 if gene_accession := tv .get ("gene_version" ):
414439 genes [gene_accession ] = gene_versions [gene_accession ]
415440
416- for build_coordinates in tv ["genome_builds" ].values ():
417- transcript_urls [build_coordinates ["url" ]] += 1
418-
419- total = sum (transcript_urls .values ())
420- print (f"{ total } transcript versions from:" )
421- for url , count in transcript_urls .most_common ():
422- print (f"{ url } : { count } ({ count * 100 / total :.1f} %)" )
423-
424- write_cdot_json (args .output , genes , transcript_versions , [args .genome_build ])
441+ method = "Merge historical"
442+ write_cdot_json (args .output , method , args .json_filenames ,
443+ genes , transcript_versions , [args .genome_build ])
425444
426445
def combine_builds(args):
    """Merge per-genome-build cdot JSON.gz files (GRCh37/GRCh38/T2T-CHM13v2.0) into one file.

    For each transcript the most recently read record wins, with per-build coordinates
    merged across files. If a later file disagrees on start/stop codons, the previously
    merged build coordinates are discarded (they were computed against obsolete codons)
    and the affected source URLs are reported at the end.
    """
    print("combine_builds")
    genome_build_filename = {
        "GRCh37": args.grch37,
        "GRCh38": args.grch38,
        "T2T-CHM13v2.0": args.t2t_chm13v2,
    }
    urls_different_coding = defaultdict(list)  # url -> [transcript_id] dropped due to codon mismatch
    genes = {}
    transcript_versions = {}
    for genome_build, filename in genome_build_filename.items():
        with gzip.open(filename) as f:
            # TODO: Check cdot versions
            # next() with a default so a file lacking "genome_builds" gives a clear
            # error instead of an unhandled StopIteration.
            json_builds = next(ijson.items(f, "genome_builds"), None)
            if json_builds is None:
                raise ValueError(f"JSON file '{filename}' has no 'genome_builds' entry")
            if json_builds != [genome_build]:
                raise ValueError(f"JSON file provided for {genome_build} needs to have only {genome_build} "
                                 f"data (has {json_builds})")

            f.seek(0)  # Reset for next ijson call
            for transcript_id, build_transcript in ijson.kvitems(f, "transcripts"):
                genome_builds = {}
                existing_transcript = transcript_versions.get(transcript_id)
                if existing_transcript:
                    genome_builds = existing_transcript["genome_builds"]
                    # Latest always used, but check existing - if codons are different old versions are wrong so remove
                    for field in ["start_codon", "stop_codon"]:
                        old = existing_transcript.get(field)
                        new = build_transcript.get(field)
                        if old != new:  # Old relied on different codons so is obsolete
                            for build_coordinates in genome_builds.values():
                                url = build_coordinates["url"]
                                urls_different_coding[url].append(transcript_id)
                            genome_builds = {}

                genome_builds[genome_build] = build_transcript["genome_builds"][genome_build]
                # Use latest (with merged genome builds)
                build_transcript["genome_builds"] = genome_builds
                transcript_versions[transcript_id] = build_transcript

            f.seek(0)  # Reset for next ijson call
            for gene_id, gene_data in ijson.kvitems(f, "genes"):
                genes[gene_id] = gene_data

    method = "Combine multiple genome builds"
    write_cdot_json(args.output, method, list(genome_build_filename.values()),
                    genes, transcript_versions, list(genome_build_filename.keys()))

    if urls_different_coding:
        print("Some transcripts were removed as they had different coding coordinates from latest")
        for url, transcript_ids in urls_different_coding.items():
            print(f"{url}: {','.join(transcript_ids)}")
477497
def release_notes(args):
    """Print markdown-style release notes from a cdot JSON.gz file's "metadata" section.

    Raises ValueError if the file has no metadata (written by schema version >= 0.2.31).
    With --show-urls, also prints per-URL transcript counts in descending order.
    """
    with gzip.open(args.json_filename) as f:
        # next() with a default: without it, a missing "metadata" key raises a bare
        # StopIteration and the ValueError below is never reached.
        metadata = next(ijson.items(f, "metadata"), None)
        if metadata is None:
            raise ValueError("No metadata in JSON (requires schema version >=0.2.31)")
        print(f"### {os.path.basename(args.json_filename)}")
        print(f"Method: {metadata['method']}")
        print("Input files:")
        for input_file in metadata["input_files"]:
            print(f"- {input_file}")

        if args.show_urls:
            print("Urls:")
            # Put in descending order
            url_counts = Counter(metadata["url_counts"])
            for url, count in url_counts.most_common():
                print(f"- {url}: {count}")
478516
479517def main ():
480518 parser = _setup_arg_parser ()
@@ -493,6 +531,7 @@ def main():
493531 "merge_historical" : merge_historical ,
494532 "combine_builds" : combine_builds ,
495533 "uta_to_json" : uta_to_json ,
534+ "release_notes" : release_notes ,
496535 }
497536 subcommands [args .subcommand ](args )
498537
0 commit comments