11import argparse
22import os
3+ from collections import Counter
34from ebi_eva_common_pyutils .logger import logging_config as log_cfg
5+
6+ from convert_gvf_to_vcf .conversionstatistics import FileStatistics
47from convert_gvf_to_vcf .lookup import Lookup
58from convert_gvf_to_vcf .utils import read_in_gvf_header , read_in_gvf_data
69from convert_gvf_to_vcf .vcfline import VcfLineBuilder
710
11+
812logger = log_cfg .get_logger (__name__ )
913
1014# setting up paths to useful directories
@@ -236,7 +240,7 @@ def get_pragma_tokens(pragma_value, first_delimiter, second_delimiter):
236240 return pragma_tokens
237241
238242
239- def write_header (vcf_output , pragmas_for_vcf , header_lines_per_type , is_missing_format_value , samples ):
243+ def write_header (vcf_output , pragmas_for_vcf , header_lines_per_type , header_fields , samples ):
240244 logger .info (f"Total number of samples in this VCF: { len (samples )} " )
241245 vcf_header_file = vcf_output + '_header'
242246 with open (vcf_header_file , "w" ) as vcf_header_output :
@@ -250,7 +254,6 @@ def write_header(vcf_output, pragmas_for_vcf, header_lines_per_type, is_missing_
250254
251255 # Part 2 of VCF file: Write the VCF header line.
252256 # Write the header.
253- header_fields = generate_vcf_header_line (is_missing_format = is_missing_format_value , samples = samples )
254257 vcf_header_output .write (f"{ header_fields } \n " )
255258 return vcf_header_file
256259
@@ -259,14 +262,13 @@ def convert(gvf_input, vcf_output, assembly):
259262 logger .info ("Running the GVF to VCF converter" )
260263 logger .info (f"The provided input file is: { gvf_input } " )
261264 logger .info (f"The provided output file is: { vcf_output } " )
265+
262266 if assembly :
263267 logger .info (f"The provided assembly file is: { assembly } " )
264268 assembly_file = os .path .abspath (assembly )
265269 assert os .path .isfile (assembly_file ), "Assembly file does not exist"
270+ assert os .path .isfile (gvf_input ), "GVF file does not exist"
266271
267- # Read input file and separate out its components
268- logger .info (f"Reading in the following GVF header from { gvf_input } " )
269- gvf_pragmas , gvf_pragma_comments = read_in_gvf_header (gvf_input )
270272 # Creating lookup object to store important dictionaries and log what has been stored.
271273 reference_lookup = Lookup (assembly_file )
272274 logger .info ("Creating the reference lookup object." )
@@ -275,51 +277,80 @@ def convert(gvf_input, vcf_output, assembly):
275277 logger .info (f"Storing the assembly file: { assembly_file } " )
276278 logger .info ("Storing the IUPAC ambiguity dictionary." )
277279
280+ # Read input file and separate out its components
281+ logger .info (f"Reading in the following GVF header from { gvf_input } " )
282+ gvf_pragmas , gvf_pragma_comments = read_in_gvf_header (gvf_input )
283+
278284 # Preparation work:
279285 # Store the VCF metainformation and ensure preservation of important GVF data.
280286 # This information will be useful when creating the VCF header.
281- # TODO: refactor function generate_vcf_metainfo
282287 (
283288 pragmas_for_vcf ,
284289 samples
285290 ) = convert_gvf_pragmas_for_vcf_header (gvf_pragmas , gvf_pragma_comments , reference_lookup )
286-
287- # TODO: place the all_header_lines_per_type_dict into the reference_lookup.
291+ report = FileStatistics (gvf_file_path = gvf_input , gvf_pragmas = gvf_pragmas , samples = samples )
288292
289293 # Create data structure to store all possible outcomes for header lines (for fields ALT, INFO, FILTER, FORMAT)
290294 all_header_lines_per_type_dict = {
291295 htype : generate_vcf_header_structured_lines (htype , reference_lookup .mapping_attribute_dict ) for htype in
292296 ["ALT" , "INFO" , "FILTER" , "FORMAT" ]
293297 }
298+
294299 vcf_builder = VcfLineBuilder (all_header_lines_per_type_dict , reference_lookup , samples )
295300 is_missing_format_value = True
301+ # VCF dataline generation
296302 # Convert each feature line in the GVF file to a VCF object (stores all the data for a line in the VCF file).
297303 # NOTE: Main Logic lives here.
298304 vcf_data_file = vcf_output + '_data_lines'
299305 with open (vcf_data_file , "w" ) as open_data_lines :
300306 logger .info ("Generating the VCF datalines" )
301307 previous_vcf_line = None
302- for gvf_lines_obj in read_in_gvf_data (gvf_input ):
303- current_vcf_line = vcf_builder .build_vcf_line (gvf_lines_obj )
308+ for gvf_entry in read_in_gvf_data (gvf_input ):
309+ # record GVF counts
310+ report .gvf_feature_line_count += 1
311+ report .gvf_chromosome_count [gvf_entry .seqid ] += 1
312+ report .gvf_sv_count .update ([gvf_entry .feature_type ])
313+ # create the VCF line object
314+ current_vcf_line = vcf_builder .build_vcf_line (gvf_entry )
304315 # is_missing_format_value will only be true if all the format field are missing.
305316 is_missing_format_value = is_missing_format_value and current_vcf_line .format_keys == ['.' ]
306317 # Each GVF feature has been converted to a VCF object so begin comparing and merging the VCF objects.
307318 if previous_vcf_line :
308319 if current_vcf_line == previous_vcf_line :
309320 current_vcf_line .merge (previous_vcf_line , list_of_sample_names = samples )
321+ report .vcf_number_of_merges += 1
310322 else :
311- open_data_lines .write (str (previous_vcf_line ) + "\n " )
323+ # TODO: address this in next bug fix
324+ assert current_vcf_line > previous_vcf_line , f"File not sorted.\n current_vcf_line.pos { current_vcf_line .pos } is smaller than previous_vcf_line.pos { previous_vcf_line .pos } . See the following line:\n { str (gvf_entry )} "
325+ record_vcf_entry (open_data_lines , previous_vcf_line , report )
312326 previous_vcf_line = current_vcf_line
327+ # Process the final previous_vcf_line
313328 if previous_vcf_line :
314- open_data_lines . write ( str ( previous_vcf_line ) + " \n " )
329+ record_vcf_entry ( open_data_lines , previous_vcf_line , report )
315330 else :
316331 logger .warning ("No feature lines were found for this GVF file." )
317332
333+ # VCF header generation
318334 header_lines_per_type = vcf_builder .build_vcf_header ()
319- vcf_header_file = write_header ( vcf_output , pragmas_for_vcf , header_lines_per_type ,
320- is_missing_format_value , samples )
335+ header_fields = generate_vcf_header_line ( is_missing_format = is_missing_format_value , samples = samples )
336+ vcf_header_file = write_header ( vcf_output , pragmas_for_vcf , header_lines_per_type , header_fields , samples )
321337
322338 logger .info (f"Combining the header and data lines to the following VCF output: { vcf_output } " )
339+ construct_vcf_output (vcf_header_file , vcf_data_file , vcf_output )
340+
341+ logger .info ("Remove the temporary files" )
342+ cleanup_temp_files ([vcf_header_file , vcf_data_file ])
343+
344+ logger .info ("Printing the summary of conversion report." )
345+ vcf_output_directory = os .path .dirname (vcf_output )
346+ stats_summary_file = os .path .join (vcf_output_directory , "summary_stats.txt" )
347+ report .print_report (stats_summary_file )
348+
349+ logger .info ("GVF to VCF conversion complete" )
350+
351+
352+ #helper functions for convert
353+ def construct_vcf_output (vcf_header_file , vcf_data_file , vcf_output ):
323354 with open (vcf_output , "w" ) as vcf_output :
324355 with open (vcf_header_file , "r" ) as vcf_header_fh :
325356 for line in vcf_header_fh :
@@ -328,12 +359,23 @@ def convert(gvf_input, vcf_output, assembly):
328359 for line in vcf_data_fh :
329360 vcf_output .write (line )
330361 vcf_output .close ()
331- logger .info ("Remove the temporary files" )
332- if os .path .exists (vcf_header_file ):
333- os .remove (vcf_header_file )
334- if os .path .exists (vcf_data_file ):
335- os .remove (vcf_data_file )
336- logger .info ("GVF to VCF conversion complete" )
362+
def record_vcf_entry(open_data_lines, previous_vcf_line, report):
    """Write one finished VCF data line and add it to the conversion statistics.

    Args:
        open_data_lines: writable text handle; receives ``str(previous_vcf_line)``
            followed by a newline.
        previous_vcf_line: the VCF line object to emit. This function reads its
            ``chrom``, ``alt``, ``info_dict``, ``order_sample_names`` and
            ``vcf_values_for_format`` attributes.
        report: statistics holder whose ``vcf_*`` tallies are updated in place.
            NOTE(review): the tallies are presumably ``collections.Counter``
            objects (they are used with ``[...] += 1`` and ``.update``) -
            confirm against FileStatistics.

    Returns:
        None.
    """
    vcf_line = previous_vcf_line
    info = vcf_line.info_dict

    # Emit the line first, then account for it.
    open_data_lines.write(f"{vcf_line!s}\n")

    # One-per-line tallies.
    report.vcf_data_line_count += 1
    report.vcf_chromosome_count[vcf_line.chrom] += 1
    report.vcf_alt_alleles_count[vcf_line.alt] += 1

    # Every INFO key present on this line is counted once.
    report.vcf_info_counter.update(info.keys())

    # The SOID values are wrapped in a one-element list so that ``update``
    # counts the value as a whole; a missing key is tallied under ``None``.
    region_soid = [info.get("VARREGSOID")]
    report.vcf_variant_region_SOID.update(region_soid)
    call_soid = [info.get("VARCALLSOID")]
    report.vcf_variant_call_SOID.update(call_soid)

    # Per-sample and per-FORMAT tallies are fed straight from the line object.
    report.vcf_sample_number_count.update(vcf_line.order_sample_names)
    report.vcf_format_counter.update(vcf_line.vcf_values_for_format)
374+
def cleanup_temp_files(list_of_temp_files):
    """Delete each temporary file, ignoring paths that are already gone.

    Args:
        list_of_temp_files: iterable of file paths to remove.

    Returns:
        None.

    Raises:
        OSError: any removal failure other than the file not existing
            (for example a permission error, or the path being a directory).
    """
    for temp_file in list_of_temp_files:
        # Attempt the removal directly instead of checking os.path.exists()
        # first: the file could disappear between the check and the removal,
        # which would turn a harmless cleanup into a FileNotFoundError.
        # NOTE: unlike the exists() check, this also removes a dangling symlink.
        try:
            os.remove(temp_file)
        except FileNotFoundError:
            continue
337379
338380def main ():
339381 # Parse command line arguments
0 commit comments