@@ -286,6 +286,11 @@ def format_vcf_datalines(list_of_vcf_objects, list_of_sample_names):
286286 return formatted_vcf_datalines
287287
288288def get_bigger_dictionary (dict1 , dict2 ):
289+ """Determines the biggest of two dictionaries
290+ :param: dictionary1
291+ :param: dictionary2
292+ :return: smallest, largest
293+ """
289294 if len (dict1 ) > len (dict2 ):
290295 biggest_dict = dict1
291296 smallest_dict = dict2
@@ -298,13 +303,20 @@ def get_bigger_dictionary(dict1, dict2):
298303 return smallest_dict , biggest_dict
299304
def merge_and_add(previous_element, current_element, delimiter):
    """Merge two field values, collapsing them when they are identical.

    :param previous_element: value taken from the previous line
    :param current_element: value taken from the current line
    :param delimiter: string placed between the two values when they differ
    :return: ``current_element`` if both values are equal, otherwise
        ``previous_element`` and ``current_element`` joined by ``delimiter``
    """
    if previous_element == current_element:
        return current_element
    # str.join only accepts strings, so differing values must both be str.
    return delimiter.join((previous_element, current_element))
306317
307318def compare_and_merge_lines (list_of_formatted_vcf_datalines , headerline ):
319+ merged_lines = []
308320 for previous , current in zip (list_of_formatted_vcf_datalines , list_of_formatted_vcf_datalines [1 :]):
309321 # print(f"previous line:\n{previous}\ncurrent line:\n{current}\n")
310322 previous_tokens = previous .split ("\t " )
@@ -319,7 +331,7 @@ def compare_and_merge_lines(list_of_formatted_vcf_datalines, headerline):
319331 and previous_data ["POS" ] == current_data ["POS" ]
320332 and previous_data ["REF" ] == current_data ["REF" ]
321333 ):
322- # print("True - merge")
334+ print ("True - merge" )
323335 merged_data ["#CHROM" ] = current_data ["#CHROM" ]
324336 merged_data ["POS" ] = current_data ["POS" ]
325337 merged_data ["ID" ] = merge_and_add (previous_data ["ID" ], current_data ["ID" ], ";" )
@@ -401,9 +413,26 @@ def compare_and_merge_lines(list_of_formatted_vcf_datalines, headerline):
401413 sample_format_string = ':' .join (flat_values )
402414
403415 merged_data [sample_name ] = sample_format_string
404- # else:
405- # print("False - keep previous")
406- # print("---")
416+ merged_lines .append (merged_data )
417+ print ("---" )
418+ else :
419+ print ("False - keep previous" )
420+ merged_data ["#CHROM" ] = previous_data ["#CHROM" ]
421+ merged_data ["POS" ] = previous_data ["POS" ]
422+ merged_data ["ID" ] = previous_data ["ID" ]
423+ merged_data ["REF" ] = previous_data ["REF" ]
424+ merged_data ["ALT" ] = previous_data ["ALT" ]
425+ merged_data ["QUAL" ] = previous_data ["QUAL" ]
426+ merged_data ["FILTER" ] = previous_data ["FILTER" ]
427+ merged_data ["INFO" ] = previous_data ["INFO" ]
428+ merged_data ["FORMAT" ] = previous_data ["FORMAT" ]
429+ sample_names = header_fields [9 :]
430+ for sample in sample_names :
431+ merged_data [sample ] = previous_data [sample ]
432+
433+ merged_lines .append (merged_data )
434+ print ("---" )
435+ return merged_lines
407436
408437
409438
@@ -478,9 +507,9 @@ def main():
478507 vcf_output .write (f"{ header_fields } \n " )
479508 logger .info ("Generating the VCF datalines" )
480509 formatted_vcf_datalines = format_vcf_datalines (list_of_vcf_objects , samples )
481- compare_and_merge_lines (formatted_vcf_datalines , header_fields )
482- for line in formatted_vcf_datalines :
483- vcf_output .write (f" { line } \n " )
510+ merged_lines = compare_and_merge_lines (formatted_vcf_datalines , header_fields )
511+ for line in merged_lines :
512+ vcf_output .write (" \t " . join ( str ( val ) for val in line . values ()) + " \n " )
484513 vcf_output .close ()
485514 logger .info ("GVF to VCF conversion complete" )
486515
0 commit comments