11import argparse
22import os
33from convert_gvf_to_vcf .utils import read_pragma_mapper , read_in_gvf_file
4- from convert_gvf_to_vcf .vcfline import VcfLine
4+ from convert_gvf_to_vcf .vcfline import VcfLineBuilder
55from convert_gvf_to_vcf .logger import set_up_logging , logger
66from convert_gvf_to_vcf .lookup import Lookup
77# setting up paths to useful directories
@@ -48,14 +48,10 @@ def generate_vcf_header_unstructured_line(vcf_unstructured_key,
4848 return custom_unstructured_string
4949
5050def generate_vcf_header_metainfo (gvf_pragmas ,
51- gvf_non_essential ,
52- list_of_vcf_objects ,
53- standard_lines_dictionary ):
51+ gvf_non_essential ):
5452 """ Generates a list of metainformation lines for the VCF header
5553 :param gvf_pragmas: list of gvf pragmas to convert
5654 :param gvf_non_essential: list of non-essential gvf pragmas to convert
57- :param list_of_vcf_objects: list of vcf objects
58- :param standard_lines_dictionary: dictionary of standard lines
5955 :return: unique_pragmas_to_add, sample_names: a list of pragmas (removed duplicates), list of sample names
6056 """
6157 pragmas_to_add = []
@@ -76,8 +72,9 @@ def generate_vcf_header_metainfo(gvf_pragmas,
7672 for pragma in gvf_pragmas :
7773 vcf_header_key , pragma_name , pragma_value = get_pragma_name_and_value (pragma , " " , list_of_pragma , pragma_to_vcf_map )
7874 pragmas_to_add .append (generate_vcf_header_unstructured_line (vcf_header_key , pragma_value ))
79- for vcf_obj in list_of_vcf_objects :
80- pragmas_to_add .append (generate_vcf_header_unstructured_line ("source" , vcf_obj .source ))
75+ # FIXME: Why are we adding headers from the VCF lines?
76+ # for vcf_obj in list_of_vcf_objects:
77+ # pragmas_to_add.append(generate_vcf_header_unstructured_line("source", vcf_obj.source))
8178 ####
8279 ####
8380 # Go through non-essential pragmas
@@ -112,10 +109,11 @@ def generate_vcf_header_metainfo(gvf_pragmas,
112109 uniq_sample_name .append (sample )
113110 ###
114111 unique_pragmas_to_add = list (dict .fromkeys (pragma for pragma in pragmas_to_add if pragma not in unique_pragmas_to_add ))
115- unique_alt_lines_to_add = list (dict .fromkeys (alt_line for alt_line in standard_lines_dictionary ["ALT" ] if alt_line not in unique_alt_lines_to_add ))
116- unique_info_lines_to_add = list (dict .fromkeys (info_line for info_line in standard_lines_dictionary ["INFO" ] if info_line not in unique_info_lines_to_add ))
117- unique_filter_lines_to_add = list (dict .fromkeys (filter_line for filter_line in standard_lines_dictionary ["FILTER" ] if filter_line not in unique_filter_lines_to_add ))
118- unique_format_lines_to_add = list (dict .fromkeys (format_line for format_line in standard_lines_dictionary ["FORMAT" ] if format_line not in unique_format_lines_to_add ))
112+ # TODO: The addition of headers from the VCF lines should be done in the VCF builder
113+ # unique_alt_lines_to_add = list(dict.fromkeys(alt_line for alt_line in standard_lines_dictionary["ALT"] if alt_line not in unique_alt_lines_to_add))
114+ # unique_info_lines_to_add = list(dict.fromkeys(info_line for info_line in standard_lines_dictionary["INFO"] if info_line not in unique_info_lines_to_add))
115+ # unique_filter_lines_to_add = list(dict.fromkeys(filter_line for filter_line in standard_lines_dictionary["FILTER"] if filter_line not in unique_filter_lines_to_add))
116+ # unique_format_lines_to_add = list(dict.fromkeys(format_line for format_line in standard_lines_dictionary["FORMAT"] if format_line not in unique_format_lines_to_add))
119117
120118 return unique_pragmas_to_add , uniq_sample_name , unique_alt_lines_to_add , unique_info_lines_to_add , unique_filter_lines_to_add , unique_format_lines_to_add
121119
@@ -182,13 +180,12 @@ def get_pragma_tokens(pragma_value, first_delimiter, second_delimiter):
182180 return pragma_tokens
183181
184182# This is the main conversion logic
185- def convert_gvf_features_to_vcf_objects (gvf_lines_obj_list , reference_lookup ):
183+ def convert_gvf_features_to_vcf_objects (gvf_lines_obj_list , reference_lookup , ordered_list_of_samples ):
186184 """ Creates VCF objects from GVF feature lines and stores the VCF objects.
187185 :param gvf_lines_obj_list: list of GVF feature line objects
188186 :param reference_lookup: an object that stores important dictionaries to be used for reference lookups.
189- :return: standard_header_lines, vcf_data_lines, list_of_vcf_objects: header lines for this VCF, datalines for this VCF and a list of VCF objects
187+ :return: standard_header_lines, list_of_vcf_objects: header lines for this VCF and a list of VCF objects
190188 """
191- vcf_data_lines = {} # DICTIONARY OF LISTS, {Chromosome_Pos: [VCF line object]}
192189 list_of_vcf_objects = []
193190 # Create data structure to store the header lines for this VCF file (standard meta-information lines)
194191 standard_header_lines = {
@@ -203,27 +200,14 @@ def convert_gvf_features_to_vcf_objects(gvf_lines_obj_list, reference_lookup):
203200 all_header_lines_per_type_dict = {
204201 htype : generate_vcf_header_structured_lines (htype , reference_lookup .mapping_attribute_dict ) for htype in ["ALT" , "INFO" , "FILTER" , "FORMAT" ]
205202 }
206-
203+ vcf_builder = VcfLineBuilder ( standard_header_lines , all_header_lines_per_type_dict , reference_lookup , ordered_list_of_samples )
207204 # Create a vcf object for every feature line in the GVF (1:1)
208205 for gvf_featureline in gvf_lines_obj_list :
209- #NOTE: this is the main Logic of the code
210- vcf_object = VcfLine (gvf_featureline ,
211- standard_header_lines ,
212- all_header_lines_per_type_dict ,
213- reference_lookup )
206+ vcf_object = vcf_builder .build_vcf_line (gvf_featureline )
214207 # Store VCF object in the list
215208 list_of_vcf_objects .append (vcf_object )
216-
217- # vcf_object.key is formatted as follows: Chromosome_Pos
218- if vcf_object .key in vcf_data_lines :
219- # Add VCF object to the dictionary of lists
220- vcf_data_lines [vcf_object .key ].append (vcf_object )
221- else :
222- # Get it into a format where the VCF object can be added to the dictionary of lists
223- vcf_data_line_objects_list = [vcf_object ]
224- vcf_data_lines [vcf_object .key ] = vcf_data_line_objects_list
225- # Returns the header of the VCF file, the datalines of the VCF file, and the object.
226- return standard_header_lines , vcf_data_lines , list_of_vcf_objects
209+ # Returns the header lines of the VCF file and the list of VCF objects.
210+ return standard_header_lines , list_of_vcf_objects
227211
228212# The functions below relate to the VCF objects
229213def compare_vcf_objects (list_of_vcf_objects ):
@@ -318,32 +302,27 @@ def main():
318302 logger .info (f"Storing the assembly file: { assembly_file } " )
319303 logger .info ("Storing the IUPAC ambiguity dictionary." )
320304
305+ # Preparation work:
306+ # Store the VCF metainformation and ensure preservation of important GVF data.
307+ # This information will be useful when creating the VCF header.
308+ # TODO: refactor function generate_vcf_header_metainfo
309+ (
310+ unique_pragmas_to_add ,
311+ samples ,
312+ unique_alt_lines_to_add ,
313+ unique_info_lines_to_add ,
314+ unique_filter_lines_to_add ,
315+ unique_format_lines_to_add
316+ ) = generate_vcf_header_metainfo (gvf_pragmas , gvf_non_essential )
317+
321318 # Convert each feature line in the GVF file to a VCF object (stores all the data for a line in the VCF file).
322319 # NOTE: Main Logic lives here.
323- (
324- header_lines ,
325- vcf_data_lines , #TODO: check if this can be removed
326- list_of_vcf_objects
327- ) = convert_gvf_features_to_vcf_objects (gvf_lines_obj_list , reference_lookup )
320+ (header_lines ,list_of_vcf_objects ) = convert_gvf_features_to_vcf_objects (gvf_lines_obj_list , reference_lookup , ordered_list_of_samples = samples )
328321
329322 logger .info (f"Writing to the following VCF output: { args .vcf_output } " )
330323 logger .info ("Generating the VCF header and the meta-information lines" )
331324 with open (args .vcf_output , "w" ) as vcf_output :
332- # Preparation work:
333- # Store the VCF metainformation and ensure preservation of important GVF data.
334- # This information will be useful when creating the VCF header.
335- # TODO: refactor function generate_vcf_metainfo
336- (
337- unique_pragmas_to_add ,
338- samples ,
339- unique_alt_lines_to_add ,
340- unique_info_lines_to_add ,
341- unique_filter_lines_to_add ,
342- unique_format_lines_to_add
343- ) = generate_vcf_header_metainfo (gvf_pragmas ,
344- gvf_non_essential ,
345- list_of_vcf_objects ,
346- header_lines )
325+
347326 logger .info (f"Total number of samples in this VCF: { len (samples )} " )
348327
349328 # Part 1 of VCF file: Write the VCF header. This will include preserved data from the GVF file.
0 commit comments