Skip to content

Commit 550736e

Browse files
authored
Split VCF line in builder and storage object (#18)
* Split VCF line into a line builder and an object to hold the line * Update tests
1 parent d9c32a6 commit 550736e

File tree

4 files changed

+341
-328
lines changed

4 files changed

+341
-328
lines changed

convert_gvf_to_vcf/convertGVFtoVCF.py

Lines changed: 31 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import argparse
22
import os
33
from convert_gvf_to_vcf.utils import read_pragma_mapper, read_in_gvf_file
4-
from convert_gvf_to_vcf.vcfline import VcfLine
4+
from convert_gvf_to_vcf.vcfline import VcfLineBuilder
55
from convert_gvf_to_vcf.logger import set_up_logging, logger
66
from convert_gvf_to_vcf.lookup import Lookup
77
# setting up paths to useful directories
@@ -48,14 +48,10 @@ def generate_vcf_header_unstructured_line(vcf_unstructured_key,
4848
return custom_unstructured_string
4949

5050
def generate_vcf_header_metainfo(gvf_pragmas,
51-
gvf_non_essential,
52-
list_of_vcf_objects,
53-
standard_lines_dictionary):
51+
gvf_non_essential):
5452
""" Generates a list of metainformation lines for the VCF header
5553
:param gvf_pragmas: list of gvf pragmas to convert
5654
:param gvf_non_essential: list of non-essential gvf pragmas to convert
57-
:param list_of_vcf_objects: list of vcf objects
58-
:param standard_lines_dictionary: dictionary of standard lines
5955
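:return: unique_pragmas_to_add, uniq_sample_name, unique_alt_lines_to_add, unique_info_lines_to_add, unique_filter_lines_to_add, unique_format_lines_to_add: a list of pragmas (removed duplicates), a list of sample names, followed by the ALT, INFO, FILTER and FORMAT line lists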
6056
"""
6157
pragmas_to_add = []
@@ -76,8 +72,9 @@ def generate_vcf_header_metainfo(gvf_pragmas,
7672
for pragma in gvf_pragmas:
7773
vcf_header_key, pragma_name, pragma_value = get_pragma_name_and_value(pragma, " ", list_of_pragma, pragma_to_vcf_map)
7874
pragmas_to_add.append(generate_vcf_header_unstructured_line(vcf_header_key, pragma_value))
79-
for vcf_obj in list_of_vcf_objects:
80-
pragmas_to_add.append(generate_vcf_header_unstructured_line("source", vcf_obj.source))
75+
# FIXME: Why are we adding header lines from the VCF line objects?
76+
# for vcf_obj in list_of_vcf_objects:
77+
# pragmas_to_add.append(generate_vcf_header_unstructured_line("source", vcf_obj.source))
8178
####
8279
####
8380
# Go through non-essential pragmas
@@ -112,10 +109,11 @@ def generate_vcf_header_metainfo(gvf_pragmas,
112109
uniq_sample_name.append(sample)
113110
###
114111
unique_pragmas_to_add = list(dict.fromkeys(pragma for pragma in pragmas_to_add if pragma not in unique_pragmas_to_add))
115-
unique_alt_lines_to_add = list(dict.fromkeys(alt_line for alt_line in standard_lines_dictionary["ALT"] if alt_line not in unique_alt_lines_to_add))
116-
unique_info_lines_to_add = list(dict.fromkeys(info_line for info_line in standard_lines_dictionary["INFO"] if info_line not in unique_info_lines_to_add))
117-
unique_filter_lines_to_add = list(dict.fromkeys(filter_line for filter_line in standard_lines_dictionary["FILTER"] if filter_line not in unique_filter_lines_to_add))
118-
unique_format_lines_to_add = list(dict.fromkeys(format_line for format_line in standard_lines_dictionary["FORMAT"] if format_line not in unique_format_lines_to_add))
112+
# TODO: The addition of headers from the VCF lines should be done in the VCF builder
113+
# unique_alt_lines_to_add = list(dict.fromkeys(alt_line for alt_line in standard_lines_dictionary["ALT"] if alt_line not in unique_alt_lines_to_add))
114+
# unique_info_lines_to_add = list(dict.fromkeys(info_line for info_line in standard_lines_dictionary["INFO"] if info_line not in unique_info_lines_to_add))
115+
# unique_filter_lines_to_add = list(dict.fromkeys(filter_line for filter_line in standard_lines_dictionary["FILTER"] if filter_line not in unique_filter_lines_to_add))
116+
# unique_format_lines_to_add = list(dict.fromkeys(format_line for format_line in standard_lines_dictionary["FORMAT"] if format_line not in unique_format_lines_to_add))
119117

120118
return unique_pragmas_to_add, uniq_sample_name, unique_alt_lines_to_add, unique_info_lines_to_add, unique_filter_lines_to_add, unique_format_lines_to_add
121119

@@ -182,13 +180,12 @@ def get_pragma_tokens(pragma_value, first_delimiter, second_delimiter):
182180
return pragma_tokens
183181

184182
# This is the main conversion logic
185-
def convert_gvf_features_to_vcf_objects(gvf_lines_obj_list, reference_lookup):
183+
def convert_gvf_features_to_vcf_objects(gvf_lines_obj_list, reference_lookup, ordered_list_of_samples):
186184
""" Creates VCF objects from GVF feature lines and stores the VCF objects.
187185
:param gvf_lines_obj_list: list of GVF feature line objects
188186
:param reference_lookup: an object that stores important dictionaries to be used for reference lookups.
189-
:return: standard_header_lines, vcf_data_lines, list_of_vcf_objects: header lines for this VCF, datalines for this VCF and a list of VCF objects
187+
:return: standard_header_lines, list_of_vcf_objects: header lines for this VCF and a list of VCF objects
190188
"""
191-
vcf_data_lines = {} # DICTIONARY OF LISTS, {Chromosome_Pos: [VCF line object]}
192189
list_of_vcf_objects = []
193190
# Create data structure to store the header lines for this VCF file (standard meta-information lines)
194191
standard_header_lines ={
@@ -203,27 +200,14 @@ def convert_gvf_features_to_vcf_objects(gvf_lines_obj_list, reference_lookup):
203200
all_header_lines_per_type_dict = {
204201
htype: generate_vcf_header_structured_lines(htype, reference_lookup.mapping_attribute_dict) for htype in ["ALT", "INFO", "FILTER", "FORMAT"]
205202
}
206-
203+
vcf_builder = VcfLineBuilder(standard_header_lines, all_header_lines_per_type_dict, reference_lookup, ordered_list_of_samples)
207204
# Create a vcf object for every feature line in the GVF (1:1)
208205
for gvf_featureline in gvf_lines_obj_list:
209-
#NOTE: this is the main Logic of the code
210-
vcf_object = VcfLine(gvf_featureline,
211-
standard_header_lines,
212-
all_header_lines_per_type_dict,
213-
reference_lookup)
206+
vcf_object = vcf_builder.build_vcf_line(gvf_featureline)
214207
# Store VCF object in the list
215208
list_of_vcf_objects.append(vcf_object)
216-
217-
# vcf_object.key is formatted as follows: Chromosome_Pos
218-
if vcf_object.key in vcf_data_lines:
219-
# Add VCF object to the dictionary of lists
220-
vcf_data_lines[vcf_object.key].append(vcf_object)
221-
else:
222-
# Get it into a format where the VCF object can be added to the dictionary of lists
223-
vcf_data_line_objects_list = [vcf_object]
224-
vcf_data_lines[vcf_object.key] = vcf_data_line_objects_list
225-
# Returns the header of the VCF file, the datalines of the VCF file, and the object.
226-
return standard_header_lines, vcf_data_lines, list_of_vcf_objects
209+
# Returns the header lines of the VCF file and the list of VCF objects.
210+
return standard_header_lines, list_of_vcf_objects
227211

228212
# The functions below relate to the VCF objects
229213
def compare_vcf_objects(list_of_vcf_objects):
@@ -318,32 +302,27 @@ def main():
318302
logger.info(f"Storing the assembly file: {assembly_file}")
319303
logger.info("Storing the IUPAC ambiguity dictionary.")
320304

305+
# Preparation work:
306+
# Store the VCF metainformation and ensure preservation of important GVF data.
307+
# This information will be useful when creating the VCF header.
308+
# TODO: refactor function generate_vcf_header_metainfo
309+
(
310+
unique_pragmas_to_add,
311+
samples,
312+
unique_alt_lines_to_add,
313+
unique_info_lines_to_add,
314+
unique_filter_lines_to_add,
315+
unique_format_lines_to_add
316+
) = generate_vcf_header_metainfo(gvf_pragmas, gvf_non_essential)
317+
321318
# Convert each feature line in the GVF file to a VCF object (stores all the data for a line in the VCF file).
322319
# NOTE: Main Logic lives here.
323-
(
324-
header_lines,
325-
vcf_data_lines, #TODO: check if this can be removed
326-
list_of_vcf_objects
327-
) = convert_gvf_features_to_vcf_objects(gvf_lines_obj_list, reference_lookup)
320+
(header_lines,list_of_vcf_objects) = convert_gvf_features_to_vcf_objects(gvf_lines_obj_list, reference_lookup, ordered_list_of_samples=samples)
328321

329322
logger.info(f"Writing to the following VCF output: {args.vcf_output}")
330323
logger.info("Generating the VCF header and the meta-information lines")
331324
with open(args.vcf_output, "w") as vcf_output:
332-
# Preparation work:
333-
# Store the VCF metainformation and ensure preservation of important GVF data.
334-
# This information will be useful when creating the VCF header.
335-
# TODO: refactor function generate_vcf_metainfo
336-
(
337-
unique_pragmas_to_add,
338-
samples,
339-
unique_alt_lines_to_add,
340-
unique_info_lines_to_add,
341-
unique_filter_lines_to_add,
342-
unique_format_lines_to_add
343-
) = generate_vcf_header_metainfo(gvf_pragmas,
344-
gvf_non_essential,
345-
list_of_vcf_objects,
346-
header_lines)
325+
347326
logger.info(f"Total number of samples in this VCF: {len(samples)}")
348327

349328
# Part 1 of VCF file: Write the VCF header. This will include preserved data from the GVF file.

0 commit comments

Comments
 (0)