Skip to content

Commit 129f129

Browse files
committed
edited format to compare_and_merge_lines
1 parent 9df757b commit 129f129

File tree

2 files changed

+54
-12
lines changed

2 files changed

+54
-12
lines changed

convert_gvf_to_vcf/convertGVFtoVCF.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,11 @@ def format_vcf_datalines(list_of_vcf_objects, list_of_sample_names):
286286
return formatted_vcf_datalines
287287

288288
def get_bigger_dictionary(dict1, dict2):
289+
"""Determines the bigger of two dictionaries
290+
:param: dictionary1
291+
:param: dictionary2
292+
:return: smallest, largest
293+
"""
289294
if len(dict1) > len(dict2):
290295
biggest_dict = dict1
291296
smallest_dict = dict2
@@ -298,13 +303,20 @@ def get_bigger_dictionary(dict1, dict2):
298303
return smallest_dict, biggest_dict
299304

300305
def merge_and_add(previous_element, current_element, delimiter):
306+
""" If same, use current element. If different, merge with delimiter.
307+
:param: previous_element
308+
:param: current_element
309+
:param: delimiter
310+
:return: merged element
311+
"""
301312
if previous_element == current_element:
302313
merged_element = current_element
303314
else:
304315
merged_element = delimiter.join((previous_element, current_element))
305316
return merged_element
306317

307318
def compare_and_merge_lines(list_of_formatted_vcf_datalines, headerline):
319+
merged_lines = []
308320
for previous, current in zip(list_of_formatted_vcf_datalines, list_of_formatted_vcf_datalines[1:]):
309321
# print(f"previous line:\n{previous}\ncurrent line:\n{current}\n")
310322
previous_tokens = previous.split("\t")
@@ -319,7 +331,7 @@ def compare_and_merge_lines(list_of_formatted_vcf_datalines, headerline):
319331
and previous_data["POS"] == current_data["POS"]
320332
and previous_data["REF"] == current_data["REF"]
321333
):
322-
# print("True - merge")
334+
print("True - merge")
323335
merged_data["#CHROM"] = current_data["#CHROM"]
324336
merged_data["POS"] = current_data["POS"]
325337
merged_data["ID"] = merge_and_add(previous_data["ID"], current_data["ID"], ";")
@@ -401,9 +413,26 @@ def compare_and_merge_lines(list_of_formatted_vcf_datalines, headerline):
401413
sample_format_string =':'.join(flat_values)
402414

403415
merged_data[sample_name] = sample_format_string
404-
# else:
405-
# print("False - keep previous")
406-
# print("---")
416+
merged_lines.append(merged_data)
417+
print("---")
418+
else:
419+
print("False - keep previous")
420+
merged_data["#CHROM"] = previous_data["#CHROM"]
421+
merged_data["POS"] = previous_data["POS"]
422+
merged_data["ID"] = previous_data["ID"]
423+
merged_data["REF"] = previous_data["REF"]
424+
merged_data["ALT"] = previous_data["ALT"]
425+
merged_data["QUAL"] = previous_data["QUAL"]
426+
merged_data["FILTER"] = previous_data["FILTER"]
427+
merged_data["INFO"] = previous_data["INFO"]
428+
merged_data["FORMAT"] = previous_data["FORMAT"]
429+
sample_names = header_fields[9:]
430+
for sample in sample_names:
431+
merged_data[sample] = previous_data[sample]
432+
433+
merged_lines.append(merged_data)
434+
print("---")
435+
return merged_lines
407436

408437

409438

@@ -478,9 +507,9 @@ def main():
478507
vcf_output.write(f"{header_fields}\n")
479508
logger.info("Generating the VCF datalines")
480509
formatted_vcf_datalines = format_vcf_datalines(list_of_vcf_objects, samples)
481-
compare_and_merge_lines(formatted_vcf_datalines, header_fields)
482-
for line in formatted_vcf_datalines:
483-
vcf_output.write(f"{line}\n")
510+
merged_lines = compare_and_merge_lines(formatted_vcf_datalines, header_fields)
511+
for line in merged_lines:
512+
vcf_output.write("\t".join(str(val) for val in line.values()) + "\n")
484513
vcf_output.close()
485514
logger.info("GVF to VCF conversion complete")
486515

tests/test_convert_gvf_to_vcf.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,9 @@
44
#from convert_gvf_to_vcf.utils import read_file
55
from convert_gvf_to_vcf.convertGVFtoVCF import generate_custom_unstructured_meta_line, read_in_gvf_file, \
66
gvf_features_to_vcf_objects, format_vcf_datalines, \
7-
generate_vcf_metainfo, generate_vcf_header_structured_lines, \
8-
generate_vcf_header_line, \
9-
format_sample_values, read_yaml, read_pragma_mapper, generate_symbolic_allele_dict
10-
7+
generate_vcf_metainfo, generate_vcf_header_structured_lines, \
8+
generate_vcf_header_line, \
9+
format_sample_values, read_yaml, read_pragma_mapper, generate_symbolic_allele_dict, get_bigger_dictionary, merge_and_add, compare_and_merge_lines
1110
from convert_gvf_to_vcf.vcfline import VcfLine
1211
from convert_gvf_to_vcf.gvffeature import GvfFeatureline
1312

@@ -426,7 +425,21 @@ def test_format_sample_values(self):
426425
assert actual_number_of_tokens == number_of_tokens_should_have, f"must have {number_of_tokens_should_have}"
427426
assert sample_format_values_string == ".:.\t.:.\t.:.\t3:0:1", "String must match expected value"
428427

429-
428+
def test_get_bigger_dictionary(self):
429+
dictionary1 = {"key1": "value1"}
430+
dictionary2 = {"key1": "value1", "key2": "value2"}
431+
small, large = get_bigger_dictionary(dictionary1, dictionary2)
432+
assert len(large) > len(small)
433+
434+
def merge_and_add(self):
435+
previous="1"
436+
current ="2"
437+
delimiter =";"
438+
merged_string = merge_and_add(previous, current, delimiter)
439+
assert len(merged_string) > 1
440+
441+
def compare_and_merge_lines(self):
442+
pass
430443

431444
def test_format_vcf_datalines(self):
432445
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)

0 commit comments

Comments
 (0)