Skip to content

Commit a358048

Browse files
authored
Mergelines (#15)
* dictionary comparison * fixed issue so FORMAT sample values in VCF file are all present * added format to compare_and_merge_lines * edited format to compare_and_merge_lines * tidy * tidy * edited test function name * edited test function name * edited test function * deal with nested dictionaries better * reformatting function format_sample_values * tests passed * Extensive Refactor Major changes to convertGVFtoVCF: - removed compare_and_merge lines and placed functions into VCFlines file Major changes to VCF line: - improved single responsibility principle - added many functions. Function moved from VCFlines to Utils.py (build_iupac_ambiguitycode) New files added: - Lookup.py for referencing - New test files: test_assisting_converter, test_logger, test_utils, test_vcfline (separated out tests to match the file structure) Optimised the function: read_in_gvf_file by removing extraneous for loop * removed redundant code and moved code to setUp * added comments * removed empty lines * focused on removing INFO string. Use info_dict to store values. Uses __str__ to write the vcf line object * check for presence in dictionary before deletion * correct formatting IMPRECISE in INFO * removed dead code * removed dead code * removed empty line * added comments * removed dead code * replaced with f-string * minor edit * ordered the format keys and format the FORMAT keys string in __str__ * minor docstring edit * removed format_values_by_sample_string * removed duplicate merged VCF lines. removed extraneous ID in INFO * adjusted algorithm for merging * edited testing * minor edit * edited test_format_sample_values
1 parent 7af0590 commit a358048

File tree

10 files changed

+892
-666
lines changed

10 files changed

+892
-666
lines changed

convert_gvf_to_vcf/assistingconverter.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
This is an assistant converter to help convert gvf attributes
2+
This contains functions to assist the conversion of GVF attributes
33
"""
44
import os
55
from convert_gvf_to_vcf.logger import logger
@@ -95,5 +95,7 @@ def convert_gvf_attributes_to_vcf_values(column9_of_gvf,
9595
else:
9696
logger.info(f"catching attribute keys for review at a later date {attrib_key} {attrib_value}")
9797
catching_for_review.append(attrib_key)
98-
info_string = ''.join(f'{key}={value};' for key, value in vcf_info_values.items()).rstrip(';')
99-
return gvf_attribute_dictionary, info_string, vcf_format_values
98+
# info_string = ''.join(f'{key}={value};' for key, value in vcf_info_values.items()).rstrip(';')
99+
# print(type(vcf_info_values))
100+
# print(vcf_info_values)
101+
return gvf_attribute_dictionary, vcf_info_values, vcf_format_values

convert_gvf_to_vcf/convertGVFtoVCF.py

Lines changed: 183 additions & 146 deletions
Large diffs are not rendered by default.

convert_gvf_to_vcf/lookup.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import os
2+
3+
from convert_gvf_to_vcf.utils import read_yaml, generate_symbolic_allele_dict, build_iupac_ambiguity_code
4+
5+
# setting up paths to useful directories
6+
convert_gvf_to_vcf_folder = os.path.dirname(__file__)
7+
etc_folder = os.path.join(convert_gvf_to_vcf_folder, 'etc')
8+
9+
10+
class Lookup:
    """
    Holds the look up dictionaries and files consulted while producing a VCF file.

    Attributes set on construction:
      - mapping_attribute_dict: contents of etc/attribute_mapper.yaml as returned by read_yaml
      - symbolic_allele_dictionary: result of generate_symbolic_allele_dict for that mapping
      - assembly_file: the assembly file given by the caller, stored unchanged
      - iupac_ambiguity_dictionary: result of build_iupac_ambiguity_code
    """
    def __init__(self, assembly_file):
        self.assembly_file = assembly_file
        # the attribute mapper lives in the package's etc folder
        attribute_mapper_path = os.path.join(etc_folder, "attribute_mapper.yaml")
        attribute_mapping = read_yaml(attribute_mapper_path)
        self.mapping_attribute_dict = attribute_mapping
        self.symbolic_allele_dictionary = generate_symbolic_allele_dict(attribute_mapping)
        self.iupac_ambiguity_dictionary = build_iupac_ambiguity_code()
        # NOTE: a dictionary of all possible VCF header lines (ALT, INFO, FILTER, FORMAT)
        # is not built here; the draft code for it was left disabled.

convert_gvf_to_vcf/utils.py

Lines changed: 46 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# this file contains readers
1+
"""This contains readers and utilities"""
22
import os
33
import yaml
44

@@ -34,14 +34,39 @@ def read_pragma_mapper(pragma_mapper_file):
3434
pragma_to_vcf_header[pragma] = vcf_header
3535
return pragma_to_vcf_header
3636

37+
def read_in_gvf_file(gvf_input):
    """ Reads in the user provided GVF file.
    :param gvf_input: arguments.gvf_input : The input GVF file
    :return:
        - gvf_pragmas: list of pragma lines (start with ## at the top of GVF file)
        - gvf_non_essential: list of non essential pragma (start with # near the top of GVF file)
        - gvf_lines_obj_list: list of objects where each object represents a GVF feature line
    :raises IndexError: if a feature line has fewer than nine tab-separated columns
    """
    expected_columns = 9  # a GvfFeatureline is built from exactly nine columns
    gvf_pragmas = []  # list of pragma lines starting with: ##
    gvf_non_essential = []  # list of non-essential lines starting with: #
    gvf_lines_obj_list = []  # list of objects when reading in gvf files, one object represents a gvf line

    with open(gvf_input) as gvf_file:
        for line_number, raw_line in enumerate(gvf_file, start=1):
            line = raw_line.rstrip()
            if not line:
                # blank (or whitespace-only) lines carry no feature; skipping them
                # avoids an IndexError when splitting an empty string into columns
                continue
            if line.startswith("##"):
                gvf_pragmas.append(line)
            elif line.startswith("#"):
                gvf_non_essential.append(line)
            else:
                f_list = line.split("\t")
                if len(f_list) < expected_columns:
                    # same exception type as plain indexing would raise, but it says where
                    raise IndexError(
                        f"GVF feature line {line_number} has {len(f_list)} column(s), "
                        f"expected at least {expected_columns}"
                    )
                # any columns beyond the ninth are ignored
                gvf_lines_obj_list.append(GvfFeatureline(*f_list[:expected_columns]))
    return gvf_pragmas, gvf_non_essential, gvf_lines_obj_list
60+
3761
def generate_symbolic_allele_dict(mapping_dictionary):
3862
"""Reads in mapping dictionary and returns a symbolic allele dictionary.
3963
:param mapping_dictionary: mapping dictionary
4064
:return symbolic_allele_dict: stores information for a symbolic allele
4165
"""
4266
symbolic_allele_dict = {}
4367
for attribute in mapping_dictionary:
44-
header_type= "ALT"
68+
# Symbolic alleles refer only to the header type "ALT".
69+
header_type = "ALT"
4570
if mapping_dictionary[attribute].get(header_type) is not None:
4671
if mapping_dictionary[attribute].get(header_type).get("FieldKey") is not None:
4772
name = attribute
@@ -54,27 +79,23 @@ def generate_symbolic_allele_dict(mapping_dictionary):
5479
symbolic_allele_dict.setdefault(name, []).append(description)
5580
return symbolic_allele_dict
5681

57-
58-
def build_iupac_ambiguity_code():
    """ Builds dictionary for the iupac ambiguity code.
    :return: iupac_ambiguity_dictionary: iupac code as key, list of values as value
    """
    # see PMID: 20202974 (Table 1) for the official list
    # Each code appears exactly once; a single literal keeps every code next to its
    # bases so the two cannot drift apart. A new dictionary (with new lists) is
    # built on every call, so callers may modify the result freely.
    iupac_ambiguity_dictionary = {
        "R": ["A", "G"],
        "Y": ["C", "T"],
        "M": ["A", "C"],
        "K": ["G", "T"],
        "S": ["C", "G"],
        "D": ["A", "G", "T"],
        "W": ["A", "T"],
        "H": ["A", "C", "T"],
        "B": ["C", "G", "T"],
        "V": ["A", "C", "G"],
        "N": ["A", "C", "G", "T"],
    }
    return iupac_ambiguity_dictionary

0 commit comments

Comments
 (0)