Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
09920e4
dictionary comparison
khetherin Nov 7, 2025
e00bea3
fixed issue so FORMAT sample values in VCF file are all present
khetherin Nov 7, 2025
9df757b
added format to compare_and_merge_lines
khetherin Nov 10, 2025
129f129
edited format to compare_and_merge_lines
khetherin Nov 10, 2025
579db86
tidy
khetherin Nov 10, 2025
120db8a
tidy
khetherin Nov 10, 2025
49984b8
edited test function name
khetherin Nov 12, 2025
a3e7e3c
edited test function name
khetherin Nov 12, 2025
f6cca1f
edited test function
khetherin Nov 12, 2025
02a3ac2
deal with nested dictionaries better
khetherin Nov 12, 2025
89e6933
reformatting function format_sample_values
khetherin Nov 14, 2025
a1a24b6
tests passed
khetherin Nov 14, 2025
76dc2c5
Extensive Refactor
khetherin Nov 20, 2025
c6f3af3
removed redundant code and moved code to setUp
khetherin Nov 24, 2025
b4c3c34
added comments
khetherin Nov 24, 2025
fa0d021
removed empty lines
khetherin Nov 24, 2025
0a15978
focused on removing INFO string. Use info_dict to store values. Uses …
khetherin Nov 24, 2025
c93a092
check for presence in dictionary before deletion
khetherin Nov 24, 2025
3d0ff68
correct formatting IMPRECISE in INFO
khetherin Nov 24, 2025
407c27b
removed dead code
khetherin Nov 24, 2025
52f7029
removed dead code
khetherin Nov 24, 2025
e02ee5e
removed empty line
khetherin Nov 24, 2025
cb25aab
added comments
khetherin Nov 25, 2025
7626e99
removed dead code
khetherin Nov 25, 2025
9cd2973
replaced with f-string
khetherin Nov 25, 2025
423cc68
minor edit
khetherin Nov 25, 2025
e533c59
ordered the format keys and format the FORMAT keys string in __str__
khetherin Nov 25, 2025
ac818e7
minor docstring edit
khetherin Nov 26, 2025
6a4a4e0
removed format_values_by_sample_string
khetherin Nov 26, 2025
a21658c
removed duplicate merged VCF lines. removed extraneous ID in INFO
khetherin Nov 27, 2025
4015852
adjusted algorithm for merging
khetherin Nov 28, 2025
9cc23a2
edited testing
khetherin Nov 28, 2025
c98c291
minor edit
khetherin Nov 28, 2025
a0341e5
edited test_format_sample_values
khetherin Nov 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions convert_gvf_to_vcf/assistingconverter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
This is an assistant converter to help convert gvf attributes
This contains functions to assist the conversion of gvf attributes
"""
import os
from convert_gvf_to_vcf.logger import logger
Expand Down Expand Up @@ -95,5 +95,7 @@ def convert_gvf_attributes_to_vcf_values(column9_of_gvf,
else:
logger.info(f"catching attribute keys for review at a later date {attrib_key} {attrib_value}")
catching_for_review.append(attrib_key)
info_string = ''.join(f'{key}={value};' for key, value in vcf_info_values.items()).rstrip(';')
return gvf_attribute_dictionary, info_string, vcf_format_values
# info_string = ''.join(f'{key}={value};' for key, value in vcf_info_values.items()).rstrip(';')
# print(type(vcf_info_values))
# print(vcf_info_values)
return gvf_attribute_dictionary, vcf_info_values, vcf_format_values
329 changes: 183 additions & 146 deletions convert_gvf_to_vcf/convertGVFtoVCF.py

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions convert_gvf_to_vcf/lookup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os

from convert_gvf_to_vcf.utils import read_yaml, generate_symbolic_allele_dict, build_iupac_ambiguity_code

# setting up paths to useful directories
# convert_gvf_to_vcf_folder: the directory that contains this source file
# etc_folder: the 'etc' subdirectory of that directory
convert_gvf_to_vcf_folder = os.path.dirname(__file__)
etc_folder = os.path.join(convert_gvf_to_vcf_folder, 'etc')


class Lookup:
    """
    Holds the look up dictionaries and files that are needed for a VCF file.

    Attributes set on construction:
    - mapping_attribute_dict: content of etc/attribute_mapper.yaml as returned by read_yaml
    - symbolic_allele_dictionary: result of generate_symbolic_allele_dict for that mapping
    - assembly_file: the assembly_file argument, stored unchanged
    - iupac_ambiguity_dictionary: result of build_iupac_ambiguity_code
    """
    def __init__(self, assembly_file):
        """Load the attribute mapping and build the dictionaries derived from it.
        :param assembly_file: the assembly file; kept on the instance as-is
        """
        attribute_mapper_path = os.path.join(etc_folder, "attribute_mapper.yaml")
        attribute_mapping = read_yaml(attribute_mapper_path)

        self.assembly_file = assembly_file
        self.mapping_attribute_dict = attribute_mapping
        # the symbolic allele dictionary is derived from the attribute mapping
        self.symbolic_allele_dictionary = generate_symbolic_allele_dict(attribute_mapping)
        self.iupac_ambiguity_dictionary = build_iupac_ambiguity_code()
        # NOTE(review): a dictionary of all possible VCF header lines per header type
        # (ALT, INFO, FILTER, FORMAT) was drafted here but is currently not built.
71 changes: 46 additions & 25 deletions convert_gvf_to_vcf/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# this file contains readers
"""This contains readers and utilities"""
import os
import yaml

Expand Down Expand Up @@ -34,14 +34,39 @@ def read_pragma_mapper(pragma_mapper_file):
pragma_to_vcf_header[pragma] = vcf_header
return pragma_to_vcf_header

def read_in_gvf_file(gvf_input):
    """ Reads in the user provided GVF file.

    Blank (empty or whitespace-only) lines are skipped. Without this, such a line
    (for example a trailing empty line) would be treated as a feature line and
    fail with an IndexError because it has no tab-separated columns.

    :param gvf_input: arguments.gvf_input : The input GVF file
    :return:
      - gvf_pragmas: list of pragma lines (start with ## at the top of GVF file)
      - gvf_non_essential: list of non essential pragma (start with # near the top of GVF file)
      - gvf_lines_obj_list: list of objects where each object represents a GVF feature line
    :raises IndexError: if a non-blank feature line has fewer than nine tab-separated columns
    """
    gvf_pragmas = []  # list of pragma lines starting with: ##
    gvf_non_essential = []  # list of non-essential lines starting with: #
    gvf_lines_obj_list = []  # list of objects when reading in gvf files, one object represents a gvf line

    with open(gvf_input) as gvf_file:
        for raw_line in gvf_file:
            # strip once; trailing whitespace (including the newline) is never kept
            line = raw_line.rstrip()
            if not line:
                # nothing to parse on a blank line
                continue
            if line.startswith("##"):
                gvf_pragmas.append(line)
            elif line.startswith("#"):
                gvf_non_essential.append(line)
            else:
                # a feature line: the first nine tab-separated columns build the object
                f_list = line.split("\t")
                line_object = GvfFeatureline(f_list[0], f_list[1], f_list[2], f_list[3], f_list[4],
                                             f_list[5], f_list[6], f_list[7], f_list[8])
                gvf_lines_obj_list.append(line_object)
    return gvf_pragmas, gvf_non_essential, gvf_lines_obj_list

def generate_symbolic_allele_dict(mapping_dictionary):
"""Reads in mapping dictionary and returns a symbolic allele dictionary.
:param mapping_dictionary: mapping dictionary
:return symbolic_allele_dict: stores information for a symbolic allele
"""
symbolic_allele_dict = {}
for attribute in mapping_dictionary:
header_type= "ALT"
# Symbolic alleles refer only to the header type "ALT".
header_type = "ALT"
if mapping_dictionary[attribute].get(header_type) is not None:
if mapping_dictionary[attribute].get(header_type).get("FieldKey") is not None:
name = attribute
Expand All @@ -54,27 +79,23 @@ def generate_symbolic_allele_dict(mapping_dictionary):
symbolic_allele_dict.setdefault(name, []).append(description)
return symbolic_allele_dict


def build_iupac_ambiguity_code():
    """ Builds dictionary for the iupac ambiguity code.

    A new dictionary (with new lists) is created on every call, so callers may
    modify the result without affecting later calls.

    :return: iupac_ambiguity_dictionary: iupac code as key, list of values as value
    """
    # see PMID: 20202974 (Table 1) for the official list
    # Each code is written exactly once, next to the bases it stands for, so a
    # code and its values cannot get out of step.
    iupac_ambiguity_dictionary = {
        "R": ["A", "G"],
        "Y": ["C", "T"],
        "M": ["A", "C"],
        "K": ["G", "T"],
        "S": ["C", "G"],
        "D": ["A", "G", "T"],
        "W": ["A", "T"],
        "H": ["A", "C", "T"],
        "B": ["C", "G", "T"],
        "V": ["A", "C", "G"],
        "N": ["A", "C", "G", "T"],
    }
    return iupac_ambiguity_dictionary
Loading