Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions convert_gvf_to_vcf/assistingconverter.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,4 @@ def convert_gvf_attributes_to_vcf_values(column9_of_gvf,
else:
logger.info(f"catching attribute keys for review at a later date {attrib_key} {attrib_value}")
catching_for_review.append(attrib_key)
# info_string = ''.join(f'{key}={value};' for key, value in vcf_info_values.items()).rstrip(';')
# print(type(vcf_info_values))
# print(vcf_info_values)
return gvf_attribute_dictionary, vcf_info_values, vcf_format_values
13 changes: 8 additions & 5 deletions convert_gvf_to_vcf/convertGVFtoVCF.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,12 @@ def generate_vcf_header_metainfo(gvf_pragmas,
vcf_header_key, pragma_name, pragma_value = get_pragma_name_and_value(non_essential_pragma, ": ", list_of_non_essential_pragma, pragma_to_vcf_map)
if pragma_name.startswith("#Publication"):
publication_tokens = get_pragma_tokens(pragma_value, ";", "=")
pragmas_to_add.append(generate_vcf_header_unstructured_line(publication_tokens[0], publication_tokens[1]))
for pub_token in publication_tokens:
pragmas_to_add.append(generate_vcf_header_unstructured_line(pub_token[0], pub_token[1]))
elif pragma_name == "#Study":
study_tokens = get_pragma_tokens(pragma_value, ";", "=")
pragmas_to_add.append(generate_vcf_header_unstructured_line(study_tokens[0], study_tokens[1]))
for s_token in study_tokens:
pragmas_to_add.append(generate_vcf_header_unstructured_line(s_token[0], s_token[1]))
else:
if vcf_header_key is not None:
pragmas_to_add.append(generate_vcf_header_unstructured_line(vcf_header_key, pragma_value))
Expand Down Expand Up @@ -147,8 +149,9 @@ def parse_pragma(pragma_to_parse, delimiter):
pragma_value = None
logger.warning(f"WARNING: no value for the following pragma {pragma_to_parse}")
return pragma_name, pragma_value
except ValueError:
logger.error(f"Skipping this, can't be parsed {pragma_to_parse}")
except AttributeError as e:
logger.error(f"Skipping this, can't be parsed {pragma_to_parse}: {e}")
raise AttributeError(f"Cannot parse {pragma_to_parse}")

def get_pragma_name_and_value(pragma_to_parse, delimiter, pragma_list, pragma_name_to_vcf_dict):
"""Get pragma name and value and its corresponding VCF header key.
Expand All @@ -175,7 +178,7 @@ def get_pragma_tokens(pragma_value, first_delimiter, second_delimiter):
initial_list = pragma_value.split(first_delimiter)
pragma_tokens = []
for element in initial_list:
pragma_tokens = element.split(second_delimiter)
pragma_tokens.append(element.split(second_delimiter))
return pragma_tokens

# This is the main conversion logic
Expand Down
105 changes: 76 additions & 29 deletions tests/test_convert_gvf_to_vcf.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
#TODO: 5 test
import os.path
import unittest


from convert_gvf_to_vcf.lookup import Lookup
#from convert_gvf_to_vcf.utils import read_file
from convert_gvf_to_vcf.convertGVFtoVCF import generate_vcf_header_unstructured_line, read_in_gvf_file, convert_gvf_features_to_vcf_objects, generate_vcf_header_metainfo, generate_vcf_header_line, compare_vcf_objects, determine_merge_or_keep_vcf_objects
from convert_gvf_to_vcf.vcfline import VcfLine
from convert_gvf_to_vcf.convertGVFtoVCF import generate_vcf_header_unstructured_line, read_in_gvf_file, \
convert_gvf_features_to_vcf_objects, generate_vcf_header_metainfo, generate_vcf_header_line, compare_vcf_objects, \
determine_merge_or_keep_vcf_objects, merge_vcf_objects, parse_pragma, get_pragma_name_and_value, get_pragma_tokens, \
keep_vcf_objects



class TestConvertGVFtoVCF(unittest.TestCase):
Expand All @@ -29,20 +31,53 @@ def setUp(self):


def test_generate_vcf_header_structured_lines(self):
    """Check that a key/value pair is rendered as a ``##key=value`` header line."""
    header_key, header_value = "fileformat", "VCFv4.4"
    rendered_line = generate_vcf_header_unstructured_line(header_key, header_value)
    assert rendered_line == "##fileformat=VCFv4.4"

def test_generate_custom_unstructured_meta_line(self):
    """Check that an arbitrary key/value pair is formatted as ``##key=value``."""
    expected_line = "##test_string_key=test_string_value"
    assert generate_vcf_header_unstructured_line("test_string_key", "test_string_value") == expected_line

def test_parse_pragma(self):
    """Exercise parse_pragma with a complete, a name-only and an invalid pragma.

    Name and value are asserted separately. A statement of the form
    ``assert name, value == "..."`` is parsed as ``assert name`` with the
    comparison used as the failure message, so it passes for any truthy
    name and never checks the value.
    """
    delimiter = " "
    # Pragma carrying both a name and a value.
    name, value = parse_pragma("##file-date 2015-07-15", delimiter)
    assert name == "##file-date"
    assert value == "2015-07-15"
    # Pragma with a name only: the value is expected to be None (a warning is logged).
    name, value = parse_pragma("##file-date", delimiter)
    assert name == "##file-date"
    assert value is None
    # A pragma that is not a string cannot be parsed and must raise AttributeError.
    with self.assertRaises(AttributeError):
        parse_pragma(None, delimiter)

def test_get_pragma_name_and_value(self):
    """Check the VCF header key, pragma name and pragma value returned for ``##file-date``."""
    known_pragmas = ["##file-date", "##gff-version", "##gvf-version", "##species", "##genome-build"]
    vcf_key_by_pragma = {
        '##file-date': 'fileDate',
        '##gff-version': 'gff-version',
        '##gvf-version': 'gvf-version',
        '##species': 'species',
        '##genome-build': 'genome-build',
        '#sample': 'sample',
        '#Study_accession': 'Study_accession',
        '#Study_type': 'Study_type',
        '#Display_name': 'Display_name',
        '#Publication': 'Publication',
        '#Study': 'Study',
        '#Assembly_name': 'Assembly_name',
        '#subject': 'subject',
    }
    header_key, name, value = get_pragma_name_and_value(
        "##file-date 2015-07-15", " ", known_pragmas, vcf_key_by_pragma
    )
    # One tuple comparison covers all three returned fields.
    assert (header_key, name, value) == ("fileDate", "##file-date", "2015-07-15")

def test_get_pragma_tokens(self):
    """Check that a pragma value is split into one ``[key, value]`` pair per ';'-separated field."""
    pragma_value = "First_author=Kim Brown;Description=Comparative genomic hybridization analysis of 3 laboratory and one wild zebrafish populations for Copy Number Variants"
    pragma_tokens = get_pragma_tokens(pragma_value, ";", "=")
    # Comparing the whole list checks the number of pairs and their contents at once.
    assert pragma_tokens == [
        ["First_author", "Kim Brown"],
        ["Description", "Comparative genomic hybridization analysis of 3 laboratory and one wild zebrafish populations for Copy Number Variants"],
    ]
    # Negative check: unrelated tokens must not compare equal to the parsed result.
    self.assertNotEqual(pragma_tokens, [['A', '1'], ['B', '2']])

def test_generate_vcf_metainfo(self):
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
Expand All @@ -59,15 +94,20 @@ def test_generate_vcf_metainfo(self):
)
print(unique_pragmas_to_add)
assert unique_pragmas_to_add == ['##fileformat=VCFv4.4', '##gff-version=3', '##source=DGVa', '##gvf-version=1.06',
'##species=http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=7955', '##fileDate=2015-07-15',
'##genome-build=NCBIGRCz10', '##Study_accession=nstd62', '##Study_type=Control Set',
'##Display_name=Brown_et_al_2012', '##Publication_year=2012',
'##species=http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=7955',
'##fileDate=2015-07-15', '##genome-build=NCBIGRCz10', '##Study_accession=nstd62',
'##Study_type=Control Set', '##Display_name=Brown_et_al_2012', '##PMID=22203992',
'##Journal=Proceedings of the National Academy of Sciences of the United States of America',
'##Paper_title=Extensive genetic diversity and substructuring among zebrafish strains revealed through copy number variant analysis.',
'##Publication_year=2012', '##First_author=Kim Brown',
'##Description=Comparative genomic hybridization analysis of 3 laboratory and one wild zebrafish populations for Copy Number Variants',
'##Assembly_name=GRCz10', '##subject=subject_name=Wilds2-3', '##subject=subject_name=Zon9', '##subject=subject_name=JenMale7;subject_sex=Male',
'##subject=subject_name=JenMale6;subject_sex=Male', '##sample=sample_name=JenMale6;subject_name=JenMale6', '##sample=sample_name=Wilds2-3;subject_name=Wilds2-3',
'##Assembly_name=GRCz10', '##subject=subject_name=Wilds2-3', '##subject=subject_name=Zon9',
'##subject=subject_name=JenMale7;subject_sex=Male', '##subject=subject_name=JenMale6;subject_sex=Male',
'##sample=sample_name=JenMale6;subject_name=JenMale6', '##sample=sample_name=Wilds2-3;subject_name=Wilds2-3',
'##sample=sample_name=Zon9;subject_name=Zon9', '##sample=sample_name=JenMale7;subject_name=JenMale7']



assert unique_alt_lines_to_add == ['##ALT=<ID=DEL,Description="Deletion">', '##ALT=<ID=DUP,Description="Duplication">']
assert unique_info_lines_to_add == ['##INFO=<ID=ID,Number=.,Type=String,Description="A unique identifier">', '##INFO=<ID=NAME,Number=.,Type=String,Description="Name">', '##INFO=<ID=ALIAS,Number=.,Type=String,Description="Secondary Name">', '##INFO=<ID=VARCALLSOID,Number=.,Type=String,Description="Variant call Sequence ontology ID">', '##INFO=<ID=SVCID,Number=.,Type=Integer,Description="submitter variant call ID">', '##INFO=<ID=REMAP,Number=.,Type=Float,Description="Remap score">', '##INFO=<ID=VARSEQ,Number=.,Type=String,Description="Alleles found in an individual (or group of individuals).">', '##INFO=<ID=END,Number=1,Type=Integer,Description="End position on CHROM (used with symbolic alleles; see below) or End position of the longest variant described in this record">', '##INFO=<ID=SVLEN,Number=A,Type=String,Description="Length of structural variant">', '##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">', '##INFO=<ID=CIPOS,Number=.,Type=Integer,Description="Confidence interval around POS for symbolic structural variants">', '##INFO=<ID=CIEND,Number=.,Type=Integer,Description="Confidence interval around END for symbolic structural variants">', '##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">', '##INFO=<ID=DBXREF,Number=.,Type=String,Description="A database cross-reference">', '##INFO=<ID=AD,Number=R,Type=Integer,Description="Total read depth for each allele">']

Expand All @@ -94,7 +134,9 @@ def test_gvf_features_to_vcf_objects(self):

def test_compare_vcf_objects(self):
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
print(gvf_lines_obj_list)
header_standard_lines_dictionary, vcf_data_lines, list_of_vcf_objects = convert_gvf_features_to_vcf_objects(gvf_lines_obj_list, self.reference_lookup)
print(list_of_vcf_objects)
# compare object, if equal, True, if not equal, False # (next function will make true = current and merge; false= previous)
expected_flags_for_list_of_vcf_objects = [False, # line 1 vs 2
False, # line 2 vs 3
Expand All @@ -106,21 +148,29 @@ def test_compare_vcf_objects(self):
actual_flags_for_list_of_vcf_objects = compare_vcf_objects(list_of_vcf_objects)
assert actual_flags_for_list_of_vcf_objects == expected_flags_for_list_of_vcf_objects

def test_merge_vcf_objects(self):
# gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
# header_standard_lines_dictionary, vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(
# gvf_lines_obj_list, self.reference_lookup)
# list_of_samples = ['JenMale6', 'Wilds2-3', 'Zon9', 'JenMale7']
# # use lines 4 and 5 of gvf file
# previous = list_of_vcf_objects[3] # line 4
# current = list_of_vcf_objects[4] #line 5
# merged_object = merge_vcf_objects(previous, current, list_of_samples)
# to_check = ('chromosome1', 127, '13;14', 'GTACGTACG', '<DUP>', '.', '.', 'ID=13,14;SVCID=CNV6230,CNV5711;ALIAS=CNV6230,CNV5711;END=131;NAME=nssv1389474,nssv1388955;VARCALLSOID=SO:0001742;AC=3;SVLEN=4;REMAP=.69625,.85344;VARSEQ=.', '.', '.\t.\t.\t.')
# assert merged_object == to_check #TODO: the info_string is different each time, ensure order is preserved
pass
# def test_merge_vcf_objects(self):
# gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
# header_standard_lines_dictionary, vcf_data_lines, list_of_vcf_objects = convert_gvf_features_to_vcf_objects(
# gvf_lines_obj_list, self.reference_lookup)
# list_of_samples = ['JenMale6', 'Wilds2-3', 'Zon9', 'JenMale7']
# # # use lines 4 and 5 of gvf file
# previous = list_of_vcf_objects[3] # line 4
# current = list_of_vcf_objects[4] #line 5
# merged_object = merge_vcf_objects(previous, current, list_of_samples)
#
# to_check = '\t'.join(['chromosome1', '127', '13;14', 'GTACGTACG', '<DUP>', '.', '.',
# 'ALIAS=CNV6230,CNV5711;NAME=nssv1389474,nssv1388955;VARSEQ=.;REMAP=.69625,.85344;SVCID=CNV6230,CNV5711;VARCALLSOID=SO:0001742;AC=3;SVLEN=4;END=131',
# '.', '.\t.\t.\t.'])
# assert merged_object == to_check

def test_keep_vcf_objects(self):
    """Smoke test: call keep_vcf_objects on the second converted VCF object."""
    _, _, gvf_lines = read_in_gvf_file(self.input_file)
    _, _, vcf_objects = convert_gvf_features_to_vcf_objects(gvf_lines, self.reference_lookup)
    sample_names = ['JenMale6', 'Wilds2-3', 'Zon9', 'JenMale7']
    # NOTE(review): the result is not checked; consider asserting on it once the
    # expected output of keep_vcf_objects is confirmed.
    keep_vcf_objects(vcf_objects[1], sample_names)


def test_determine_merge_or_keep_vcf_objects(self):
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
Expand All @@ -137,8 +187,5 @@ def test_determine_merge_or_keep_vcf_objects(self):
assert merged_or_kept_objects[3].id == "13;14"
assert merged_or_kept_objects[3].info_dict["NAME"] == "nssv1389474,nssv1388955"




if __name__ == '__main__':
unittest.main()
Loading