diff --git a/convert_gvf_to_vcf/assistingconverter.py b/convert_gvf_to_vcf/assistingconverter.py index f0738f9..31e1d1a 100644 --- a/convert_gvf_to_vcf/assistingconverter.py +++ b/convert_gvf_to_vcf/assistingconverter.py @@ -95,7 +95,4 @@ def convert_gvf_attributes_to_vcf_values(column9_of_gvf, else: logger.info(f"catching attribute keys for review at a later date {attrib_key} {attrib_value}") catching_for_review.append(attrib_key) - # info_string = ''.join(f'{key}={value};' for key, value in vcf_info_values.items()).rstrip(';') - # print(type(vcf_info_values)) - # print(vcf_info_values) return gvf_attribute_dictionary, vcf_info_values, vcf_format_values diff --git a/convert_gvf_to_vcf/convertGVFtoVCF.py b/convert_gvf_to_vcf/convertGVFtoVCF.py index a336019..d38df05 100644 --- a/convert_gvf_to_vcf/convertGVFtoVCF.py +++ b/convert_gvf_to_vcf/convertGVFtoVCF.py @@ -87,10 +87,12 @@ def generate_vcf_header_metainfo(gvf_pragmas, vcf_header_key, pragma_name, pragma_value = get_pragma_name_and_value(non_essential_pragma, ": ", list_of_non_essential_pragma, pragma_to_vcf_map) if pragma_name.startswith("#Publication"): publication_tokens = get_pragma_tokens(pragma_value, ";", "=") - pragmas_to_add.append(generate_vcf_header_unstructured_line(publication_tokens[0], publication_tokens[1])) + for pub_token in publication_tokens: + pragmas_to_add.append(generate_vcf_header_unstructured_line(pub_token[0], pub_token[1])) elif pragma_name == "#Study": study_tokens = get_pragma_tokens(pragma_value, ";", "=") - pragmas_to_add.append(generate_vcf_header_unstructured_line(study_tokens[0], study_tokens[1])) + for s_token in study_tokens: + pragmas_to_add.append(generate_vcf_header_unstructured_line(s_token[0], s_token[1])) else: if vcf_header_key is not None: pragmas_to_add.append(generate_vcf_header_unstructured_line(vcf_header_key, pragma_value)) @@ -147,8 +149,9 @@ def parse_pragma(pragma_to_parse, delimiter): pragma_value = None logger.warning(f"WARNING: no value for the following pragma {pragma_to_parse}") return pragma_name, pragma_value - except ValueError: - logger.error(f"Skipping this, can't be parsed {pragma_to_parse}") + except AttributeError as e: + logger.error(f"Skipping this, can't be parsed {pragma_to_parse}: {e}") + raise AttributeError(f"Cannot parse {pragma_to_parse}") def get_pragma_name_and_value(pragma_to_parse, delimiter, pragma_list, pragma_name_to_vcf_dict): """Get pragma name and value and its corresponding VCF header key. @@ -175,7 +178,7 @@ def get_pragma_tokens(pragma_value, first_delimiter, second_delimiter): initial_list = pragma_value.split(first_delimiter) pragma_tokens = [] for element in initial_list: - pragma_tokens = element.split(second_delimiter) + pragma_tokens.append(element.split(second_delimiter)) return pragma_tokens # This is the main conversion logic diff --git a/tests/test_convert_gvf_to_vcf.py b/tests/test_convert_gvf_to_vcf.py index bcb49af..e3636ce 100644 --- a/tests/test_convert_gvf_to_vcf.py +++ b/tests/test_convert_gvf_to_vcf.py @@ -1,11 +1,13 @@ -#TODO: 5 test import os.path import unittest + from convert_gvf_to_vcf.lookup import Lookup -#from convert_gvf_to_vcf.utils import read_file -from convert_gvf_to_vcf.convertGVFtoVCF import generate_vcf_header_unstructured_line, read_in_gvf_file, convert_gvf_features_to_vcf_objects, generate_vcf_header_metainfo, generate_vcf_header_line, compare_vcf_objects, determine_merge_or_keep_vcf_objects -from convert_gvf_to_vcf.vcfline import VcfLine +from convert_gvf_to_vcf.convertGVFtoVCF import generate_vcf_header_unstructured_line, read_in_gvf_file, \ + convert_gvf_features_to_vcf_objects, generate_vcf_header_metainfo, generate_vcf_header_line, compare_vcf_objects, \ + determine_merge_or_keep_vcf_objects, merge_vcf_objects, parse_pragma, get_pragma_name_and_value, get_pragma_tokens, \ + keep_vcf_objects + class TestConvertGVFtoVCF(unittest.TestCase): @@ -29,20 +31,53 @@ def setUp(self): def test_generate_vcf_header_structured_lines(self): - pass + key_to_test = "fileformat" + value_to_test= "VCFv4.4" + actual_output = generate_vcf_header_unstructured_line(key_to_test, value_to_test) + assert actual_output == "##fileformat=VCFv4.4" def test_generate_custom_unstructured_meta_line(self): formatted_string = generate_vcf_header_unstructured_line("test_string_key", "test_string_value") assert formatted_string == "##test_string_key=test_string_value" def test_parse_pragma(self): - pass + # testing: pragma has a name and value + essential_pragma = "##file-date 2015-07-15" + delimiter = " " + name, value = parse_pragma(essential_pragma, delimiter) + assert name, value == "##file-date, 2015-07-15" + # testing: pragma has only name, no value, should print warning + name_only_pragma = "##file-date" + name, value = parse_pragma(name_only_pragma, delimiter) + assert name, value == "##file-date, None" + # testing: invalid pragmas + invalid_pragma = None + with self.assertRaises(AttributeError): + parse_pragma(invalid_pragma, delimiter) def test_get_pragma_name_and_value(self): - pass + pragma_to_test = "##file-date 2015-07-15" + delimiter = " " + list_of_pragma = ["##file-date", "##gff-version", "##gvf-version", "##species", "##genome-build"] + pragma_to_vcf_map = {'##file-date': 'fileDate', '##gff-version': 'gff-version', '##gvf-version': 'gvf-version', '##species': 'species', '##genome-build': 'genome-build', '#sample': 'sample', '#Study_accession': 'Study_accession', '#Study_type': 'Study_type', '#Display_name': 'Display_name', '#Publication': 'Publication', '#Study': 'Study', '#Assembly_name': 'Assembly_name', '#subject': 'subject'} + vcf_header_key, pragma_name, pragma_value = get_pragma_name_and_value(pragma_to_test, delimiter, list_of_pragma, pragma_to_vcf_map) + assert vcf_header_key == "fileDate" + assert pragma_name == "##file-date" + assert pragma_value == "2015-07-15" def test_get_pragma_tokens(self): - pass + pragma_value = "First_author=Kim Brown;Description=Comparative genomic hybridization analysis of 3 laboratory and one wild zebrafish populations for Copy Number Variants" + pragma_tokens = get_pragma_tokens(pragma_value, ";", "=") + assert len(pragma_tokens) == 2 + # Testing: expected + assert pragma_tokens[0][0] == "First_author" + assert pragma_tokens[0][1] == "Kim Brown" + assert pragma_tokens[1][0] == "Description" + assert pragma_tokens[1][1] == "Comparative genomic hybridization analysis of 3 laboratory and one wild zebrafish populations for Copy Number Variants" + # Testing: not expected + unexpected_pragma_tokens = [['A', '1'], ['B', '2']] + with self.assertRaises(AssertionError): + self.assertEqual(unexpected_pragma_tokens, pragma_tokens) def test_generate_vcf_metainfo(self): gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) @@ -59,15 +94,20 @@ def test_generate_vcf_metainfo(self): ) print(unique_pragmas_to_add) assert unique_pragmas_to_add == ['##fileformat=VCFv4.4', '##gff-version=3', '##source=DGVa', '##gvf-version=1.06', - '##species=http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=7955', '##fileDate=2015-07-15', - '##genome-build=NCBIGRCz10', '##Study_accession=nstd62', '##Study_type=Control Set', - '##Display_name=Brown_et_al_2012', '##Publication_year=2012', + '##species=http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=7955', + '##fileDate=2015-07-15', '##genome-build=NCBIGRCz10', '##Study_accession=nstd62', + '##Study_type=Control Set', '##Display_name=Brown_et_al_2012', '##PMID=22203992', + '##Journal=Proceedings of the National Academy of Sciences of the United States of America', + '##Paper_title=Extensive genetic diversity and substructuring among zebrafish strains revealed through copy number variant analysis.', + '##Publication_year=2012', '##First_author=Kim Brown', '##Description=Comparative genomic hybridization analysis of 3 laboratory and one wild zebrafish populations for Copy Number Variants', - '##Assembly_name=GRCz10', '##subject=subject_name=Wilds2-3', '##subject=subject_name=Zon9', '##subject=subject_name=JenMale7;subject_sex=Male', - '##subject=subject_name=JenMale6;subject_sex=Male', '##sample=sample_name=JenMale6;subject_name=JenMale6', '##sample=sample_name=Wilds2-3;subject_name=Wilds2-3', + '##Assembly_name=GRCz10', '##subject=subject_name=Wilds2-3', '##subject=subject_name=Zon9', + '##subject=subject_name=JenMale7;subject_sex=Male', '##subject=subject_name=JenMale6;subject_sex=Male', + '##sample=sample_name=JenMale6;subject_name=JenMale6', '##sample=sample_name=Wilds2-3;subject_name=Wilds2-3', '##sample=sample_name=Zon9;subject_name=Zon9', '##sample=sample_name=JenMale7;subject_name=JenMale7'] + assert unique_alt_lines_to_add == ['##ALT=', '##ALT='] assert unique_info_lines_to_add == ['##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO='] @@ -94,7 +134,9 @@ def test_gvf_features_to_vcf_objects(self): def test_compare_vcf_objects(self): gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) + print(gvf_lines_obj_list) header_standard_lines_dictionary, vcf_data_lines, list_of_vcf_objects = convert_gvf_features_to_vcf_objects(gvf_lines_obj_list, self.reference_lookup) + print(list_of_vcf_objects) # compare object, if equal, True, if not equal, False # (next function will make true = current and merge; false= previous) expected_flags_for_list_of_vcf_objects = [False, # line 1 vs 2 False, # line 2 vs 3 @@ -106,21 +148,29 @@ def test_compare_vcf_objects(self): actual_flags_for_list_of_vcf_objects = compare_vcf_objects(list_of_vcf_objects) assert actual_flags_for_list_of_vcf_objects == expected_flags_for_list_of_vcf_objects - def test_merge_vcf_objects(self): - # gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) - # header_standard_lines_dictionary, vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects( - # gvf_lines_obj_list, self.reference_lookup) - # list_of_samples = ['JenMale6', 'Wilds2-3', 'Zon9', 'JenMale7'] - # # use lines 4 and 5 of gvf file - # previous = list_of_vcf_objects[3] # line 4 - # current = list_of_vcf_objects[4] #line 5 - # merged_object = merge_vcf_objects(previous, current, list_of_samples) - # to_check = ('chromosome1', 127, '13;14', 'GTACGTACG', '', '.', '.', 'ID=13,14;SVCID=CNV6230,CNV5711;ALIAS=CNV6230,CNV5711;END=131;NAME=nssv1389474,nssv1388955;VARCALLSOID=SO:0001742;AC=3;SVLEN=4;REMAP=.69625,.85344;VARSEQ=.', '.', '.\t.\t.\t.') - # assert merged_object == to_check #TODO: the info_string is different each time, ensure order is preserved - pass + # def test_merge_vcf_objects(self): + # gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) + # header_standard_lines_dictionary, vcf_data_lines, list_of_vcf_objects = convert_gvf_features_to_vcf_objects( + # gvf_lines_obj_list, self.reference_lookup) + # list_of_samples = ['JenMale6', 'Wilds2-3', 'Zon9', 'JenMale7'] + # # # use lines 4 and 5 of gvf file + # previous = list_of_vcf_objects[3] # line 4 + # current = list_of_vcf_objects[4] #line 5 + # merged_object = merge_vcf_objects(previous, current, list_of_samples) + # + # to_check = '\t'.join(['chromosome1', '127', '13;14', 'GTACGTACG', '', '.', '.', + # 'ALIAS=CNV6230,CNV5711;NAME=nssv1389474,nssv1388955;VARSEQ=.;REMAP=.69625,.85344;SVCID=CNV6230,CNV5711;VARCALLSOID=SO:0001742;AC=3;SVLEN=4;END=131', + # '.', '.\t.\t.\t.']) + # assert merged_object == to_check def test_keep_vcf_objects(self): - pass + gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) + header_standard_lines_dictionary, vcf_data_lines, list_of_vcf_objects = convert_gvf_features_to_vcf_objects( + gvf_lines_obj_list, self.reference_lookup) + list_of_samples = ['JenMale6', 'Wilds2-3', 'Zon9', 'JenMale7'] + previous_object = list_of_vcf_objects[1] + keep_vcf_objects(previous_object, list_of_samples) + def test_determine_merge_or_keep_vcf_objects(self): gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) @@ -137,8 +187,5 @@ def test_determine_merge_or_keep_vcf_objects(self): assert merged_or_kept_objects[3].id == "13;14" assert merged_or_kept_objects[3].info_dict["NAME"] == "nssv1389474,nssv1388955" - - - if __name__ == '__main__': unittest.main() diff --git a/tests/test_vcfline.py b/tests/test_vcfline.py index 9fb4f6a..fbaadac 100644 --- a/tests/test_vcfline.py +++ b/tests/test_vcfline.py @@ -1,11 +1,11 @@ -#TODO: 9 tests +#TODO: 6 tests import os import unittest from convert_gvf_to_vcf.convertGVFtoVCF import generate_vcf_header_structured_lines, convert_gvf_features_to_vcf_objects, \ generate_vcf_header_metainfo from convert_gvf_to_vcf.gvffeature import GvfFeatureline -from convert_gvf_to_vcf.utils import read_yaml, generate_symbolic_allele_dict, read_in_gvf_file +from convert_gvf_to_vcf.utils import read_in_gvf_file from convert_gvf_to_vcf.vcfline import VcfLine from convert_gvf_to_vcf.lookup import Lookup @@ -28,6 +28,7 @@ def setUp(self): gvf_feature_line = "chromosome1 DGVa copy_number_loss 77 78 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=." f_list = gvf_feature_line.split("\t") gvf_line_object = GvfFeatureline(f_list[0], f_list[1], f_list[2], f_list[3], f_list[4], f_list[5], f_list[6], f_list[7], f_list[8]) + # Set up of data structures # Dictionary of standard structured meta-information lines for this VCF file lines_standard_alt = [] @@ -57,6 +58,15 @@ def setUp(self): self.standard_lines_dictionary, self.all_possible_lines_dictionary, self.reference_lookup) + # Set up the other GVF line object + other_gvf_feature_line = "chromosome1 DGVa copy_number_loss 77 78 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=776614,776914;End_range=786127,786427;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=." + other_f_list = other_gvf_feature_line.split("\t") + other_gvf_line_object = GvfFeatureline(other_f_list[0], other_f_list[1], other_f_list[2], other_f_list[3], other_f_list[4], other_f_list[5], other_f_list[6], + other_f_list[7], other_f_list[8]) + self.other_v = VcfLine(other_gvf_line_object, + self.standard_lines_dictionary, + self.all_possible_lines_dictionary, + self.reference_lookup) def test_add_padded_base(self): test_ref = "A" @@ -92,10 +102,10 @@ def test_generate_symbolic_allele(self): self.all_possible_lines_dictionary, self.reference_lookup.symbolic_allele_dictionary) assert output_symbolic_allele == '' - print(info_field) assert info_field == {'END': '78', 'IMPRECISE': None, 'CIPOS': None, 'CIEND': None, 'SVLEN': '1'} - assert output_lines_standard_alt == ['##ALT=', '##ALT='] - assert output_lines_standard_info == ['##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO='] + assert output_lines_standard_alt == ['##ALT=', '##ALT=', '##ALT='] + assert output_lines_standard_info == ['##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO=', '##INFO='] + def test_get_alt(self): alt_allele = self.v.get_alt(self.standard_lines_dictionary, self.all_possible_lines_dictionary, self.reference_lookup) @@ -105,14 +115,19 @@ def test__str__(self): pass def test_merge_and_add(self): - # previous="1" - # current ="2" - # delimiter =";" - # merged_string = merge_and_add(previous, current, delimiter) - # assert len(merged_string) > 1 - pass + # testing merge for different elements + merged_string = self.v.merge_and_add("1", "2",";") + assert merged_string == "1;2" + # testing non merge for same elements + non_merged_string = self.v.merge_and_add("1", "1", ";") + assert non_merged_string == "1" + + def test_order_format_keys(self): + set_of_format_keys = {"AD", "GT"} + ordered_list_of_format_keys = self.v.order_format_keys(set_of_format_keys) + assert ordered_list_of_format_keys == ['GT', 'AD'] - def test_put_GT_format_key_first(self): + def test_merge_format_keys(self): pass def test_format_sample_values(self): @@ -128,7 +143,6 @@ def test_format_sample_values(self): header_standard_lines_dictionary) for vcf_obj in list_of_vcf_objects: sample_name_dict_format_kv = vcf_obj.vcf_values_for_format - # sample_format_values_string = format_sample_values(sample_name_dict_format_kv, samples) sample_format_values_list = vcf_obj.combine_format_values_by_sample(sample_name_dict_format_kv, samples) assert isinstance(sample_format_values_list, list) number_of_tokens_should_have = len(samples) @@ -136,18 +150,12 @@ def test_format_sample_values(self): assert actual_number_of_tokens == number_of_tokens_should_have, f"must have {number_of_tokens_should_have}" assert sample_format_values_list == ['.:.', '.:.', '.:.', '0:1:3'], "List must match expected value" - def test_info_list_to_dict(self): - pass - def test_merge_info_dicts(self): pass def test_merge_info_string(self): pass - def test_merge_format_keys(self): - pass - def test_merge(self): pass