EBIvariation · khetherin · Dec 1, 2025 · Dec 1, 2025 · Dec 1, 2025 · Dec 1, 2025
diff --git a/convert_gvf_to_vcf/assistingconverter.py b/convert_gvf_to_vcf/assistingconverter.py
@@ -95,7 +95,4 @@ def convert_gvf_attributes_to_vcf_values(column9_of_gvf,
         else:
             logger.info(f"catching attribute keys for review at a later date {attrib_key} {attrib_value}")
             catching_for_review.append(attrib_key)
-    # info_string = ''.join(f'{key}={value};' for key, value in vcf_info_values.items()).rstrip(';')
-    # print(type(vcf_info_values))
-    # print(vcf_info_values)
     return gvf_attribute_dictionary, vcf_info_values, vcf_format_values
diff --git a/convert_gvf_to_vcf/convertGVFtoVCF.py b/convert_gvf_to_vcf/convertGVFtoVCF.py
@@ -87,10 +87,12 @@ def generate_vcf_header_metainfo(gvf_pragmas,
         vcf_header_key, pragma_name, pragma_value = get_pragma_name_and_value(non_essential_pragma, ": ", list_of_non_essential_pragma, pragma_to_vcf_map)
         if pragma_name.startswith("#Publication"):
             publication_tokens = get_pragma_tokens(pragma_value, ";", "=")
-            pragmas_to_add.append(generate_vcf_header_unstructured_line(publication_tokens[0], publication_tokens[1]))
+            for pub_token in publication_tokens:
+                pragmas_to_add.append(generate_vcf_header_unstructured_line(pub_token[0], pub_token[1]))
         elif pragma_name == "#Study":
             study_tokens = get_pragma_tokens(pragma_value, ";", "=")
-            pragmas_to_add.append(generate_vcf_header_unstructured_line(study_tokens[0], study_tokens[1]))
+            for s_token in study_tokens:
+                pragmas_to_add.append(generate_vcf_header_unstructured_line(s_token[0], s_token[1]))
         else:
             if vcf_header_key is not None:
                 pragmas_to_add.append(generate_vcf_header_unstructured_line(vcf_header_key, pragma_value))
@@ -147,8 +149,9 @@ def parse_pragma(pragma_to_parse, delimiter):
             pragma_value = None
             logger.warning(f"WARNING: no value for the following pragma {pragma_to_parse}")
         return pragma_name, pragma_value
-    except ValueError:
-        logger.error(f"Skipping this, can't be parsed {pragma_to_parse}")
+    except AttributeError as e:
+        logger.error(f"Skipping this, can't be parsed {pragma_to_parse}: {e}")
+        raise AttributeError(f"Cannot parse {pragma_to_parse}") from e  # chain the cause so the original traceback is kept
 
 def get_pragma_name_and_value(pragma_to_parse, delimiter, pragma_list, pragma_name_to_vcf_dict):
     """Get pragma name and value and its corresponding VCF header key.
@@ -175,7 +178,7 @@ def get_pragma_tokens(pragma_value, first_delimiter, second_delimiter):
     initial_list = pragma_value.split(first_delimiter)
     pragma_tokens = []
     for element in initial_list:
-        pragma_tokens = element.split(second_delimiter)
+        pragma_tokens.append(element.split(second_delimiter, 1))  # maxsplit=1: a later delimiter stays inside the value
     return pragma_tokens
 
 # This is the main conversion logic

diff --git a/tests/test_convert_gvf_to_vcf.py b/tests/test_convert_gvf_to_vcf.py
@@ -1,11 +1,13 @@
-#TODO: 5 test
 import os.path
 import unittest
 
+
 from convert_gvf_to_vcf.lookup import Lookup
-#from convert_gvf_to_vcf.utils import read_file
-from convert_gvf_to_vcf.convertGVFtoVCF import generate_vcf_header_unstructured_line, read_in_gvf_file, convert_gvf_features_to_vcf_objects, generate_vcf_header_metainfo, generate_vcf_header_line, compare_vcf_objects, determine_merge_or_keep_vcf_objects
-from convert_gvf_to_vcf.vcfline import VcfLine
+from convert_gvf_to_vcf.convertGVFtoVCF import generate_vcf_header_unstructured_line, read_in_gvf_file, \
+    convert_gvf_features_to_vcf_objects, generate_vcf_header_metainfo, generate_vcf_header_line, compare_vcf_objects, \
+    determine_merge_or_keep_vcf_objects, merge_vcf_objects, parse_pragma, get_pragma_name_and_value, get_pragma_tokens, \
+    keep_vcf_objects
+
 
 
 class TestConvertGVFtoVCF(unittest.TestCase):
@@ -29,20 +31,53 @@ def setUp(self):
 
 
     def test_generate_vcf_header_structured_lines(self):
-        pass
+        # NOTE(review): despite the test name, this exercises generate_vcf_header_unstructured_line.
+        header_key, header_value = "fileformat", "VCFv4.4"
+        rendered_line = generate_vcf_header_unstructured_line(header_key, header_value)
+        assert rendered_line == "##fileformat=VCFv4.4"
 
     def test_generate_custom_unstructured_meta_line(self):
         formatted_string = generate_vcf_header_unstructured_line("test_string_key", "test_string_value")
         assert formatted_string == "##test_string_key=test_string_value"
 
     def test_parse_pragma(self):
-        pass
+        # testing: pragma has a name and value
+        essential_pragma = "##file-date 2015-07-15"
+        delimiter = " "
+        name, value = parse_pragma(essential_pragma, delimiter)
+        assert (name, value) == ("##file-date", "2015-07-15")  # compare as a tuple; `assert name, msg` only checks name
+        # testing: pragma has only name, no value, should log a warning (name assumed to be the bare pragma - confirm)
+        name_only_pragma = "##file-date"
+        name, value = parse_pragma(name_only_pragma, delimiter)
+        assert (name, value) == ("##file-date", None)
+        # testing: invalid pragmas
+        invalid_pragma = None
+        with self.assertRaises(AttributeError):
+            parse_pragma(invalid_pragma, delimiter)
 
     def test_get_pragma_name_and_value(self):
-        pass
+        # An essential pragma should yield its mapped VCF header key plus the parsed name and value.
+        delimiter = " "
+        known_pragmas = ["##file-date", "##gff-version", "##gvf-version", "##species", "##genome-build"]
+        name_to_header_key = {'##file-date': 'fileDate', '##gff-version': 'gff-version', '##gvf-version': 'gvf-version', '##species': 'species', '##genome-build': 'genome-build', '#sample': 'sample', '#Study_accession': 'Study_accession', '#Study_type': 'Study_type', '#Display_name': 'Display_name', '#Publication': 'Publication', '#Study': 'Study', '#Assembly_name': 'Assembly_name', '#subject': 'subject'}
+        header_key, name, value = get_pragma_name_and_value(
+            "##file-date 2015-07-15", delimiter, known_pragmas, name_to_header_key
+        )
+        assert (header_key, name, value) == ("fileDate", "##file-date", "2015-07-15")
 
     def test_get_pragma_tokens(self):
-        pass
+        pragma_value = "First_author=Kim Brown;Description=Comparative genomic hybridization analysis of 3 laboratory and one wild zebrafish populations for Copy Number Variants"
+        pragma_tokens = get_pragma_tokens(pragma_value, ";", "=")
+        assert len(pragma_tokens) == 2
+        # Testing: expected
+        assert pragma_tokens[0][0] == "First_author"
+        assert pragma_tokens[0][1] == "Kim Brown"
+        assert pragma_tokens[1][0] == "Description"
+        assert pragma_tokens[1][1] == "Comparative genomic hybridization analysis of 3 laboratory and one wild zebrafish populations for Copy Number Variants"
+        # Testing: not expected
+        unexpected_pragma_tokens = [['A', '1'], ['B', '2']]
+        # assertNotEqual states the intent directly instead of catching the AssertionError from assertEqual
+        self.assertNotEqual(unexpected_pragma_tokens, pragma_tokens)
 
     def test_generate_vcf_metainfo(self):
         gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
@@ -59,15 +94,20 @@ def test_generate_vcf_metainfo(self):
         )
         print(unique_pragmas_to_add)
         assert unique_pragmas_to_add == ['##fileformat=VCFv4.4', '##gff-version=3', '##source=DGVa', '##gvf-version=1.06',
-                                         '##species=http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=7955', '##fileDate=2015-07-15',
-                                         '##genome-build=NCBIGRCz10', '##Study_accession=nstd62', '##Study_type=Control Set',
-                                         '##Display_name=Brown_et_al_2012', '##Publication_year=2012',
+                                         '##species=http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=7955',
+                                         '##fileDate=2015-07-15', '##genome-build=NCBIGRCz10', '##Study_accession=nstd62',
+                                         '##Study_type=Control Set', '##Display_name=Brown_et_al_2012', '##PMID=22203992',
+                                         '##Journal=Proceedings of the National Academy of Sciences of the United States of America',
+                                         '##Paper_title=Extensive genetic diversity and substructuring among zebrafish strains revealed through copy number variant analysis.',
+                                         '##Publication_year=2012', '##First_author=Kim Brown',
                                          '##Description=Comparative genomic hybridization analysis of 3 laboratory and one wild zebrafish populations for Copy Number Variants',
-                                         '##Assembly_name=GRCz10', '##subject=subject_name=Wilds2-3', '##subject=subject_name=Zon9', '##subject=subject_name=JenMale7;subject_sex=Male',
-                                         '##subject=subject_name=JenMale6;subject_sex=Male', '##sample=sample_name=JenMale6;subject_name=JenMale6', '##sample=sample_name=Wilds2-3;subject_name=Wilds2-3',
+                                         '##Assembly_name=GRCz10', '##subject=subject_name=Wilds2-3', '##subject=subject_name=Zon9',
+                                         '##subject=subject_name=JenMale7;subject_sex=Male', '##subject=subject_name=JenMale6;subject_sex=Male',
+                                         '##sample=sample_name=JenMale6;subject_name=JenMale6', '##sample=sample_name=Wilds2-3;subject_name=Wilds2-3',
                                          '##sample=sample_name=Zon9;subject_name=Zon9', '##sample=sample_name=JenMale7;subject_name=JenMale7']
 
 
+
         assert unique_alt_lines_to_add == ['##ALT=<ID=DEL,Description="Deletion">', '##ALT=<ID=DUP,Description="Duplication">']
         assert unique_info_lines_to_add ==  ['##INFO=<ID=ID,Number=.,Type=String,Description="A unique identifier">', '##INFO=<ID=NAME,Number=.,Type=String,Description="Name">', '##INFO=<ID=ALIAS,Number=.,Type=String,Description="Secondary Name">', '##INFO=<ID=VARCALLSOID,Number=.,Type=String,Description="Variant call Sequence ontology ID">', '##INFO=<ID=SVCID,Number=.,Type=Integer,Description="submitter variant call ID">', '##INFO=<ID=REMAP,Number=.,Type=Float,Description="Remap score">', '##INFO=<ID=VARSEQ,Number=.,Type=String,Description="Alleles found in an individual (or group of individuals).">', '##INFO=<ID=END,Number=1,Type=Integer,Description="End position on CHROM (used with symbolic alleles; see below) or End position of the longest variant described in this record">', '##INFO=<ID=SVLEN,Number=A,Type=String,Description="Length of structural variant">', '##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">', '##INFO=<ID=CIPOS,Number=.,Type=Integer,Description="Confidence interval around POS for symbolic structural variants">', '##INFO=<ID=CIEND,Number=.,Type=Integer,Description="Confidence interval around END for symbolic structural variants">', '##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">', '##INFO=<ID=DBXREF,Number=.,Type=String,Description="A database cross-reference">', '##INFO=<ID=AD,Number=R,Type=Integer,Description="Total read depth for each allele">']
 
@@ -94,7 +134,9 @@ def test_gvf_features_to_vcf_objects(self):
 
     def test_compare_vcf_objects(self):
         gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
+        print(gvf_lines_obj_list)
         header_standard_lines_dictionary, vcf_data_lines, list_of_vcf_objects = convert_gvf_features_to_vcf_objects(gvf_lines_obj_list, self.reference_lookup)
+        print(list_of_vcf_objects)
         # compare object, if equal, True, if not equal, False # (next function will make true = current and merge; false= previous)
         expected_flags_for_list_of_vcf_objects = [False, # line 1 vs 2
                                                   False, # line 2 vs 3
@@ -106,21 +148,29 @@ def test_compare_vcf_objects(self):
         actual_flags_for_list_of_vcf_objects = compare_vcf_objects(list_of_vcf_objects)
         assert actual_flags_for_list_of_vcf_objects == expected_flags_for_list_of_vcf_objects
 
-    def test_merge_vcf_objects(self):
-        # gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
-        # header_standard_lines_dictionary, vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(
-        #     gvf_lines_obj_list, self.reference_lookup)
-        # list_of_samples = ['JenMale6', 'Wilds2-3', 'Zon9', 'JenMale7']
-        # # use lines 4 and 5 of gvf file
-        # previous = list_of_vcf_objects[3] # line 4
-        # current = list_of_vcf_objects[4] #line 5
-        # merged_object = merge_vcf_objects(previous, current, list_of_samples)
-        # to_check = ('chromosome1', 127, '13;14', 'GTACGTACG', '<DUP>', '.', '.', 'ID=13,14;SVCID=CNV6230,CNV5711;ALIAS=CNV6230,CNV5711;END=131;NAME=nssv1389474,nssv1388955;VARCALLSOID=SO:0001742;AC=3;SVLEN=4;REMAP=.69625,.85344;VARSEQ=.', '.', '.\t.\t.\t.')
-        # assert merged_object == to_check #TODO: the info_string is different each time, ensure order is preserved
-        pass
+    # def test_merge_vcf_objects(self):
+    #     gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
+    #     header_standard_lines_dictionary, vcf_data_lines, list_of_vcf_objects = convert_gvf_features_to_vcf_objects(
+    #         gvf_lines_obj_list, self.reference_lookup)
+    #     list_of_samples = ['JenMale6', 'Wilds2-3', 'Zon9', 'JenMale7']
+    #     # # use lines 4 and 5 of gvf file
+    #     previous = list_of_vcf_objects[3] # line 4
+    #     current = list_of_vcf_objects[4] #line 5
+    #     merged_object = merge_vcf_objects(previous, current, list_of_samples)
+    # TODO: re-enable once the order of the INFO string is deterministic; it previously differed between runs.
+    #     to_check = '\t'.join(['chromosome1', '127', '13;14', 'GTACGTACG', '<DUP>', '.', '.',
+    #                           'ALIAS=CNV6230,CNV5711;NAME=nssv1389474,nssv1388955;VARSEQ=.;REMAP=.69625,.85344;SVCID=CNV6230,CNV5711;VARCALLSOID=SO:0001742;AC=3;SVLEN=4;END=131',
+    #                           '.', '.\t.\t.\t.'])
+    #     assert merged_object == to_check
 
     def test_keep_vcf_objects(self):
-        pass
+        gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
+        header_standard_lines_dictionary, vcf_data_lines, list_of_vcf_objects = convert_gvf_features_to_vcf_objects(
+                gvf_lines_obj_list, self.reference_lookup)
+        list_of_samples = ['JenMale6', 'Wilds2-3', 'Zon9', 'JenMale7']
+        previous_object = list_of_vcf_objects[1]  # second VCF object built from the input file
+        keep_vcf_objects(previous_object, list_of_samples)  # NOTE(review): smoke test only - the result is never asserted; TODO add an assertion
+
 
     def test_determine_merge_or_keep_vcf_objects(self):
         gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
@@ -137,8 +187,5 @@ def test_determine_merge_or_keep_vcf_objects(self):
         assert merged_or_kept_objects[3].id == "13;14"
         assert merged_or_kept_objects[3].info_dict["NAME"] == "nssv1389474,nssv1388955"
 
-
-
-
 if __name__ == '__main__':
     unittest.main()