Skip to content
112 changes: 91 additions & 21 deletions convert_gvf_to_vcf/convertGVFtoVCF.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,18 +504,21 @@ def __str__(self):
return string_to_return

#step 9 using custom unstructured meta-information line = generate_custom_unstructured_metainfomation_line
def generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, list_of_vcf_objects):
def generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, gvf_non_essential, list_of_vcf_objects):
""" Generates a list of metainformation lines for the VCF header
:param lines_custom_unstructured: a list of formatted unstructured metainformation lines using a custom key value pair
:param gvf_pragmas: list of gvf pragmas to convert
:param gvf_non_essential: list of non-essential gvf pragmas to convert
:param list_of_vcf_objects: list of vcf objects
:return: unique_pragmas_to_add: a list of pragmas, this list contains no duplicates
:return: unique_pragmas_to_add, sample_names: a list of pragmas (this list contains no duplicates), list of sample names
"""
pragmas_to_add = []
unique_pragmas_to_add = []
sample_names = []
# MANDATORY: file format for VCF
pragma_fileformat = generate_custom_unstructured_metainfomation_line("fileformat", "VCFv4.4",lines_custom_unstructured)
pragmas_to_add.append(pragma_fileformat)
#Go through essential pragmas
#TODO: list of pragmas to add:reference=file, contig, phasing,INFO#
for pragma in gvf_pragmas:
# file date
Expand Down Expand Up @@ -548,15 +551,57 @@ def generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, list_of
pragmas_to_add.append(pragma_genome_build)
else:
pass
# Go through non-essential pragmas
for non_essential_pragma in gvf_non_essential:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This sort of long list of if statements could be refactored into a generic method plus a dictionary that maps the start of each pragma to the correct key and, potentially, a parser function.
I will suggest these edits in a different PR.

if non_essential_pragma.startswith("#Study_accession"):
study_accession = non_essential_pragma.split(": ")[1]
non_essential_pragma_study_accession = generate_custom_unstructured_metainfomation_line("Study_accession", study_accession, lines_custom_unstructured)
pragmas_to_add.append(non_essential_pragma_study_accession)
elif non_essential_pragma.startswith("#Study_type"):
study_type = non_essential_pragma.split(": ")[1]
non_essential_pragma_study_type = generate_custom_unstructured_metainfomation_line("Study_type", study_type, lines_custom_unstructured)
pragmas_to_add.append(non_essential_pragma_study_type)
elif non_essential_pragma.startswith("#Display_name"):
display_name = non_essential_pragma.split(": ")[1]
non_essential_pragma_display_name = generate_custom_unstructured_metainfomation_line("Display_name", display_name,lines_custom_unstructured)
pragmas_to_add.append(non_essential_pragma_display_name)
elif non_essential_pragma.startswith("#Publication"):
publication = non_essential_pragma.split(": ")[1]
non_essential_pragma_publication = generate_custom_unstructured_metainfomation_line("Publication", publication, lines_custom_unstructured)
pragmas_to_add.append(non_essential_pragma_publication)
elif non_essential_pragma.startswith("#Study"):
study = non_essential_pragma.split(": ")[1]
non_essential_pragma_study = generate_custom_unstructured_metainfomation_line("Study", study, lines_custom_unstructured)
pragmas_to_add.append(non_essential_pragma_study)
elif non_essential_pragma.startswith("#Assembly_name"):
assembly_name = non_essential_pragma.split(": ")[1]
non_essential_pragma_assembly_name = generate_custom_unstructured_metainfomation_line("Assembly_name", assembly_name, lines_custom_unstructured)
pragmas_to_add.append(non_essential_pragma_assembly_name)
elif non_essential_pragma.startswith("#subject"):
subject = non_essential_pragma.split(": ")[1]
non_essential_pragma_subject = generate_custom_unstructured_metainfomation_line("subject", subject, lines_custom_unstructured)
pragmas_to_add.append(non_essential_pragma_subject)
elif non_essential_pragma.startswith("#sample"):
sample_information = non_essential_pragma.split(": ")[1]
non_essential_pragma_sample = generate_custom_unstructured_metainfomation_line("sample", sample_information,
lines_custom_unstructured)
pragmas_to_add.append(non_essential_pragma_sample)
list_of_sample_information = sample_information.split(";")
for sample_info in list_of_sample_information:
if sample_info.startswith("sample_name"):
sample_name = sample_info.split("=")[1]
sample_names.append(sample_name)
else:
print("Skipping unknown non-essential GVF pragma:", non_essential_pragma)

print("Total number of samples in this VCF: ", len(sample_names))

for pragma in pragmas_to_add:
if pragma not in unique_pragmas_to_add:
unique_pragmas_to_add.append(pragma)

#TODO: add the pragmas for the GVF non-essentials
return unique_pragmas_to_add
return unique_pragmas_to_add, sample_names

# step 10
# TODO: finish the below for sample names
def generate_vcf_header_line(samples):
""" Generates the VCF header line
:param samples: list of samples, these will appear in the header line
Expand Down Expand Up @@ -627,23 +672,49 @@ def gvf_features_to_vcf_objects(gvf_lines_obj_list,
# print("for ", key, " the number of vcf objects is: ", len(vcf_obj_list))
return vcf_data_lines, list_of_vcf_objects

def format_vcf_datalines(list_of_vcf_objects):
""" Iterates through a list of VCF objects and formats them as a VCF dataline.

def populate_sample_formats(list_of_sample_names):
    """ Builds the lookup of sample name => sample format value.
    Every sample currently receives the same placeholder value, and a name
    that appears more than once in the input ends up as a single key.
    :param list_of_sample_names: list of sample names
    :return: dictionary of sample names => sample format value
    """
    # TODO: fill this in with the real per-sample format value
    placeholder_format_value = "sampleFORMAThere"
    return dict.fromkeys(list_of_sample_names, placeholder_format_value)

def format_sample_values(sample_name_format_value):
    """ Creates a partial vcf data line of sample format values.
    The values are emitted in the dictionary's iteration order and separated
    by a single tab. No tab is added after the last value, so the result can
    be placed at the end of a data line without leaving a dangling delimiter.
    An empty dictionary yields an empty string.
    :param sample_name_format_value: dictionary of sample names => sample format value
    :return: sample_format_values_string: tab-separated string of the format values
    """
    # join() puts the separator only between values, unlike appending "\t"
    # after each one, which left a trailing tab on every data line.
    sample_format_values_string = "\t".join(sample_name_format_value.values())
    return sample_format_values_string

def format_vcf_datalines(list_of_vcf_objects, list_of_sample_names):
    """ Iterates through a list of VCF objects and sample names and formats them as a VCF dataline.
    Each data line is the tab-separated chrom, pos, id, ref, alt, qual, filter,
    info and format attributes of the VCF object, followed by the sample format
    values. The same sample format values are appended to every line.
    :param list_of_vcf_objects: list of vcf objects
    :param list_of_sample_names: list of sample names
    :return: formatted_vcf_datalines: list of formatted vcf datalines
    """
    # The sample columns do not depend on the VCF object, so build them once.
    sample_name_format_value = populate_sample_formats(list_of_sample_names)
    sample_format_values_string = format_sample_values(sample_name_format_value)

    formatted_vcf_datalines = []
    for vcf_obj in list_of_vcf_objects:
        columns = [
            f"{vcf_obj.chrom}",
            f"{vcf_obj.pos}",
            f"{vcf_obj.id}",
            f"{vcf_obj.ref}",  # TODO: should this always be empty
            f"{vcf_obj.alt}",  # TODO: should this always be empty
            f"{vcf_obj.qual}",  # TODO: should this always be empty
            f"{vcf_obj.filter}",  # TODO: should this always be empty
            f"{vcf_obj.info}",
            f"{vcf_obj.format}",
        ]
        # Only add the sample columns when there are any; otherwise the line
        # would end with a dangling tab after the FORMAT column.
        if sample_format_values_string:
            columns.append(sample_format_values_string)
        formatted_vcf_datalines.append("\t".join(columns))
    return formatted_vcf_datalines

Expand Down Expand Up @@ -689,18 +760,17 @@ def main():
all_possible_FILTER_lines,
all_possible_FORMAT_lines)

# 10c
print("Writing to the following VCF output: ", args.vcf_output)
print("Generating the VCF header and the meta-information lines")

with open(args.vcf_output, "w") as vcf_output:
unique_pragmas_to_add = generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, list_of_vcf_objects)
unique_pragmas_to_add, samples = generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, gvf_non_essential, list_of_vcf_objects)
for pragma in unique_pragmas_to_add:
vcf_output.write(f"{pragma}\n")
samples = ["samA"] # TODO: this is a placeholder, need to add a function to read gvf pragmas and collect the samples into a list
header_fields = generate_vcf_header_line(samples)
vcf_output.write(f"{header_fields}\n")
print("Generating the VCF datalines")
formatted_vcf_datalines = format_vcf_datalines(list_of_vcf_objects)
formatted_vcf_datalines = format_vcf_datalines(list_of_vcf_objects, samples)
for line in formatted_vcf_datalines:
vcf_output.write(f"{line}\n")
vcf_output.close()
Expand Down
151 changes: 4 additions & 147 deletions tests/input/zebrafish.gvf
Original file line number Diff line number Diff line change
Expand Up @@ -9,158 +9,15 @@
#Publication: PMID=22203992;Journal=Proceedings of the National Academy of Sciences of the United States of America;Paper_title=Extensive genetic diversity and substructuring among zebrafish strains revealed through copy number variant analysis.;Publication_year=2012
#Study: First_author=Kim Brown;Description=Comparative genomic hybridization analysis of 3 laboratory and one wild zebrafish populations for Copy Number Variants
#Assembly_name: GRCz10
#subject: subject_name=Zon5
#subject: subject_name=Zon4
#subject: subject_name=Enders4
#subject: subject_name=Wilds1-7
#subject: subject_name=DF6
#subject: subject_name=Wilds2-6
#subject: subject_name=Wilds1-1
#subject: subject_name=Wilds1-5
#subject: subject_name=DF7
#subject: subject_name=Wilds2-5
#subject: subject_name=Utah2
#subject: subject_name=KarpMale4;subject_sex=Male
#subject: subject_name=DF9
#subject: subject_name=Wilds1-2
#subject: subject_name=KarpMale6;subject_sex=Male
#subject: subject_name=Enders2
#subject: subject_name=Wilds1-6
#subject: subject_name=Wilds2-9
#subject: subject_name=Zon3
#subject: subject_name=Utah5
#subject: subject_name=Wilds2-10
#subject: subject_name=ZIRC3
#subject: subject_name=JenFemal3;subject_sex=Female
#subject: subject_name=Zon1
#subject: subject_name=ZIRC5
#subject: subject_name=Utah9
#subject: subject_name=Enders8
#subject: subject_name=ZIRC1
#subject: subject_name=ZIRC8
#subject: subject_name=Wilds2-1
#subject: subject_name=Wilds2-4
#subject: subject_name=ZIRC7
#subject: subject_name=Enders1
#subject: subject_name=KarpMale5;subject_sex=Male
#subject: subject_name=DF8
#subject: subject_name=JenFemale10;subject_sex=Female
#subject: subject_name=Utah3
#subject: subject_name=JenFemal2;subject_sex=Female
#subject: subject_name=Wilds1-6 orig
#subject: subject_name=Enders3
#subject: subject_name=KarpMale3;subject_sex=Male
#subject: subject_name=JenFemale8;subject_sex=Female
#subject: subject_name=JenMale6;subject_sex=Male
#subject: subject_name=JenFemale9;subject_sex=Female
#subject: subject_name=DF10
#subject: subject_name=Wilds1-4
#subject: subject_name=DF1
#subject: subject_name=Zon2
#subject: subject_name=Zon9
#subject: subject_name=Wilds1-3
#subject: subject_name=JenFemal1;subject_sex=Female
#subject: subject_name=Zon10
#subject: subject_name=Wilds2-3
#subject: subject_name=JenMale2;subject_sex=Male
#subject: subject_name=Zon9
#subject: subject_name=JenMale7;subject_sex=Male
#subject: subject_name=ZIRC4
#subject: subject_name=Zon8
#subject: subject_name=KarpMale2;subject_sex=Male
#subject: subject_name=Wilds1-8
#subject: subject_name=ZIRC9
#subject: subject_name=Utah4
#subject: subject_name=Zon7
#subject: subject_name=Wilds2-2
#subject: subject_name=DF5
#subject: subject_name=DF2
#subject: subject_name=Utah6
#subject: subject_name=Wilds2-8
#subject: subject_name=ZIRC6
#subject: subject_name=ZIRC2
#subject: subject_name=Wilds2-7
#subject: subject_name=Utah8
#subject: subject_name=DF3
#subject: subject_name=Zon6
#subject: subject_name=Utah7
#subject: subject_name=Utah1
#subject: subject_name=DF4
#sample: sample_name=Wilds2-2;subject_name=Wilds2-2
#sample: sample_name=DF5;subject_name=DF5
#sample: sample_name=DF2;subject_name=DF2
#sample: sample_name=Utah6;subject_name=Utah6
#sample: sample_name=Wilds2-8;subject_name=Wilds2-8
#sample: sample_name=ZIRC6;subject_name=ZIRC6
#sample: sample_name=ZIRC2;subject_name=ZIRC2
#sample: sample_name=Wilds2-7;subject_name=Wilds2-7
#sample: sample_name=Utah8;subject_name=Utah8
#sample: sample_name=DF3;subject_name=DF3
#sample: sample_name=Zon6;subject_name=Zon6
#sample: sample_name=Utah7;subject_name=Utah7
#sample: sample_name=Utah1;subject_name=Utah1
#sample: sample_name=DF4;subject_name=DF4
#sample: sample_name=Zon5;subject_name=Zon5
#sample: sample_name=Zon4;subject_name=Zon4
#sample: sample_name=Enders4;subject_name=Enders4
#sample: sample_name=Wilds1-7;subject_name=Wilds1-7
#sample: sample_name=DF6;subject_name=DF6
#sample: sample_name=Wilds2-6;subject_name=Wilds2-6
#sample: sample_name=Wilds1-1;subject_name=Wilds1-1
#sample: sample_name=Wilds1-5;subject_name=Wilds1-5
#sample: sample_name=DF7;subject_name=DF7
#sample: sample_name=Wilds2-5;subject_name=Wilds2-5
#sample: sample_name=Utah2;subject_name=Utah2
#sample: sample_name=KarpMale4;subject_name=KarpMale4
#sample: sample_name=DF9;subject_name=DF9
#sample: sample_name=Wilds1-2;subject_name=Wilds1-2
#sample: sample_name=KarpMale6;subject_name=KarpMale6
#sample: sample_name=Enders2;subject_name=Enders2
#sample: sample_name=Wilds1-6;subject_name=Wilds1-6
#sample: sample_name=Wilds2-9;subject_name=Wilds2-9
#sample: sample_name=Zon3;subject_name=Zon3
#sample: sample_name=Utah5;subject_name=Utah5
#sample: sample_name=Wilds2-10;subject_name=Wilds2-10
#sample: sample_name=ZIRC3;subject_name=ZIRC3
#sample: sample_name=JenFemal3;subject_name=JenFemal3
#sample: sample_name=Zon1;subject_name=Zon1
#sample: sample_name=ZIRC5;subject_name=ZIRC5
#sample: sample_name=Utah9;subject_name=Utah9
#sample: sample_name=Enders8;subject_name=Enders8
#sample: sample_name=ZIRC1;subject_name=ZIRC1
#sample: sample_name=ZIRC8;subject_name=ZIRC8
#sample: sample_name=Wilds2-1;subject_name=Wilds2-1
#sample: sample_name=Wilds2-4;subject_name=Wilds2-4
#sample: sample_name=ZIRC7;subject_name=ZIRC7
#sample: sample_name=Enders1;subject_name=Enders1
#sample: sample_name=KarpMale5;subject_name=KarpMale5
#sample: sample_name=DF8;subject_name=DF8
#sample: sample_name=JenFemale10;subject_name=JenFemale10
#sample: sample_name=Utah3;subject_name=Utah3
#sample: sample_name=JenFemal2;subject_name=JenFemal2
#sample: sample_name=Wilds1-6 orig;subject_name=Wilds1-6 orig
#sample: sample_name=Enders3;subject_name=Enders3
#sample: sample_name=KarpMale3;subject_name=KarpMale3
#sample: sample_name=JenFemale8;subject_name=JenFemale8
#subject: subject_name=JenMale6;subject_sex=Male
#sample: sample_name=JenMale6;subject_name=JenMale6
#sample: sample_name=JenFemale9;subject_name=JenFemale9
#sample: sample_name=DF10;subject_name=DF10
#sample: sample_name=Wilds1-4;subject_name=Wilds1-4
#sample: sample_name=DF1;subject_name=DF1
#sample: sample_name=Zon2;subject_name=Zon2
#sample: sample_name=Zon9;subject_name=Zon9
#sample: sample_name=Wilds1-3;subject_name=Wilds1-3
#sample: sample_name=JenFemal1;subject_name=JenFemal1
#sample: sample_name=Zon10;subject_name=Zon10
#sample: sample_name=Wilds2-3;subject_name=Wilds2-3
#sample: sample_name=JenMale2;subject_name=JenMale2
#sample: sample_name=Zon9;subject_name=Zon9
#sample: sample_name=JenMale7;subject_name=JenMale7
#sample: sample_name=ZIRC4;subject_name=ZIRC4
#sample: sample_name=Zon8;subject_name=Zon8
#sample: sample_name=KarpMale2;subject_name=KarpMale2
#sample: sample_name=Wilds1-8;subject_name=Wilds1-8
#sample: sample_name=ZIRC9;subject_name=ZIRC9
#sample: sample_name=Utah4;subject_name=Utah4
#sample: sample_name=Zon7;subject_name=Zon7
#testing_unknown_pragma
1 DGVa copy_number_loss 776614 786127 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=.
1 DGVa copy_number_loss 1277246 1320592 . + . ID=12;Name=nssv1406143;Alias=CNV22899;variant_call_so_id=SO:0001743;parent=nsv811095;Start_range=.,1277246;End_range=1320592,.;submitter_variant_call_id=CNV22899;sample_name=Zon9;remap_score=.87402;Variant_seq=.
1 DGVa copy_number_gain 1284210 1320592 . + . ID=13;Name=nssv1389474;Alias=CNV6230;variant_call_so_id=SO:0001742;parent=nsv811095;Start_range=.,1284210;End_range=1320592,.;submitter_variant_call_id=CNV6230;sample_name=JenMale7;remap_score=.69625;Variant_seq=.
Expand Down
Loading