Skip to content
Merged
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
# convertGVFtoVCF
converts GVF file to VCF file

Dependencies
Install biopython:
pip install biopython

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can add this to the requirements file.

Since the requirements file is new in the main branch, you will either need to do it in your next PR, or you can dive into the joys of git rebase/merge to see how to update your branch with the newest changes from the main branch. In theory it's as simple as:

git checkout main
git pull origin main
git checkout populate_ref_allele
git rebase main

In practice there can be merge conflicts you need to resolve, but hopefully in this case there won't be.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We will address the issue with the requirements in #7 and that should also fix the tests

41 changes: 37 additions & 4 deletions convert_gvf_to_vcf/convertGVFtoVCF.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import argparse
import os

from Bio import SeqIO

# setting up paths to useful directories
convert_gvf_to_vcf_folder = os.path.dirname(__file__)
Expand Down Expand Up @@ -225,6 +225,20 @@ def read_gvf_info_attributes(gvf_info_attributes_file):
gvf_attribute_dict[gvf_key_id] = gvf_attribute_tokens
return gvf_attribute_dict

def extract_reference_allele(fasta_file, chromosome_name, position):
""" Extracts the reference allele from the assembly.
:param fasta_file: FASTA file of the assembly
:param chromosome_name: name of the sequence
:param position: position
:return: reference_allele: base found at this chromosome_name at this position within this fasta_file
"""
# using .index instead of .todict for memory efficiency: https://biopython.org/docs/1.76/api/Bio.SeqIO.html#input-multiple-records
records_dictionary = SeqIO.index(fasta_file, "fasta")
zero_indexed_position = position - 1 # minus one because zero indexed
reference_allele = records_dictionary[chromosome_name].seq[zero_indexed_position]
records_dictionary.close()
return reference_allele

def get_gvf_attributes(column9_of_gvf):
"""Get a dictionary of GVF attributes
:param column9_of_gvf: attributes column - the final column of the GVF file
Expand Down Expand Up @@ -409,6 +423,7 @@ class VcfLine:
def __init__(self, gvf_feature_line_object,
dgva_attribute_dict,
gvf_attribute_dict,
assembly_file,
lines_custom_structured,
lines_standard_ALT,
lines_standard_INFO,
Expand All @@ -432,9 +447,10 @@ def __init__(self, gvf_feature_line_object,
all_possible_FILTER_lines,
all_possible_FORMAT_lines)

self.assembly = assembly_file
# DATALINE
self.chrom = gvf_feature_line_object.seqid
self.pos = gvf_feature_line_object.start
self.pos = int(gvf_feature_line_object.start)
self.id = self.vcf_value["ID"] # attributes: ID
self.ref = self.get_ref()
self.alt = self.vcf_value["Variant_seq"] # attributes: variant_seq
Expand All @@ -443,7 +459,7 @@ def __init__(self, gvf_feature_line_object,

# INFO
#TODO: specific SV info keys populated from gvf_feature_line
self.key = self.chrom + "_" + self.pos
self.key = self.chrom + "_" + str(self.pos)
self.info = "pending_aggregation" # TODO: add info field for self.info
self.source = gvf_feature_line_object.source

Expand Down Expand Up @@ -497,7 +513,12 @@ def get_ref(self):
if "Reference_seq" in self.vcf_value.keys():
return self.vcf_value["Reference_seq"] # attributes:reference_seq
else:
return "." # TODO: how shall we fill this in with this scenario?
if self.assembly:
reference_allele = extract_reference_allele(self.assembly, self.chrom, self.pos)
else:
print("WARNING: No reference provided. Placeholder inserted for Reference allele.")
reference_allele = "."
return reference_allele

def __str__(self):
string_to_return = '\t'.join((self.chrom, self.pos, self.key, self.qual, self.filter, self.info, self.source, self.phase, self.end, self.so_type, self.sample_name, self.format))
Expand Down Expand Up @@ -616,6 +637,7 @@ def generate_vcf_header_line(samples):
def gvf_features_to_vcf_objects(gvf_lines_obj_list,
dgva_attribute_dict,
gvf_attribute_dict,
assembly_file,
lines_custom_structured,
lines_standard_ALT,
lines_standard_INFO,
Expand All @@ -628,6 +650,7 @@ def gvf_features_to_vcf_objects(gvf_lines_obj_list,
""" Creates VCF objects from GVF feature lines and stores the VCF objects.
:param gvf_lines_obj_list: list of GVF feature line objects
:param gvf_attribute_dict: dictionary of GVF INFO attributes
:param assembly_file: FASTA file to assembly
:param dgva_attribute_dict: dictionary af DGVa specific INFO attributes
:param lines_custom_structured: list to store custom structured metainformation lines
:param lines_standard_ALT: ALT lines for this VCF file
Expand All @@ -650,6 +673,7 @@ def gvf_features_to_vcf_objects(gvf_lines_obj_list,
vcf_object = VcfLine(gvf_featureline,
dgva_attribute_dict,
gvf_attribute_dict,
assembly_file,
lines_custom_structured,
lines_standard_ALT,
lines_standard_INFO,
Expand Down Expand Up @@ -724,9 +748,12 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("gvf_input", help="GVF input file.")
parser.add_argument("vcf_output", help="VCF output file.")
parser.add_argument("-a", "--assembly", help="FASTA assembly file")
args = parser.parse_args()
print("The provided input file is: ", args.gvf_input)
print("The provided output file is: ", args.vcf_output)
if args.assembly:
print("The provided assembly file is: ", args.assembly)

all_possible_INFO_lines = generate_all_possible_standard_structured_info_lines()
all_possible_ALT_lines = generate_all_possible_standard_structured_alt_lines()
Expand All @@ -747,9 +774,15 @@ def main():
gvf_info_attributes_file = os.path.join(etc_folder, 'gvfINFOattributes.tsv')
dgva_attribute_dict = read_dgva_info_attributes(dgva_info_attributes_file=dgva_info_attributes_file) # needed to generate custom strings
gvf_attribute_dict = read_gvf_info_attributes(gvf_info_attributes_file=gvf_info_attributes_file)
if args.assembly:
assembly_file = os.path.abspath(args.assembly)
else:
assembly_file = None

vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list,
dgva_attribute_dict,
gvf_attribute_dict,
assembly_file,
lines_custom_structured,
lines_standard_ALT,
lines_standard_INFO,
Expand Down
4 changes: 4 additions & 0 deletions tests/input/zebrafish.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>chromosome1
ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
>chromosome2
TTTTTTT
8 changes: 4 additions & 4 deletions tests/input/zebrafish.gvf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#sample: sample_name=Zon9;subject_name=Zon9
#sample: sample_name=JenMale7;subject_name=JenMale7
#testing_unknown_pragma
1 DGVa copy_number_loss 776614 786127 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=.
1 DGVa copy_number_loss 1277246 1320592 . + . ID=12;Name=nssv1406143;Alias=CNV22899;variant_call_so_id=SO:0001743;parent=nsv811095;Start_range=.,1277246;End_range=1320592,.;submitter_variant_call_id=CNV22899;sample_name=Zon9;remap_score=.87402;Variant_seq=.
1 DGVa copy_number_gain 1284210 1320592 . + . ID=13;Name=nssv1389474;Alias=CNV6230;variant_call_so_id=SO:0001742;parent=nsv811095;Start_range=.,1284210;End_range=1320592,.;submitter_variant_call_id=CNV6230;sample_name=JenMale7;remap_score=.69625;Variant_seq=.
1 DGVa copy_number_gain 1284210 1320592 . + . ID=14;Name=nssv1388955;Alias=CNV5711;variant_call_so_id=SO:0001742;parent=nsv811095;Start_range=.,1284210;End_range=1320592,.;submitter_variant_call_id=CNV5711;sample_name=JenMale6;remap_score=.85344;Variant_seq=.;allele_count=3
chromosome1 DGVa copy_number_loss 77 78 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=.
chromosome1 DGVa copy_number_loss 127 132 . + . ID=12;Name=nssv1406143;Alias=CNV22899;variant_call_so_id=SO:0001743;parent=nsv811095;Start_range=.,1277246;End_range=1320592,.;submitter_variant_call_id=CNV22899;sample_name=Zon9;remap_score=.87402;Variant_seq=.
chromosome1 DGVa copy_number_gain 128 132 . + . ID=13;Name=nssv1389474;Alias=CNV6230;variant_call_so_id=SO:0001742;parent=nsv811095;Start_range=.,1284210;End_range=1320592,.;submitter_variant_call_id=CNV6230;sample_name=JenMale7;remap_score=.69625;Variant_seq=.
chromosome1 DGVa copy_number_gain 128 132 . + . ID=14;Name=nssv1388955;Alias=CNV5711;variant_call_so_id=SO:0001742;parent=nsv811095;Start_range=.,1284210;End_range=1320592,.;submitter_variant_call_id=CNV5711;sample_name=JenMale6;remap_score=.85344;Variant_seq=.;allele_count=3
19 changes: 16 additions & 3 deletions tests/test_convert_gvf_to_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def setUp(self):
self.dgva_input_file = os.path.join(self.input_folder_parent, "etc","dgvaINFOattributes.tsv")
self.gvf_input_file = os.path.join(self.input_folder_parent, "etc","gvfINFOattributes.tsv")
self.output_file = os.path.join(input_folder, "input", "a.vcf")
self.assembly = os.path.join(input_folder, "input", "zebrafish.fa")

def test_read_in_gvf_file(self):
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
Expand All @@ -38,6 +39,7 @@ def test_gvf_features_to_vcf_objects(self):
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
dgva_attribute_dict = read_dgva_info_attributes(self.dgva_input_file)
gvf_attribute_dict = read_gvf_info_attributes(self.gvf_input_file)
assembly_file = self.assembly
# custom meta-information lines for this VCF file
lines_custom_structured = []
lines_custom_unstructured = []
Expand All @@ -53,7 +55,9 @@ def test_gvf_features_to_vcf_objects(self):
all_possible_FORMAT_lines = generate_all_possible_standard_structured_format_lines()

vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list, dgva_attribute_dict,
gvf_attribute_dict, lines_custom_structured,
gvf_attribute_dict,
assembly_file,
lines_custom_structured,
lines_standard_ALT, lines_standard_INFO,
lines_standard_FILTER, lines_standard_FORMAT,
all_possible_ALT_lines,
Expand All @@ -65,12 +69,13 @@ def test_gvf_features_to_vcf_objects(self):
assert len(list_of_vcf_objects) > 1

def test_get_ref(self):
gvf_feature_line = "1 DGVa copy_number_loss 776614 786127 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=."
gvf_feature_line = "chromosome1 DGVa copy_number_loss 77 78 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=."
f_list = gvf_feature_line.split("\t")
line_object = GvfFeatureline(f_list[0], f_list[1], f_list[2], f_list[3], f_list[4], f_list[5], f_list[6], f_list[7], f_list[8])
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
dgva_attribute_dict = read_dgva_info_attributes(self.dgva_input_file)
gvf_attribute_dict = read_gvf_info_attributes(self.gvf_input_file)
assembly_file = self.assembly
# custom meta-information lines for this VCF file
lines_custom_structured = []
lines_custom_unstructured = []
Expand All @@ -88,6 +93,7 @@ def test_get_ref(self):
v = VcfLine(line_object,
dgva_attribute_dict,
gvf_attribute_dict,
assembly_file,
lines_custom_structured,
lines_standard_ALT,
lines_standard_INFO,
Expand All @@ -98,12 +104,13 @@ def test_get_ref(self):
all_possible_FILTER_lines,
all_possible_FORMAT_lines)
reference_allele = v.get_ref()
assert reference_allele == "."
assert len(reference_allele) != 0

def test_generate_vcf_metainformation(self):
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
dgva_attribute_dict = read_dgva_info_attributes(self.dgva_input_file)
gvf_attribute_dict = read_gvf_info_attributes(self.gvf_input_file)
assembly_file = self.assembly
# custom meta-information lines for this VCF file
lines_custom_structured = []
lines_custom_unstructured = []
Expand All @@ -121,6 +128,7 @@ def test_generate_vcf_metainformation(self):
vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list,
dgva_attribute_dict,
gvf_attribute_dict,
assembly_file,
lines_custom_structured,
lines_standard_ALT,
lines_standard_INFO,
Expand Down Expand Up @@ -158,6 +166,7 @@ def test_generate_vcf_header_line(self):
vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list,
dgva_attribute_dict,
gvf_attribute_dict,
self.assembly,
lines_custom_structured,
lines_standard_ALT,
lines_standard_INFO,
Expand Down Expand Up @@ -196,6 +205,7 @@ def test_populate_sample_formats(self):
vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list,
dgva_attribute_dict,
gvf_attribute_dict,
self.assembly,
lines_custom_structured,
lines_standard_ALT,
lines_standard_INFO,
Expand Down Expand Up @@ -234,6 +244,7 @@ def test_format_sample_values(self):
vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list,
dgva_attribute_dict,
gvf_attribute_dict,
self.assembly,
lines_custom_structured,
lines_standard_ALT,
lines_standard_INFO,
Expand All @@ -255,6 +266,7 @@ def test_format_vcf_datalines(self):
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
dgva_attribute_dict = read_dgva_info_attributes(self.dgva_input_file)
gvf_attribute_dict = read_gvf_info_attributes(self.gvf_input_file)
assembly_file = self.assembly
# custom meta-information lines for this VCF file
lines_custom_structured = []
lines_custom_unstructured = []
Expand All @@ -272,6 +284,7 @@ def test_format_vcf_datalines(self):
vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list,
dgva_attribute_dict,
gvf_attribute_dict,
assembly_file,
lines_custom_structured,
lines_standard_ALT,
lines_standard_INFO,
Expand Down
Loading