Skip to content

Commit 2f66b9f

Browse files
khetherintcezard
andauthored
Populate ref allele (#5)
* add assembly parameter * simplified input for testing * added a new test file * added dependencies installation instructions * updated input fasta file * updated input fasta file * updated input gvf file * added function extract_reference_allele * removed TODO * added unit test * tidy up * handling for if no reference provided and reference_seq not in GVF * tidy * tidy up comments * Fix tests --------- Co-authored-by: tcezard <tcezard@ebi.ac.uk>
1 parent 5f78c58 commit 2f66b9f

File tree

5 files changed

+65
-11
lines changed

5 files changed

+65
-11
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,6 @@
11
# convertGVFtoVCF
22
converts GVF file to VCF file
3+
4+
Dependencies
5+
Install biopython:
6+
pip install biopython

convert_gvf_to_vcf/convertGVFtoVCF.py

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import argparse
22
import os
3-
3+
from Bio import SeqIO
44

55
# setting up paths to useful directories
66
convert_gvf_to_vcf_folder = os.path.dirname(__file__)
@@ -225,6 +225,20 @@ def read_gvf_info_attributes(gvf_info_attributes_file):
225225
gvf_attribute_dict[gvf_key_id] = gvf_attribute_tokens
226226
return gvf_attribute_dict
227227

228+
def extract_reference_allele(fasta_file, chromosome_name, position):
229+
""" Extracts the reference allele from the assembly.
230+
:param fasta_file: FASTA file of the assembly
231+
:param chromosome_name: name of the sequence
232+
:param position: position
233+
:return: reference_allele: base found at this chromosome_name at this position within this fasta_file
234+
"""
235+
# using .index instead of .todict for memory efficiency: https://biopython.org/docs/1.76/api/Bio.SeqIO.html#input-multiple-records
236+
records_dictionary = SeqIO.index(fasta_file, "fasta")
237+
zero_indexed_position = position - 1 # minus one because zero indexed
238+
reference_allele = records_dictionary[chromosome_name].seq[zero_indexed_position]
239+
records_dictionary.close()
240+
return reference_allele
241+
228242
def get_gvf_attributes(column9_of_gvf):
229243
"""Get a dictionary of GVF attributes
230244
:param column9_of_gvf: attributes column - the final column of the GVF file
@@ -409,6 +423,7 @@ class VcfLine:
409423
def __init__(self, gvf_feature_line_object,
410424
dgva_attribute_dict,
411425
gvf_attribute_dict,
426+
assembly_file,
412427
lines_custom_structured,
413428
lines_standard_ALT,
414429
lines_standard_INFO,
@@ -432,9 +447,10 @@ def __init__(self, gvf_feature_line_object,
432447
all_possible_FILTER_lines,
433448
all_possible_FORMAT_lines)
434449

450+
self.assembly = assembly_file
435451
# DATALINE
436452
self.chrom = gvf_feature_line_object.seqid
437-
self.pos = gvf_feature_line_object.start
453+
self.pos = int(gvf_feature_line_object.start)
438454
self.id = self.vcf_value["ID"] # attributes: ID
439455
self.ref = self.get_ref()
440456
self.alt = self.vcf_value["Variant_seq"] # attributes: variant_seq
@@ -443,7 +459,7 @@ def __init__(self, gvf_feature_line_object,
443459

444460
# INFO
445461
#TODO: specific SV info keys populated from gvf_feature_line
446-
self.key = self.chrom + "_" + self.pos
462+
self.key = self.chrom + "_" + str(self.pos)
447463
self.info = "pending_aggregation" # TODO: add info field for self.info
448464
self.source = gvf_feature_line_object.source
449465

@@ -497,7 +513,12 @@ def get_ref(self):
497513
if "Reference_seq" in self.vcf_value.keys():
498514
return self.vcf_value["Reference_seq"] # attributes:reference_seq
499515
else:
500-
return "." # TODO: how shall we fill this in with this scenario?
516+
if self.assembly:
517+
reference_allele = extract_reference_allele(self.assembly, self.chrom, self.pos)
518+
else:
519+
print("WARNING: No reference provided. Placeholder inserted for Reference allele.")
520+
reference_allele = "."
521+
return reference_allele
501522

502523
def __str__(self):
503524
string_to_return = '\t'.join((self.chrom, self.pos, self.key, self.qual, self.filter, self.info, self.source, self.phase, self.end, self.so_type, self.sample_name, self.format))
@@ -616,6 +637,7 @@ def generate_vcf_header_line(samples):
616637
def gvf_features_to_vcf_objects(gvf_lines_obj_list,
617638
dgva_attribute_dict,
618639
gvf_attribute_dict,
640+
assembly_file,
619641
lines_custom_structured,
620642
lines_standard_ALT,
621643
lines_standard_INFO,
@@ -628,6 +650,7 @@ def gvf_features_to_vcf_objects(gvf_lines_obj_list,
628650
""" Creates VCF objects from GVF feature lines and stores the VCF objects.
629651
:param gvf_lines_obj_list: list of GVF feature line objects
630652
:param gvf_attribute_dict: dictionary of GVF INFO attributes
653+
:param assembly_file: FASTA file to assembly
631654
:param dgva_attribute_dict: dictionary af DGVa specific INFO attributes
632655
:param lines_custom_structured: list to store custom structured metainformation lines
633656
:param lines_standard_ALT: ALT lines for this VCF file
@@ -650,6 +673,7 @@ def gvf_features_to_vcf_objects(gvf_lines_obj_list,
650673
vcf_object = VcfLine(gvf_featureline,
651674
dgva_attribute_dict,
652675
gvf_attribute_dict,
676+
assembly_file,
653677
lines_custom_structured,
654678
lines_standard_ALT,
655679
lines_standard_INFO,
@@ -724,9 +748,12 @@ def main():
724748
parser = argparse.ArgumentParser()
725749
parser.add_argument("gvf_input", help="GVF input file.")
726750
parser.add_argument("vcf_output", help="VCF output file.")
751+
parser.add_argument("-a", "--assembly", help="FASTA assembly file")
727752
args = parser.parse_args()
728753
print("The provided input file is: ", args.gvf_input)
729754
print("The provided output file is: ", args.vcf_output)
755+
if args.assembly:
756+
print("The provided assembly file is: ", args.assembly)
730757

731758
all_possible_INFO_lines = generate_all_possible_standard_structured_info_lines()
732759
all_possible_ALT_lines = generate_all_possible_standard_structured_alt_lines()
@@ -747,9 +774,15 @@ def main():
747774
gvf_info_attributes_file = os.path.join(etc_folder, 'gvfINFOattributes.tsv')
748775
dgva_attribute_dict = read_dgva_info_attributes(dgva_info_attributes_file=dgva_info_attributes_file) # needed to generate custom strings
749776
gvf_attribute_dict = read_gvf_info_attributes(gvf_info_attributes_file=gvf_info_attributes_file)
777+
if args.assembly:
778+
assembly_file = os.path.abspath(args.assembly)
779+
else:
780+
assembly_file = None
781+
750782
vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list,
751783
dgva_attribute_dict,
752784
gvf_attribute_dict,
785+
assembly_file,
753786
lines_custom_structured,
754787
lines_standard_ALT,
755788
lines_standard_INFO,

tests/input/zebrafish.fa

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>chromosome1
2+
ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
3+
>chromosome2
4+
TTTTTTT

tests/input/zebrafish.gvf

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
#sample: sample_name=Zon9;subject_name=Zon9
1919
#sample: sample_name=JenMale7;subject_name=JenMale7
2020
#testing_unknown_pragma
21-
1 DGVa copy_number_loss 776614 786127 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=.
22-
1 DGVa copy_number_loss 1277246 1320592 . + . ID=12;Name=nssv1406143;Alias=CNV22899;variant_call_so_id=SO:0001743;parent=nsv811095;Start_range=.,1277246;End_range=1320592,.;submitter_variant_call_id=CNV22899;sample_name=Zon9;remap_score=.87402;Variant_seq=.
23-
1 DGVa copy_number_gain 1284210 1320592 . + . ID=13;Name=nssv1389474;Alias=CNV6230;variant_call_so_id=SO:0001742;parent=nsv811095;Start_range=.,1284210;End_range=1320592,.;submitter_variant_call_id=CNV6230;sample_name=JenMale7;remap_score=.69625;Variant_seq=.
24-
1 DGVa copy_number_gain 1284210 1320592 . + . ID=14;Name=nssv1388955;Alias=CNV5711;variant_call_so_id=SO:0001742;parent=nsv811095;Start_range=.,1284210;End_range=1320592,.;submitter_variant_call_id=CNV5711;sample_name=JenMale6;remap_score=.85344;Variant_seq=.;allele_count=3
21+
chromosome1 DGVa copy_number_loss 77 78 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=.
22+
chromosome1 DGVa copy_number_loss 127 132 . + . ID=12;Name=nssv1406143;Alias=CNV22899;variant_call_so_id=SO:0001743;parent=nsv811095;Start_range=.,1277246;End_range=1320592,.;submitter_variant_call_id=CNV22899;sample_name=Zon9;remap_score=.87402;Variant_seq=.
23+
chromosome1 DGVa copy_number_gain 128 132 . + . ID=13;Name=nssv1389474;Alias=CNV6230;variant_call_so_id=SO:0001742;parent=nsv811095;Start_range=.,1284210;End_range=1320592,.;submitter_variant_call_id=CNV6230;sample_name=JenMale7;remap_score=.69625;Variant_seq=.
24+
chromosome1 DGVa copy_number_gain 128 132 . + . ID=14;Name=nssv1388955;Alias=CNV5711;variant_call_so_id=SO:0001742;parent=nsv811095;Start_range=.,1284210;End_range=1320592,.;submitter_variant_call_id=CNV5711;sample_name=JenMale6;remap_score=.85344;Variant_seq=.;allele_count=3

tests/test_convert_gvf_to_vcf.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ def setUp(self):
1919
self.dgva_input_file = os.path.join(self.input_folder_parent, "etc","dgvaINFOattributes.tsv")
2020
self.gvf_input_file = os.path.join(self.input_folder_parent, "etc","gvfINFOattributes.tsv")
2121
self.output_file = os.path.join(input_folder, "input", "a.vcf")
22+
self.assembly = os.path.join(input_folder, "input", "zebrafish.fa")
2223

2324
def test_read_in_gvf_file(self):
2425
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
@@ -38,6 +39,7 @@ def test_gvf_features_to_vcf_objects(self):
3839
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
3940
dgva_attribute_dict = read_dgva_info_attributes(self.dgva_input_file)
4041
gvf_attribute_dict = read_gvf_info_attributes(self.gvf_input_file)
42+
assembly_file = self.assembly
4143
# custom meta-information lines for this VCF file
4244
lines_custom_structured = []
4345
lines_custom_unstructured = []
@@ -53,7 +55,9 @@ def test_gvf_features_to_vcf_objects(self):
5355
all_possible_FORMAT_lines = generate_all_possible_standard_structured_format_lines()
5456

5557
vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list, dgva_attribute_dict,
56-
gvf_attribute_dict, lines_custom_structured,
58+
gvf_attribute_dict,
59+
assembly_file,
60+
lines_custom_structured,
5761
lines_standard_ALT, lines_standard_INFO,
5862
lines_standard_FILTER, lines_standard_FORMAT,
5963
all_possible_ALT_lines,
@@ -65,12 +69,13 @@ def test_gvf_features_to_vcf_objects(self):
6569
assert len(list_of_vcf_objects) > 1
6670

6771
def test_get_ref(self):
68-
gvf_feature_line = "1 DGVa copy_number_loss 776614 786127 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=."
72+
gvf_feature_line = "chromosome1 DGVa copy_number_loss 77 78 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=."
6973
f_list = gvf_feature_line.split("\t")
7074
line_object = GvfFeatureline(f_list[0], f_list[1], f_list[2], f_list[3], f_list[4], f_list[5], f_list[6], f_list[7], f_list[8])
7175
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
7276
dgva_attribute_dict = read_dgva_info_attributes(self.dgva_input_file)
7377
gvf_attribute_dict = read_gvf_info_attributes(self.gvf_input_file)
78+
assembly_file = self.assembly
7479
# custom meta-information lines for this VCF file
7580
lines_custom_structured = []
7681
lines_custom_unstructured = []
@@ -88,6 +93,7 @@ def test_get_ref(self):
8893
v = VcfLine(line_object,
8994
dgva_attribute_dict,
9095
gvf_attribute_dict,
96+
assembly_file,
9197
lines_custom_structured,
9298
lines_standard_ALT,
9399
lines_standard_INFO,
@@ -98,12 +104,13 @@ def test_get_ref(self):
98104
all_possible_FILTER_lines,
99105
all_possible_FORMAT_lines)
100106
reference_allele = v.get_ref()
101-
assert reference_allele == "."
107+
assert len(reference_allele) != 0
102108

103109
def test_generate_vcf_metainformation(self):
104110
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
105111
dgva_attribute_dict = read_dgva_info_attributes(self.dgva_input_file)
106112
gvf_attribute_dict = read_gvf_info_attributes(self.gvf_input_file)
113+
assembly_file = self.assembly
107114
# custom meta-information lines for this VCF file
108115
lines_custom_structured = []
109116
lines_custom_unstructured = []
@@ -121,6 +128,7 @@ def test_generate_vcf_metainformation(self):
121128
vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list,
122129
dgva_attribute_dict,
123130
gvf_attribute_dict,
131+
assembly_file,
124132
lines_custom_structured,
125133
lines_standard_ALT,
126134
lines_standard_INFO,
@@ -158,6 +166,7 @@ def test_generate_vcf_header_line(self):
158166
vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list,
159167
dgva_attribute_dict,
160168
gvf_attribute_dict,
169+
self.assembly,
161170
lines_custom_structured,
162171
lines_standard_ALT,
163172
lines_standard_INFO,
@@ -196,6 +205,7 @@ def test_populate_sample_formats(self):
196205
vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list,
197206
dgva_attribute_dict,
198207
gvf_attribute_dict,
208+
self.assembly,
199209
lines_custom_structured,
200210
lines_standard_ALT,
201211
lines_standard_INFO,
@@ -234,6 +244,7 @@ def test_format_sample_values(self):
234244
vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list,
235245
dgva_attribute_dict,
236246
gvf_attribute_dict,
247+
self.assembly,
237248
lines_custom_structured,
238249
lines_standard_ALT,
239250
lines_standard_INFO,
@@ -255,6 +266,7 @@ def test_format_vcf_datalines(self):
255266
gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file)
256267
dgva_attribute_dict = read_dgva_info_attributes(self.dgva_input_file)
257268
gvf_attribute_dict = read_gvf_info_attributes(self.gvf_input_file)
269+
assembly_file = self.assembly
258270
# custom meta-information lines for this VCF file
259271
lines_custom_structured = []
260272
lines_custom_unstructured = []
@@ -272,6 +284,7 @@ def test_format_vcf_datalines(self):
272284
vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list,
273285
dgva_attribute_dict,
274286
gvf_attribute_dict,
287+
assembly_file,
275288
lines_custom_structured,
276289
lines_standard_ALT,
277290
lines_standard_INFO,

0 commit comments

Comments
 (0)