diff --git a/convert_gvf_to_vcf/convertGVFtoVCF.py b/convert_gvf_to_vcf/convertGVFtoVCF.py index 8ea14d2..5457efb 100644 --- a/convert_gvf_to_vcf/convertGVFtoVCF.py +++ b/convert_gvf_to_vcf/convertGVFtoVCF.py @@ -1,124 +1,110 @@ import argparse import os + from Bio import SeqIO # setting up paths to useful directories convert_gvf_to_vcf_folder = os.path.dirname(__file__) etc_folder = os.path.join(convert_gvf_to_vcf_folder, 'etc') -def read_reserved_key(header_type): - """ - Reads in the reserved INFO or FORMAT keys and returns a dictionary of the reserved INFO_lines which can be used to - populate the header - """ - reserved_lines = {} - reserved_info_keys_file = os.path.join(etc_folder, f'Reserved{header_type}keys.tsv') - with open(reserved_info_keys_file) as info_keys_file: - next(info_keys_file) # Skip the header - info_keys_content = info_keys_file.readlines() - for info in info_keys_content: - info_tokens = info.rstrip().split("\t") - keyid = info_tokens[0] - number=info_tokens[1] - type_for_key=info_tokens[2] - description=info_tokens[3] - reserved_info_string = f'##{header_type}=' - reserved_lines[keyid] = reserved_info_string - return reserved_lines - -def read_sv_key(header_type): - """ - Reads in INFO, ALT or FORMAT keys for structural variants and return a dictionary of SV specific lines - """ - sv_lines = {} - sv_keys_file = os.path.join(etc_folder, f'sv{header_type}keys.tsv') - with open(sv_keys_file) as open_file: - next(open_file) # Skip the header - for line in open_file: - sv_tokens = line.rstrip().split("\t") - sv_key_id = sv_tokens[0] - sv_line = sv_tokens[1] - sv_lines[sv_key_id]= sv_line - return sv_lines - -def generate_custom_structured_metainfomation_line(vcfkey, vcfkey_id, vcfkey_number, vcfkey_type, vcfkey_description, - optional_extrafields=None): +def read_file(prefix, header_type): + """Reads in {reserved/sv}{INFO/FORMAT}keys.tsv files and returns the dictionary where the key is the KEYID + (usually column 1) and the value is a list of file tokens + :param prefix: prefix of files to read i.e. sv or reserved + :param header_type: type of header file to read i.e. INFO or FORMAT + :return: file lines: a dictionary where the key is the KEYID and the value is a list of file tokens + """ + file_lines = {} + keys_tsv_file = os.path.join(etc_folder, f'{prefix}{header_type}keys.tsv') + try: + with open(keys_tsv_file) as keys_file: + next(keys_file) # Skip the header + for line in keys_file: + file_tokens = line.rstrip().split("\t") + key_id = file_tokens[0] + number_of_tokens = len(file_tokens) + if number_of_tokens >= 2: + for token in range(number_of_tokens): + value_to_add = file_tokens[token] + file_lines.setdefault(key_id, []).append(value_to_add) + except FileNotFoundError: + print(f'File not found: {keys_tsv_file}') + return file_lines + + +def generate_custom_structured_metainformation_line(vcf_key, vcf_key_id, vcf_key_number, vcf_key_type, vcf_key_description, + optional_extra_fields=None): """ Generates a custom structured meta-information line for INFO/FILTER/FORMAT/ALT - :param vcfkey: required field INFO, FILTER, FORMAT, ALT - :param vcfkey_id: required field for structured lines ID - :param vcfkey_number: The number of values that can be included or special character: A or R or G or . - :param vcfkey_type: Values are Integer, Float, Character, String - :param vcfkey_description: Description - :param optional_extrafields: an optional field, dictionary of custom fields and their values + :param vcf_key: required field INFO, FILTER, FORMAT, ALT + :param vcf_key_id: required field for structured lines ID + :param vcf_key_number: The number of values that can be included or special character: A or R or G or . + :param vcf_key_type: Values are Integer, Float, Character, String + :param vcf_key_description: Description + :param optional_extra_fields: an optional field, dictionary of custom fields and their values :return: custom_structured_string """ extra_keys_kv_lines = [] - if optional_extrafields: - for extra_field in optional_extrafields: - kv_line = "," + extra_field + "=" + '"' + optional_extrafields[extra_field] + '"' + if optional_extra_fields: + for extra_field in optional_extra_fields: + kv_line = "," + extra_field + "=" + '"' + optional_extra_fields[extra_field] + '"' extra_keys_kv_lines.append(kv_line) vcf_key_extra_keys = ''.join(extra_keys_kv_lines) - custom_structured_string = f'##{vcfkey}=' + custom_structured_string = (f'##{vcf_key}=<' + f'ID="{vcf_key_id}",' + f'Number="{vcf_key_number}",' + f'Type="{vcf_key_type}",' + f'Description="{vcf_key_description}"' + f'{vcf_key_extra_keys}>') return custom_structured_string -def generate_all_possible_standard_structured_alt_lines(): - """Generates a dictionary of all possible (i.e. structural variant ALT key) standard structured ALT lines. - :return: all_possible_ALT_lines: dictionary of ALT key tag ID => standard structured ALT line - """ - # note: svALTkey may be an incomplete list at the moment - # no reserved alt keys - all_possible_ALT_lines = read_sv_key('ALT') - return all_possible_ALT_lines - - -def generate_all_possible_standard_structured_info_lines(): - """ Generates a dictionary of all possible (i.e. reserved info key and structural variant info key) standard structured INFO lines. - :return: all_possible_INFO_lines: dictionary of INFO key tag ID => standard structured INFO line - """ - all_possible_INFO_lines = {} # dictionary of INFO key tag => standard structured INFO line - # generate all possible lines for the reserved info keys and structural variant info keys - all_possible_INFO_lines.update(read_reserved_key('INFO')) - all_possible_INFO_lines.update(read_sv_key('INFO')) - return all_possible_INFO_lines - - -def generate_all_possible_standard_structured_filter_lines(): - """ Generates a dictionary of all possible (i.e. reserved filter key and structural variant info key) standard structured INFO lines. - :return: all_possible_FILTER_lines: dictionary of FILTER key tag ID => standard structured FILTER line - """ - all_possible_FILTER_lines = {} # dictionary of INFO key tag => standard structured INFO line - #TODO: fill in the reading of filtered lines - return all_possible_FILTER_lines - - -def generate_all_possible_standard_structured_format_lines(): - """ Generates a dictionary of all possible (i.e. reserved format key and structural variant info key) standard structured FORMAT lines. - :return: all_possible_FORMAT_lines: dictionary of FORMAT key tag ID => standard structured FORMAT line +def generate_all_possible_standard_structured_lines(header_type): + """ Generates a fictionary of all possible standard structured lines for INFO/FILTER/FORMAT/ALT + :param header_type: type of header file to read i.e. ALT, FILTER, INFO or FORMAT + :return: dictionary of all possible standard structured lines keys for the header type """ - all_possible_FORMAT_lines = {} - # TABLE 2 - # FORMAT KEYS FOR STRUCTURAL VARIANTS - all_possible_FORMAT_lines.update(read_reserved_key('FORMAT')) - all_possible_FORMAT_lines.update(read_sv_key('FORMAT')) - return all_possible_FORMAT_lines - - -def generate_standard_structured_metainformation_line(vcf_key_id, standard_lines_for_vcfkey, all_possible_lines): - """Generates a list of standard structured metainformation lines. + all_possible_lines = {} + prefix = {} + prefix["reserved"] = True + prefix["sv"] = True + if header_type == 'ALT': + prefix["reserved"] = False + if prefix["reserved"]: + reserved_key = read_file("reserved", header_type) + for r_key in reserved_key: + key_id = reserved_key[r_key][0] + number = reserved_key[r_key][1] + type_for_key = reserved_key[r_key][2] + description = reserved_key[r_key][3] + reserved_string = f'##{header_type}=' + all_possible_lines[key_id] = reserved_string + if prefix['sv']: + sv_key = read_file("sv", header_type) + for s_key in sv_key: + sv_key_id = sv_key[s_key][0] + sv_line = sv_key[s_key][1] + all_possible_lines[sv_key_id] = sv_line + return all_possible_lines + + +def generate_standard_structured_metainformation_line(vcf_key_id, standard_lines_for_vcf_key, all_possible_lines): + """Generates a standard structured metainformation lines. :param vcf_key_id: VCF tag key id - :param standard_lines_for_vcfkey: lines_standard_NAME i.e a list of standard lines for this VCF file regarding INFO or ALT or FILTER or FORMAT - :param all_possible_lines: all_possible_NAME_lines i.e list of all possible lines for INFO or ALT or FILTER or FORMAT - :return: standard_lines_for_vcfkey + :param standard_lines_for_vcf_key: lines_standard_NAME i.e. list of standard lines for VCF INFO/ALT/FILTER/FORMAT + :param all_possible_lines: all_possible_NAME_lines i.e. list of all possible lines for INFO or ALT or FILTER or FORMAT + :return: standard_structured_line: a string """ standard_structured_line = all_possible_lines[vcf_key_id] - standard_lines_for_vcfkey.append(standard_structured_line) - return standard_lines_for_vcfkey + standard_lines_for_vcf_key.append(standard_structured_line) + return standard_structured_line -def generate_custom_unstructured_metainfomation_line(vcf_unstructured_key, vcf_unstructured_value, lines_custom_unstructured): - """ Generates a formatted unstructured metainformation line using a custom key value pair. This is stored in the list called lines_custom_unstructured. +def generate_custom_unstructured_metainformation_line(vcf_unstructured_key, + vcf_unstructured_value, + lines_custom_unstructured): + """ Generates a formatted unstructured metainformation line using a custom key value pair. + This is stored in the list called lines_custom_unstructured. :param lines_custom_unstructured: list to store custom unstructured metainformation lines :param vcf_unstructured_key: key for custom unstructured metainformation line :param vcf_unstructured_value: value for custom unstructured metainformation line @@ -128,6 +114,7 @@ def generate_custom_unstructured_metainfomation_line(vcf_unstructured_key, vcf_u lines_custom_unstructured.append(custom_unstructured_string) return custom_unstructured_string + def read_info_attributes(info_attributes_file): """ Read in the file containing specific INFO attributes. :param info_attributes_file: A file containing the specific attributes @@ -143,26 +130,51 @@ def read_info_attributes(info_attributes_file): return attribute_dict -def extract_reference_allele(fasta_file, chromosome_name, position): +def read_sequence_ontology_symbolic_allele(so_symbolic_allele_file): + """ Read in the file containing sequence ontology symbolic allele and returns a dictionary. + :param: so_symbolic_allele_file - the file of sequence ontology symbolic alleles. + :return: symbolic allele dictionary - symbolic alleles as key and list of variant types as the value. + """ + symbolic_allele_dict = {} + with open(so_symbolic_allele_file) as so_symbolic_allele: + next(so_symbolic_allele) + for line in so_symbolic_allele: + allele_tokens = line.rstrip().split("\t") + symb_allele = allele_tokens[0] + sequence_ontology_id = allele_tokens[2] + name = allele_tokens[3] + description = allele_tokens[4] + symbolic_allele_dict.setdefault(name, []).append(sequence_ontology_id) + symbolic_allele_dict.setdefault(name, []).append(symb_allele) + symbolic_allele_dict.setdefault(name, []).append(description) + return symbolic_allele_dict + + +def extract_reference_allele(fasta_file, chromosome_name, position, end): """ Extracts the reference allele from the assembly. :param fasta_file: FASTA file of the assembly :param chromosome_name: name of the sequence :param position: position + :param end: end position :return: reference_allele: base found at this chromosome_name at this position within this fasta_file """ - # using .index instead of .todict for memory efficiency: https://biopython.org/docs/1.76/api/Bio.SeqIO.html#input-multiple-records + # using .index for memory efficiency: https://biopython.org/docs/1.76/api/Bio.SeqIO.html#input-multiple-records records_dictionary = SeqIO.index(fasta_file, "fasta") - zero_indexed_position = position - 1 # minus one because zero indexed - reference_allele = records_dictionary[chromosome_name].seq[zero_indexed_position] + zero_indexed_position = position - 1 # minus one because zero indexed + zero_indexed_end = end - 1 + reference_allele = "" + for position in range(zero_indexed_position, zero_indexed_end): + reference_allele = reference_allele + records_dictionary[chromosome_name].seq[position] records_dictionary.close() return reference_allele + def get_gvf_attributes(column9_of_gvf): """Get a dictionary of GVF attributes - :param column9_of_gvf: attributes column - the final column of the GVF file + :param column9_of_gvf: column - the final column of the GVF file :return: gvf_attribute_dictionary: a dictionary of attribute keys and their values """ - gvf_attribute_dictionary = {} # attribute key => value + gvf_attribute_dictionary = {} # attribute key => value # parse by semicolon this creates attribute # parse by equals sign this creates tag-values, if the value is a comma, create a list attributes_in_gvf_line = column9_of_gvf.split(";") @@ -181,46 +193,42 @@ def convert_gvf_attributes_to_vcf_values(column9_of_gvf, dgva_attribute_dict, gvf_attribute_dict, lines_custom_structured, - lines_standard_ALT, - lines_standard_INFO, - lines_standard_FILTER, - lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines): - + standard_lines_dictionary, + all_possible_lines_dictionary): gvf_attribute_dictionary = get_gvf_attributes(column9_of_gvf) vcf_vals = {} catching_for_review = [] - # created a rough guide to attributes_for_custom_structured_metainfomation in dgvaINFOattributes.tsv = this probably should be refined at a later date + + # created a rough guide to attributes_for_custom_structured_metainformation in dgvaINFOattributes.tsv = this probably should be refined at a later date # TODO: edit dgvaINFOattributes.tsv i.e. replace unknown placeholders '.' with the actual answer, provide a more informative description + print(gvf_attribute_dictionary) for attrib_key in gvf_attribute_dictionary: # if dgva specific key, create custom string otherwise do standard if attrib_key in dgva_attribute_dict: lines_custom_structured.append( - generate_custom_structured_metainfomation_line(vcfkey="INFO", vcfkey_id=attrib_key, - vcfkey_number=dgva_attribute_dict[attrib_key][1], - vcfkey_type=dgva_attribute_dict[attrib_key][2], - vcfkey_description=dgva_attribute_dict[attrib_key][3], - optional_extrafields=None) + generate_custom_structured_metainformation_line(vcf_key="INFO", vcf_key_id=attrib_key, + vcf_key_number=dgva_attribute_dict[attrib_key][1], + vcf_key_type=dgva_attribute_dict[attrib_key][2], + vcf_key_description=dgva_attribute_dict[attrib_key][3], + optional_extra_fields=None) ) vcf_vals[attrib_key]=gvf_attribute_dictionary[attrib_key] elif attrib_key == "allele_count": #generate_standard_structured_metainformation_line("INFO", "AC", lines_standard_ALT, lines_standard_INFO, lines_standard_FILTER, lines_standard_FORMAT, all_possible_ALT_lines, all_possible_INFO_lines, all_possible_FILTER_lines, all_possible_FORMAT_lines) - lines_standard_INFO = generate_standard_structured_metainformation_line("AC", lines_standard_INFO, all_possible_INFO_lines) + lines_standard_info_to_add = generate_standard_structured_metainformation_line("AC", standard_lines_dictionary["INFO"], all_possible_lines_dictionary["INFO"]) + standard_lines_dictionary["INFO"].append(lines_standard_info_to_add) elif attrib_key == "allele_frequency": - lines_standard_INFO = generate_standard_structured_metainformation_line("AF", lines_standard_INFO, all_possible_INFO_lines) + lines_standard_info = generate_standard_structured_metainformation_line("AF", standard_lines_dictionary["INFO"], all_possible_lines_dictionary["INFO"]) elif attrib_key == "ciend": - lines_standard_INFO = generate_standard_structured_metainformation_line("CIEND", lines_standard_INFO, all_possible_INFO_lines) + lines_standard_info = generate_standard_structured_metainformation_line("CIEND", standard_lines_dictionary["INFO"], all_possible_lines_dictionary["INFO"]) elif attrib_key == "copy_number": - lines_standard_INFO = generate_standard_structured_metainformation_line("CN", lines_standard_INFO, all_possible_INFO_lines) + lines_standard_info = generate_standard_structured_metainformation_line("CN", standard_lines_dictionary["INFO"], all_possible_lines_dictionary["INFO"]) elif attrib_key == "insertion_length": - lines_standard_INFO = generate_standard_structured_metainformation_line("SVLEN", lines_standard_INFO, - all_possible_INFO_lines) + lines_standard_info = generate_standard_structured_metainformation_line("SVLEN", standard_lines_dictionary["INFO"], + all_possible_lines_dictionary["INFO"]) elif attrib_key == "mate_id": - lines_standard_INFO = generate_standard_structured_metainformation_line("MATEID", lines_standard_INFO, - all_possible_INFO_lines) + lines_standard_info = generate_standard_structured_metainformation_line("MATEID", standard_lines_dictionary["INFO"], + all_possible_lines_dictionary["INFO"]) elif attrib_key == "sample_name": #sample_names.append(sample_names) pass @@ -235,12 +243,12 @@ def convert_gvf_attributes_to_vcf_values(column9_of_gvf, attrib_key == "Reference_codon" or attrib_key == "Variant_aa" or attrib_key == "Reference_aa" or attrib_key == "breakpoint_detail" or attrib_key == "Sequence_context"): lines_custom_structured.append( - generate_custom_structured_metainfomation_line( - vcfkey="INFO", vcfkey_id=attrib_key, - vcfkey_number=gvf_attribute_dict[attrib_key][1], - vcfkey_type=gvf_attribute_dict[attrib_key][2], - vcfkey_description=gvf_attribute_dict[attrib_key][3], - optional_extrafields=None) + generate_custom_structured_metainformation_line( + vcf_key="INFO", vcf_key_id=attrib_key, + vcf_key_number=gvf_attribute_dict[attrib_key][1], + vcf_key_type=gvf_attribute_dict[attrib_key][2], + vcf_key_description=gvf_attribute_dict[attrib_key][3], + optional_extra_fields=None) ) elif attrib_key == "Dbxref": # custom info tag + pase and add to id? @@ -287,7 +295,7 @@ class GvfFeatureline: def __init__(self, seqid, source, so_type, start, end, score, strand, phase, attributes): """ Initialise GvfFeature line :param seqid: sequence ID i.e. chromosome number - :param source: source i.e DGVa + :param source: source i.e. DGVa :param so_type: sequence ontology structural variant type :param start: start position :param end: end position @@ -345,55 +353,47 @@ class VcfLine: def __init__(self, gvf_feature_line_object, dgva_attribute_dict, gvf_attribute_dict, + symbolic_allele_dictionary, assembly_file, lines_custom_structured, - lines_standard_ALT, - lines_standard_INFO, - lines_standard_FILTER, - lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines): + standard_lines_dictionary, + all_possible_lines_dictionary): + # ATTRIBUTES self.vcf_value = convert_gvf_attributes_to_vcf_values(gvf_feature_line_object.attributes, dgva_attribute_dict, gvf_attribute_dict, lines_custom_structured, - lines_standard_ALT, - lines_standard_INFO, - lines_standard_FILTER, - lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines) - + standard_lines_dictionary, + all_possible_lines_dictionary) self.assembly = assembly_file - # DATALINE + self.symbolic_allele_dictionary = symbolic_allele_dictionary + self.iupac_ambiguity_dictionary = self.build_iupac_ambiguity_code() + # GVF + self.source = gvf_feature_line_object.source + self.so_type = gvf_feature_line_object.feature_type #currently column 3 of gvf, but could be an attribute so perhapsVCF: INFO or FORMAT? + self.end = int(gvf_feature_line_object.end) + self.phase = gvf_feature_line_object.phase # this is always a placeholder'.' + + # VCF DATALINE self.chrom = gvf_feature_line_object.seqid self.pos = int(gvf_feature_line_object.start) self.id = self.vcf_value["ID"] # attributes: ID - self.ref = self.get_ref() - self.alt = self.vcf_value["Variant_seq"] # attributes: variant_seq + self.length = self.end - self.pos self.qual = gvf_feature_line_object.score # see EVA-3879: this is always '.' self.filter = "." # this is always a placeholder '.'; perhaps could add s50. # INFO #TODO: specific SV info keys populated from gvf_feature_line self.key = self.chrom + "_" + str(self.pos) - self.info = "pending_aggregation" # TODO: add info field for self.info - self.source = gvf_feature_line_object.source - - # # non-vcf - self.phase = gvf_feature_line_object.phase # this is always a placeholder'.' - self.end = gvf_feature_line_object.end - self.so_type = gvf_feature_line_object.feature_type #currently column 3 of gvf, but could be an attribute so perhapsVCF: INFO or FORMAT? + self.info = [] # TODO: add info field for self.info + # calculated last + self.ref = self.get_ref() + self.alt = self.get_alt(standard_lines_dictionary, all_possible_lines_dictionary) self.sample_name = self.vcf_value["sample_name"] # this should be each samples names format value # sample names needs to be populated in attributes # # higher priority self.format = "pending" #TODO: set this in convertgvfattributes - # # each item in the list exclude_from_info has its own place in the VCF file, so not part of info # exclude_from_info = ["ID", # done above # "Variant_seq", # done above @@ -428,38 +428,243 @@ def __init__(self, gvf_feature_line_object, # else: # pass + def add_padded_base(self, ref, alt, placed_before : bool): + """ Adds padded base to REF and ALT allele + :param ref: reference allele + :param alt: alt allele + :param placed_before: padded base is placed before ref or alt True or False + :return: (padded_base, self.pos, self.ref, self.alt) + """ + if placed_before: + padded_base_pos = self.pos - 1 + self.pos = padded_base_pos + padded_base = extract_reference_allele(self.assembly, self.chrom, self.pos, self.end) + ref = padded_base + ref + if alt == ".": + alt = padded_base + else: + alt = padded_base + alt + elif not placed_before: + padded_base_pos = self.pos + 1 + new_end = self.end + 1 + padded_base = extract_reference_allele(self.assembly, self.chrom, padded_base_pos, new_end) + ref = ref + padded_base + if alt == ".": + alt = padded_base + else: + alt = alt + padded_base + else: + print("WARNING: Variable placed_before unknown: " + str(placed_before)) + padded_base = None + return padded_base, self.pos, ref, alt + + def build_iupac_ambiguity_code(self): + """ Builds dictionary for the iupac ambiguity code + :return: iupac_ambiguity_dictionary: iupac code as key, list of values as value + """ + # see PMID: 20202974 (Table 1) for the official list + iupac_codes = ["R", "Y", "M", "K", "S", "D", "W", "H", "B", "V", "D", "N"] + R = ["A", "G"] + Y = ["C", "T"] + M = ["A", "C"] + K = ["G", "T"] + S = ["C", "G"] + W = ["A", "T"] + H = ["A", "C", "T"] + B = ["C", "G", "T"] + V = ["A", "C", "G"] + D = ["A", "G", "T"] + N = ["A", "C", "G", "T"] + iupac_values = [R, Y, M, K, S, D, W, H, B, V, D, N] + iupac_ambiguity_dictionary = dict(zip(iupac_codes, iupac_values)) + return iupac_ambiguity_dictionary + + def convert_iupac_ambiguity_code(self, iupac_ambiguity_dictionary, ref_to_convert): + """ Converts the REF allele if it contains IUPAC ambiguity cod + :param iupac_ambiguity_dictionary: dictionary of IUPAC ambiguity code and a list of values + :param ref_to_convert: reference allele to be converted + :return: self.ref + """ + converted_ref = "" + for base in ref_to_convert: + if base in iupac_ambiguity_dictionary: + iupac_value = min(iupac_ambiguity_dictionary[base]) + converted_base = iupac_value + else: + converted_base = base + converted_ref = converted_ref + converted_base + return converted_ref + + def check_ref(self, ref_allele_to_be_checked): + """ Checks whether a reference allele meets the requirements of the VCF specification + :param ref_allele_to_be_checked: reference allele to check + :return: checked_reference_allele: reference allele that meets the requirements of the VCF specification""" + if isinstance(ref_allele_to_be_checked, str): + if not all(bases in ref_allele_to_be_checked for bases in ["A", "C", "G", "T", "N"]): + checked_reference_allele = self.convert_iupac_ambiguity_code(self.iupac_ambiguity_dictionary, ref_allele_to_be_checked) + else: + checked_reference_allele = ref_allele_to_be_checked + else: + print("WARNING: Ref allele must be a string. Setting to missing value.", ref_allele_to_be_checked) + checked_reference_allele = "." + return checked_reference_allele + def get_ref(self): """ Gets the reference allele from attributes column or if not found, returns "." :return: reference allele """ if "Reference_seq" in self.vcf_value.keys(): - return self.vcf_value["Reference_seq"] # attributes:reference_seq + reference_allele = self.vcf_value["Reference_seq"] else: if self.assembly: - reference_allele = extract_reference_allele(self.assembly, self.chrom, self.pos) + reference_allele = extract_reference_allele(self.assembly, self.chrom, self.pos, self.end) else: print("WARNING: No reference provided. Placeholder inserted for Reference allele.") reference_allele = "." - return reference_allele + if reference_allele != ".": + reference_allele = self.check_ref(reference_allele) + return reference_allele + + + def generate_symbolic_allele(self, standard_lines_dictionary, all_possible_lines_dictionary): + """ Generates the symbolic allele and stores the corresponding metainformation lines. Also determines if variant is precise or imprecise. + :param lines_standard_alt: stores ALT metainformation lines + :param lines_standard_info: stores INFO metainformation lines + :param all_possible_alt_lines: list of all possible ALT lines + :param all_possible_info_lines: list of all possible INFO lines + :return: symbolic_allele, self.info, lines_standard_ALT, lines_standard_INFO + """ + symbolic_allele_id = self.symbolic_allele_dictionary[self.so_type][1] + symbolic_allele = f'<{symbolic_allele_id}>' + + lines_standard_alt = standard_lines_dictionary["ALT"] + lines_standard_info = standard_lines_dictionary["INFO"] + all_possible_alt_lines = all_possible_lines_dictionary["ALT"] + all_possible_info_lines = all_possible_lines_dictionary["INFO"] + + if symbolic_allele_id in all_possible_alt_lines: + lines_standard_alt.append(all_possible_alt_lines[symbolic_allele_id]) + + info_svlen = None + if self.length: + info_svlen = "SVLEN=" + str(self.length) + + start_range_lower_bound = self.vcf_value["Start_range"][0] + start_range_upper_bound = self.vcf_value["Start_range"][1] + end_range_lower_bound = self.vcf_value["End_range"][0] + end_range_upper_bound = self.vcf_value["End_range"][1] + + # setting up fields to be inserted into INFO + info_end = None + info_imprecise = None + info_cipos = None + info_ciend = None + + if start_range_lower_bound == "." or start_range_upper_bound == "." or end_range_lower_bound == "." or end_range_upper_bound == ".": + is_imprecise = False + info_end = "END=" + str(self.pos + len(self.ref) - 1) + else: + is_imprecise = True + info_imprecise = "IMPRECISE" + + cipos_lower_bound = int(start_range_lower_bound) - self.pos + cipos_upper_bound = int(start_range_upper_bound) - self.pos + info_cipos = "CIPOS=" + str(cipos_lower_bound) + "," + str(cipos_upper_bound) + + ciend_lower_bound = int(start_range_lower_bound) - self.pos + ciend_upper_bound = int(start_range_upper_bound) - self.pos + info_ciend = "CIEND=" + str(ciend_lower_bound) + "," + str(ciend_upper_bound) + + if symbolic_allele == "": + info_end ="END=" + str( self.pos + len(self.ref) - 1 ) + elif symbolic_allele in {"", "", "", ""}: + info_end = "END=" + str(self.pos + self.length) + elif symbolic_allele == "<*>": + info_end = "END=" + str(self.pos + len(self.ref)) + else: + "Cannot identify symbolic allele" + # for all variants (precise and imprecise) + self.info.append(info_end) + lines_standard_info.append(all_possible_info_lines["END"]) + self.info.append(info_svlen) + lines_standard_info.append(all_possible_info_lines["SVLEN"]) + + # for imprecise variants only + if is_imprecise: + self.info.append(info_imprecise) + lines_standard_info.append(all_possible_info_lines["IMPRECISE"]) + self.info.append(info_cipos) + lines_standard_info.append(all_possible_info_lines["CIPOS"]) + self.info.append(info_ciend) + lines_standard_info.append(all_possible_info_lines["CIEND"]) + return symbolic_allele, self.info, lines_standard_alt, lines_standard_info + + def get_alt(self, standard_lines_dictionary, all_possible_lines_dictionary): + """ Gets the ALT allele for the VCF file + :param lines_standard_alt: store ALT lines + :param lines_standard_info: store NFO lines + :param all_possible_alt_lines: dictionary of all possible ALT lines + :param all_possible_info_lines: dictionary of all possible INFO lines + :return: symbolic_allele, self.info, lines_standard_ALT, lines_standard_INFO + """ + lines_standard_alt = standard_lines_dictionary["ALT"] + lines_standard_info = standard_lines_dictionary["INFO"] + all_possible_alt_lines = all_possible_lines_dictionary["ALT"] + all_possible_info_lines = all_possible_lines_dictionary["INFO"] + + if any(base in self.vcf_value["Variant_seq"] for base in ["A", "C", "G", "T", "N"]): + alterative_allele = self.vcf_value["Variant_seq"] + elif self.vcf_value["Variant_seq"] == '.': + symbolic_allele, self.info, lines_standard_alt, lines_standard_info = self.generate_symbolic_allele(standard_lines_dictionary, all_possible_lines_dictionary) + if symbolic_allele is None: + alterative_allele = "." + elif (self.vcf_value["Variant_seq"] == "." or self.vcf_value["Variant_seq"] == "-") and symbolic_allele is not None: + alterative_allele = symbolic_allele + # add padded bases + if self.pos == 1: + #print("pos, ref, alt",self.pos,self.ref, alterative_allele) + padded_base, self.pos, self.ref, self.alt = self.add_padded_base(self.ref, alterative_allele, False) + self.ref = self.check_ref(self.ref) + else: + #print("pos, ref, alt", self.pos,self.ref, alterative_allele) + padded_base, self.pos, self.ref, self.alt = self.add_padded_base(self.ref, alterative_allele, True) + self.ref = self.check_ref(self.ref) + else: + alterative_allele = "." + print("Cannot identify symbolic allele. Variant type is not supported.") + else: + alterative_allele = "." + print("Could not determine the alterative allele.") + return alterative_allele def __str__(self): string_to_return = '\t'.join((self.chrom, self.pos, self.key, self.qual, self.filter, self.info, self.source, self.phase, self.end, self.so_type, self.sample_name, self.format)) return string_to_return #step 9 using custom unstructured meta-information line = generate_custom_unstructured_metainfomation_line -def generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, gvf_non_essential, list_of_vcf_objects): +def generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, gvf_non_essential, list_of_vcf_objects, + standard_lines_dictionary): """ Generates a list of metainformation lines for the VCF header :param lines_custom_unstructured: a list of formatted unstructured metainformation lines using a custom key value pair :param gvf_pragmas: list of gvf pragmas to convert :param gvf_non_essential: list of non-essential gvf pragmas to convert :param list_of_vcf_objects: list of vcf objects + :param lines_standard_alt: list of ALT lines + :param lines_standard_info: list of INFO lines + :param lines_standard_filter: list of FILTER lines + :param lines_standard_format: list of FORMAT lines :return: unique_pragmas_to_add, sample_names: a list of pragmas (this list contains no duplicates), list of sample names """ pragmas_to_add = [] unique_pragmas_to_add = [] sample_names = [] + + unique_alt_lines_to_add = [] + unique_info_lines_to_add = [] + unique_filter_lines_to_add = [] + unique_format_lines_to_add = [] # MANDATORY: file format for VCF - pragma_fileformat = generate_custom_unstructured_metainfomation_line("fileformat", "VCFv4.4",lines_custom_unstructured) + pragma_fileformat = generate_custom_unstructured_metainformation_line("fileformat", "VCFv4.4", lines_custom_unstructured) pragmas_to_add.append(pragma_fileformat) #Go through essential pragmas #TODO: list of pragmas to add:reference=file, contig, phasing,INFO# @@ -467,30 +672,30 @@ def generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, gvf_non # file date if pragma.startswith("##file-date"): date = pragma.split(" ")[1].replace("-", "") - pragma_filedate = generate_custom_unstructured_metainfomation_line("fileDate", date, lines_custom_unstructured) + pragma_filedate = generate_custom_unstructured_metainformation_line("fileDate", date, lines_custom_unstructured) pragmas_to_add.append(pragma_filedate) # source for vcf_obj in list_of_vcf_objects: - pragma_source = generate_custom_unstructured_metainfomation_line("source", vcf_obj.source, lines_custom_unstructured) + pragma_source = generate_custom_unstructured_metainformation_line("source", vcf_obj.source, lines_custom_unstructured) pragmas_to_add.append(pragma_source) # reference #TODO: add this # contig: recommended, see section 1.4.7 of VCF specification # TODO: add this # phasing # not required if pragma.startswith("##gff-version"): gff_version_number = pragma.split(" ")[1] - pragma_gff_version_number = generate_custom_unstructured_metainfomation_line("gff-version", gff_version_number, lines_custom_unstructured) + pragma_gff_version_number = generate_custom_unstructured_metainformation_line("gff-version", gff_version_number, lines_custom_unstructured) pragmas_to_add.append(pragma_gff_version_number) elif pragma.startswith("##gvf-version"): gvf_version_number = pragma.split(" ")[1] - pragma_gvf_version_number = generate_custom_unstructured_metainfomation_line("gvf-version", gvf_version_number, lines_custom_unstructured) + pragma_gvf_version_number = generate_custom_unstructured_metainformation_line("gvf-version", gvf_version_number, lines_custom_unstructured) pragmas_to_add.append(pragma_gvf_version_number) elif pragma.startswith("##species"): species_value = pragma.split(" ")[1] - pragma_species_value = generate_custom_unstructured_metainfomation_line("species", species_value, lines_custom_unstructured) + pragma_species_value = generate_custom_unstructured_metainformation_line("species", species_value, lines_custom_unstructured) pragmas_to_add.append(pragma_species_value) elif pragma.startswith("##genome-build"): genome_build = pragma.split("genome-build ")[1] - pragma_genome_build = generate_custom_unstructured_metainfomation_line("genome-build", genome_build, lines_custom_unstructured) + pragma_genome_build = generate_custom_unstructured_metainformation_line("genome-build", genome_build, lines_custom_unstructured) pragmas_to_add.append(pragma_genome_build) else: pass @@ -498,35 +703,35 @@ def generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, gvf_non for non_essential_pragma in gvf_non_essential: if non_essential_pragma.startswith("#Study_accession"): study_accession = non_essential_pragma.split(": ")[1] - non_essential_pragma_study_accession = generate_custom_unstructured_metainfomation_line("Study_accession", study_accession, lines_custom_unstructured) + non_essential_pragma_study_accession = generate_custom_unstructured_metainformation_line("Study_accession", study_accession, lines_custom_unstructured) pragmas_to_add.append(non_essential_pragma_study_accession) elif non_essential_pragma.startswith("#Study_type"): study_type = non_essential_pragma.split(": ")[1] - non_essential_pragma_study_type = generate_custom_unstructured_metainfomation_line("Study_type", study_type, lines_custom_unstructured) + non_essential_pragma_study_type = generate_custom_unstructured_metainformation_line("Study_type", study_type, lines_custom_unstructured) pragmas_to_add.append(non_essential_pragma_study_type) elif non_essential_pragma.startswith("#Display_name"): display_name = non_essential_pragma.split(": ")[1] - non_essential_pragma_display_name = generate_custom_unstructured_metainfomation_line("Display_name", display_name,lines_custom_unstructured) + non_essential_pragma_display_name = generate_custom_unstructured_metainformation_line("Display_name", display_name, lines_custom_unstructured) pragmas_to_add.append(non_essential_pragma_display_name) elif non_essential_pragma.startswith("#Publication"): publication = non_essential_pragma.split(": ")[1] - non_essential_pragma_publication = generate_custom_unstructured_metainfomation_line("Publication", publication, lines_custom_unstructured) + non_essential_pragma_publication = generate_custom_unstructured_metainformation_line("Publication", publication, lines_custom_unstructured) pragmas_to_add.append(non_essential_pragma_publication) elif non_essential_pragma.startswith("#Study"): study = non_essential_pragma.split(": ")[1] - non_essential_pragma_study = generate_custom_unstructured_metainfomation_line("Study", study, lines_custom_unstructured) + non_essential_pragma_study = generate_custom_unstructured_metainformation_line("Study", study, lines_custom_unstructured) pragmas_to_add.append(non_essential_pragma_study) elif non_essential_pragma.startswith("#Assembly_name"): assembly_name = non_essential_pragma.split(": ")[1] - non_essential_pragma_assembly_name = generate_custom_unstructured_metainfomation_line("Assembly_name", assembly_name, lines_custom_unstructured) + non_essential_pragma_assembly_name = generate_custom_unstructured_metainformation_line("Assembly_name", assembly_name, lines_custom_unstructured) pragmas_to_add.append(non_essential_pragma_assembly_name) elif non_essential_pragma.startswith("#subject"): subject = non_essential_pragma.split(": ")[1] - non_essential_pragma_subject = generate_custom_unstructured_metainfomation_line("subject", subject, lines_custom_unstructured) + non_essential_pragma_subject = generate_custom_unstructured_metainformation_line("subject", subject, lines_custom_unstructured) pragmas_to_add.append(non_essential_pragma_subject) elif non_essential_pragma.startswith("#sample"): sample_information = non_essential_pragma.split(": ")[1] - non_essential_pragma_sample = generate_custom_unstructured_metainfomation_line("sample", sample_information, + non_essential_pragma_sample = generate_custom_unstructured_metainformation_line("sample", sample_information, lines_custom_unstructured) pragmas_to_add.append(non_essential_pragma_sample) list_of_sample_information = sample_information.split(";") @@ -542,7 +747,19 @@ def generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, gvf_non for pragma in pragmas_to_add: if pragma not in unique_pragmas_to_add: unique_pragmas_to_add.append(pragma) - return unique_pragmas_to_add, sample_names + for alt_line in standard_lines_dictionary["ALT"]: + if alt_line not in unique_alt_lines_to_add: + unique_alt_lines_to_add.append(alt_line) + for info_line in standard_lines_dictionary["INFO"]: + if info_line not in unique_info_lines_to_add: + unique_info_lines_to_add.append(info_line) + for filter_line in standard_lines_dictionary["FILTER"]: + if filter_line not in unique_filter_lines_to_add: + unique_filter_lines_to_add.append(filter_line) + for format_line in standard_lines_dictionary["FORMAT"]: + if format_line not in unique_format_lines_to_add: + unique_format_lines_to_add.append(format_line) + return unique_pragmas_to_add, sample_names, unique_alt_lines_to_add, unique_info_lines_to_add, unique_filter_lines_to_add, unique_format_lines_to_add # step 10 def generate_vcf_header_line(samples): @@ -559,30 +776,24 @@ def generate_vcf_header_line(samples): def gvf_features_to_vcf_objects(gvf_lines_obj_list, dgva_attribute_dict, gvf_attribute_dict, + symbolic_allele_dictionary, assembly_file, lines_custom_structured, - lines_standard_ALT, - lines_standard_INFO, - lines_standard_FILTER, - lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines): + standard_lines_dictionary, + all_possible_lines_dictionary): + """ Creates VCF objects from GVF feature lines and stores the VCF objects. :param gvf_lines_obj_list: list of GVF feature line objects :param gvf_attribute_dict: dictionary of GVF INFO attributes + :param symbolic_allele_dictionary: dictionary of symbolic alleles :param assembly_file: FASTA file to assembly :param dgva_attribute_dict: dictionary af DGVa specific INFO attributes :param lines_custom_structured: list to store custom structured metainformation lines - :param lines_standard_ALT: ALT lines for this VCF file - :param lines_standard_INFO: INFO lines for this VCF file - :param lines_standard_FILTER: FILTER lines for this VCF file - :param lines_standard_FORMAT: FORMAT lines for this VCF file - :param all_possible_ALT_lines: dict of all possible ALT lines - :param all_possible_INFO_lines: dict of all possible INFO lines - :param all_possible_FILTER_lines: dict of all possible FILTER lines - :param all_possible_FORMAT_lines: dict of all possible FORMAT lines + :param lines_standard_alt: ALT lines for this VCF file + :param lines_standard_info: INFO lines for this VCF file + :param lines_standard_filter: FILTER lines for this VCF file + :param lines_standard_format: FORMAT lines for this VCF file + :param all_possible_lines_dictionary: dict of dictionaries. all possible ALT/INFO/FILTER/FORMAT lines :return: vcf_data_lines, list_of_vcf_objects: dictionary of lists and a list of VCF objects """ vcf_data_lines = {} # DICTIONARY OF LISTS @@ -595,16 +806,12 @@ def gvf_features_to_vcf_objects(gvf_lines_obj_list, vcf_object = VcfLine(gvf_featureline, dgva_attribute_dict, gvf_attribute_dict, + symbolic_allele_dictionary, assembly_file, lines_custom_structured, - lines_standard_ALT, - lines_standard_INFO, - lines_standard_FILTER, - lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines) + standard_lines_dictionary, + all_possible_lines_dictionary) + list_of_vcf_objects.append(vcf_object) if vcf_object.key in vcf_data_lines: vcf_data_lines[vcf_object.key].append(vcf_object) @@ -650,6 +857,7 @@ def format_vcf_datalines(list_of_vcf_objects, list_of_sample_names): formatted_vcf_datalines = [] for vcf_obj in list_of_vcf_objects: + vcf_info_string = ";".join(vcf_obj.info) vcf_line = (f"{vcf_obj.chrom}\t" f"{vcf_obj.pos}\t" f"{vcf_obj.id}\t" @@ -657,7 +865,8 @@ def format_vcf_datalines(list_of_vcf_objects, list_of_sample_names): f"{vcf_obj.alt}\t" #TODO: should this always be empty f"{vcf_obj.qual}\t" #TODO: should this always be empty f"{vcf_obj.filter}\t" #TODO: should this always be empty - f"{vcf_obj.info}\t" + #f"{vcf_obj.info}\t" + f"{vcf_info_string}\t" f"{vcf_obj.format}\t" f"{sample_format_values_string}" ) @@ -677,25 +886,43 @@ def main(): if args.assembly: print("The provided assembly file is: ", args.assembly) - all_possible_INFO_lines = generate_all_possible_standard_structured_info_lines() - all_possible_ALT_lines = generate_all_possible_standard_structured_alt_lines() - all_possible_FILTER_lines = generate_all_possible_standard_structured_filter_lines() - all_possible_FORMAT_lines = generate_all_possible_standard_structured_format_lines() + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") + # merging + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } # custom meta-information lines for this VCF file lines_custom_structured = [] lines_custom_unstructured = [] # standard structured meta-information lines for this VCF file - lines_standard_ALT = [] - lines_standard_INFO = [] - lines_standard_FILTER = [] - lines_standard_FORMAT = [] + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary ={ + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(args.gvf_input) dgva_info_attributes_file = os.path.join(etc_folder, 'dgvaINFOattributes.tsv') gvf_info_attributes_file = os.path.join(etc_folder, 'gvfINFOattributes.tsv') + symbolic_allele_file = os.path.join(etc_folder, 'svALTkeys.tsv') + dgva_attribute_dict = read_info_attributes(info_attributes_file=dgva_info_attributes_file) # needed to generate custom strings gvf_attribute_dict = read_info_attributes(info_attributes_file=gvf_info_attributes_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(symbolic_allele_file) + if args.assembly: assembly_file = os.path.abspath(args.assembly) else: @@ -703,24 +930,28 @@ def main(): vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list, dgva_attribute_dict, gvf_attribute_dict, + symbolic_allele_dictionary, assembly_file, lines_custom_structured, - lines_standard_ALT, - lines_standard_INFO, - lines_standard_FILTER, - lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines) + standard_lines_dictionary, + all_possible_lines_dictionary) + print("Writing to the following VCF output: ", args.vcf_output) print("Generating the VCF header and the meta-information lines") with open(args.vcf_output, "w") as vcf_output: - unique_pragmas_to_add, samples = generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, gvf_non_essential, list_of_vcf_objects) + unique_pragmas_to_add, samples, unique_alt_lines_to_add, unique_info_lines_to_add, unique_filter_lines_to_add, unique_format_lines_to_add = generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, gvf_non_essential, list_of_vcf_objects, standard_lines_dictionary) for pragma in unique_pragmas_to_add: vcf_output.write(f"{pragma}\n") + for alt_lines in unique_alt_lines_to_add: + vcf_output.write(f"{alt_lines}\n") + for info_lines in unique_info_lines_to_add: + vcf_output.write(f"{info_lines}\n") + for filter_lines in unique_filter_lines_to_add: + vcf_output.write(f"{filter_lines}\n") + for format_lines in unique_format_lines_to_add: + vcf_output.write(f"{format_lines}\n") header_fields = generate_vcf_header_line(samples) vcf_output.write(f"{header_fields}\n") print("Generating the VCF datalines") diff --git a/convert_gvf_to_vcf/etc/svALTkeys.tsv b/convert_gvf_to_vcf/etc/svALTkeys.tsv index c5b2efc..b1f5b75 100644 --- a/convert_gvf_to_vcf/etc/svALTkeys.tsv +++ b/convert_gvf_to_vcf/etc/svALTkeys.tsv @@ -1,12 +1,23 @@ -KEY LINE -INV ##ALT= -INS ##ALT= -DUP ##ALT= -DEL ##ALT= -CNV ##ALT= -CNV:TR ##ALT= -DUP:TANDEM ##ALT= -DEL:ME ##ALT= -INS:ME ##ALT= -R ##ALT= -M ##ALT= \ No newline at end of file +SymbolicAllele Line SequenceOntologyID Name Description +None None SO:0001784 complex_structural_alteration Complex structural alteration +DEL:ME:LINE1 "##ALT=" SO:0002069 LINE1_deletion LINE1 deletion +None None SO:1000032 delins Deletion-Insertion +DEL:ME:ALU "##ALT=" SO:0002070 Alu_deletion Alu deletion +None None SO:1000005 complex_substitution Complex substitution +DUP:TANDEM "##ALT=" SO:1000173 tandem_duplication Tandem duplication +INS "##ALT=" SO:0001838 novel_sequence_insertion Novel sequence insertion +None None SO:0001059 sequence_alteration Sequence alteration +BND None SO:0002061 intrachromosomal_translocation Intrachromosomal translocation +INS:ME:SVA "##ALT=" SO:0002065 SVA_insertion SVA insertion +INV "##ALT=" SO:1000036 inversion Inversion +INS:ME:LINE1 "##ALT=" SO:0002064 LINE1_insertion LINE1 insertion +INS:ME "##ALT=" SO:0001837 mobile_element_insertion Mobile element insertion +BND None SO:0002060 interchromosomal_translocation Interchromosomal translocation +INS:ME:ALU "##ALT=" SO:0002063 Alu_insertion Alu insertion +CNV "##ALT=" SO:0001019 copy_number_variation Copy number variation +DUP "##ALT=" SO:0001742 copy_number_gain Copy number gain +DUP "##ALT=" SO:1000035 duplication Duplication +INS "##ALT=" SO:0000667 insertion Insertion +DEL "##ALT=" SO:0000159 deletion Deletion +DEL "##ALT=" SO:0001743 copy_number_loss Copy number loss +CNV:TR "##ALT=" SO:0002096 short_tandem_repeat_variation Short tandem repeat variation diff --git a/tests/input/zebrafish.fa b/tests/input/zebrafish.fa index 1e7a59f..a14a218 100644 --- a/tests/input/zebrafish.fa +++ b/tests/input/zebrafish.fa @@ -1,4 +1,4 @@ >chromosome1 -ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTRCGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT >chromosome2 TTTTTTT \ No newline at end of file diff --git a/tests/input/zebrafish.gvf b/tests/input/zebrafish.gvf index 88280fd..58c5ea7 100644 --- a/tests/input/zebrafish.gvf +++ b/tests/input/zebrafish.gvf @@ -18,7 +18,9 @@ #sample: sample_name=Zon9;subject_name=Zon9 #sample: sample_name=JenMale7;subject_name=JenMale7 #testing_unknown_pragma -chromosome1 DGVa copy_number_loss 77 78 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=. +chromosome1 DGVa copy_number_loss 1 2 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,1;End_range=2,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=. +chromosome1 DGVa copy_number_loss 77 78 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=776614,776914;End_range=786127,786427;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=. chromosome1 DGVa copy_number_loss 127 132 . + . ID=12;Name=nssv1406143;Alias=CNV22899;variant_call_so_id=SO:0001743;parent=nsv811095;Start_range=.,1277246;End_range=1320592,.;submitter_variant_call_id=CNV22899;sample_name=Zon9;remap_score=.87402;Variant_seq=. chromosome1 DGVa copy_number_gain 128 132 . + . ID=13;Name=nssv1389474;Alias=CNV6230;variant_call_so_id=SO:0001742;parent=nsv811095;Start_range=.,1284210;End_range=1320592,.;submitter_variant_call_id=CNV6230;sample_name=JenMale7;remap_score=.69625;Variant_seq=. chromosome1 DGVa copy_number_gain 128 132 . + . ID=14;Name=nssv1388955;Alias=CNV5711;variant_call_so_id=SO:0001742;parent=nsv811095;Start_range=.,1284210;End_range=1320592,.;submitter_variant_call_id=CNV5711;sample_name=JenMale6;remap_score=.85344;Variant_seq=.;allele_count=3 +chromosome1 DGVa copy_number_gain 128 129 . + . ID=14;Name=nssv1388955;Alias=CNV5711;variant_call_so_id=SO:0001742;parent=nsv811095;Start_range=.,128;End_range=129,.;submitter_variant_call_id=CNV5711;sample_name=JenMale6;remap_score=.85344;Variant_seq=.;allele_count=3 \ No newline at end of file diff --git a/tests/test_convert_gvf_to_vcf.py b/tests/test_convert_gvf_to_vcf.py index 338f1a7..1f89ec4 100644 --- a/tests/test_convert_gvf_to_vcf.py +++ b/tests/test_convert_gvf_to_vcf.py @@ -1,12 +1,11 @@ import os.path import unittest -from convert_gvf_to_vcf.convertGVFtoVCF import generate_custom_unstructured_metainfomation_line, read_in_gvf_file, \ +from convert_gvf_to_vcf.convertGVFtoVCF import generate_custom_unstructured_metainformation_line, read_in_gvf_file, \ gvf_features_to_vcf_objects, format_vcf_datalines, \ - generate_vcf_metainformation, generate_all_possible_standard_structured_info_lines, \ - generate_all_possible_standard_structured_alt_lines, generate_all_possible_standard_structured_filter_lines, \ - generate_all_possible_standard_structured_format_lines, generate_vcf_header_line, populate_sample_formats, \ - format_sample_values, read_info_attributes + generate_vcf_metainformation, generate_all_possible_standard_structured_lines, \ + generate_vcf_header_line, populate_sample_formats, \ + format_sample_values, read_info_attributes, read_sequence_ontology_symbolic_allele from convert_gvf_to_vcf.convertGVFtoVCF import VcfLine, GvfFeatureline @@ -18,54 +17,277 @@ def setUp(self): # the inputs below are INFO attribute files self.dgva_input_file = os.path.join(self.input_folder_parent, "etc","dgvaINFOattributes.tsv") self.gvf_input_file = os.path.join(self.input_folder_parent, "etc","gvfINFOattributes.tsv") + self.symbolic_allele_file = os.path.join(self.input_folder_parent,"etc", 'svALTkeys.tsv') self.output_file = os.path.join(input_folder, "input", "a.vcf") self.assembly = os.path.join(input_folder, "input", "zebrafish.fa") - + #1 def test_read_in_gvf_file(self): gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) assert len(gvf_pragmas) > 1 assert len(gvf_non_essential) > 1 assert len(gvf_lines_obj_list) > 1 - + #2 def test_read_info_attributes(self): dgva_attribute_dict = read_info_attributes(self.dgva_input_file) assert len(dgva_attribute_dict) > 1 gvf_attribute_dict = read_info_attributes(self.gvf_input_file) assert len(gvf_attribute_dict) > 1 - + #3 + def test_read_sequence_ontology_symbolic_allele(self): + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) + assert len(symbolic_allele_dictionary) > 1 + #4 def test_gvf_features_to_vcf_objects(self): gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) dgva_attribute_dict = read_info_attributes(self.dgva_input_file) gvf_attribute_dict = read_info_attributes(self.gvf_input_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) assembly_file = self.assembly # custom meta-information lines for this VCF file lines_custom_structured = [] lines_custom_unstructured = [] # standard structured meta-information lines for this VCF file - lines_standard_ALT = [] - lines_standard_INFO = [] - lines_standard_FILTER = [] - lines_standard_FORMAT = [] + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary = { + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } # Dictionary for all possible VCF meta-information lines - all_possible_INFO_lines = generate_all_possible_standard_structured_info_lines() - all_possible_ALT_lines = generate_all_possible_standard_structured_alt_lines() - all_possible_FILTER_lines = generate_all_possible_standard_structured_filter_lines() - all_possible_FORMAT_lines = generate_all_possible_standard_structured_format_lines() - + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list, dgva_attribute_dict, gvf_attribute_dict, + symbolic_allele_dictionary, assembly_file, lines_custom_structured, - lines_standard_ALT, lines_standard_INFO, - lines_standard_FILTER, lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines + standard_lines_dictionary, + all_possible_lines_dictionary ) assert len(vcf_data_lines) > 1 assert len(list_of_vcf_objects) > 1 + #5 + def test_add_padded_base(self): + gvf_feature_line = "chromosome1 DGVa copy_number_loss 77 78 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=." + f_list = gvf_feature_line.split("\t") + line_object = GvfFeatureline(f_list[0], f_list[1], f_list[2], f_list[3], f_list[4], f_list[5], f_list[6], f_list[7], f_list[8]) + gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) + dgva_attribute_dict = read_info_attributes(self.dgva_input_file) + gvf_attribute_dict = read_info_attributes(self.gvf_input_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) + assembly_file = self.assembly + # custom meta-information lines for this VCF file + lines_custom_structured = [] + lines_custom_unstructured = [] + # standard structured meta-information lines for this VCF file + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary = { + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } + + # Dictionary for all possible VCF meta-information lines + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") + + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } + + v = VcfLine(line_object, + dgva_attribute_dict, + gvf_attribute_dict, + symbolic_allele_dictionary, + assembly_file, + lines_custom_structured, + standard_lines_dictionary, + all_possible_lines_dictionary) + + test_ref = "A" + test_alt = "T" + padded_base, pos, ref, alt = v.add_padded_base(test_ref, test_alt, True) + assert padded_base is not None + assert pos is not None + assert ref is not None + assert alt is not None + #6 + def test_build_iupac_ambiguity_code(self): + gvf_feature_line = "chromosome1 DGVa copy_number_loss 77 78 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=." + f_list = gvf_feature_line.split("\t") + line_object = GvfFeatureline(f_list[0], f_list[1], f_list[2], f_list[3], f_list[4], f_list[5], f_list[6], + f_list[7], f_list[8]) + gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) + dgva_attribute_dict = read_info_attributes(self.dgva_input_file) + gvf_attribute_dict = read_info_attributes(self.gvf_input_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) + assembly_file = self.assembly + # custom meta-information lines for this VCF file + lines_custom_structured = [] + lines_custom_unstructured = [] + # standard structured meta-information lines for this VCF file + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary = { + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } + # Dictionary for all possible VCF meta-information lines + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") + + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } + v = VcfLine(line_object, + dgva_attribute_dict, + gvf_attribute_dict, + symbolic_allele_dictionary, + assembly_file, + lines_custom_structured, + standard_lines_dictionary, + all_possible_lines_dictionary) + + my_ipuac_dictionary = v.build_iupac_ambiguity_code() + assert len(my_ipuac_dictionary) > 0 + #7 + def test_convert_iupac_ambiguity_code(self): + gvf_feature_line = "chromosome1 DGVa copy_number_loss 77 78 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=." + f_list = gvf_feature_line.split("\t") + line_object = GvfFeatureline(f_list[0], f_list[1], f_list[2], f_list[3], f_list[4], f_list[5], f_list[6], + f_list[7], f_list[8]) + gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) + dgva_attribute_dict = read_info_attributes(self.dgva_input_file) + gvf_attribute_dict = read_info_attributes(self.gvf_input_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) + assembly_file = self.assembly + # custom meta-information lines for this VCF file + lines_custom_structured = [] + lines_custom_unstructured = [] + # standard structured meta-information lines for this VCF file + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary = { + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } + # Dictionary for all possible VCF meta-information lines + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") + + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } + v = VcfLine(line_object, + dgva_attribute_dict, + gvf_attribute_dict, + symbolic_allele_dictionary, + assembly_file, + lines_custom_structured, + standard_lines_dictionary, + all_possible_lines_dictionary) + + + my_ipuac_dictionary = v.build_iupac_ambiguity_code() + ref_to_convert = "TAGD" + converted_ref_allele = v.convert_iupac_ambiguity_code(my_ipuac_dictionary, ref_to_convert) + assert converted_ref_allele not in ["R", "Y", "M", "K", "S", "D", "W", "H", "B", "V", "D", "N"] + #8 + def test_check_ref(self): + gvf_feature_line = "chromosome1 DGVa copy_number_loss 77 78 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=." + f_list = gvf_feature_line.split("\t") + line_object = GvfFeatureline(f_list[0], f_list[1], f_list[2], f_list[3], f_list[4], f_list[5], f_list[6], + f_list[7], f_list[8]) + gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) + dgva_attribute_dict = read_info_attributes(self.dgva_input_file) + gvf_attribute_dict = read_info_attributes(self.gvf_input_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) + assembly_file = self.assembly + # custom meta-information lines for this VCF file + lines_custom_structured = [] + lines_custom_unstructured = [] + # standard structured meta-information lines for this VCF file + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary = { + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } + # Dictionary for all possible VCF meta-information lines + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") + + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } + v = VcfLine(line_object, + dgva_attribute_dict, + gvf_attribute_dict, + symbolic_allele_dictionary, + assembly_file, + lines_custom_structured, + standard_lines_dictionary, + all_possible_lines_dictionary) + + reference_allele_to_check = "TGCR" + new_ref = v.check_ref(reference_allele_to_check) + iupac_code = ["R", "Y", "M", "K", "S", "D", "W", "H", "B", "V", "D", "N"] + assert all(code not in new_ref for code in iupac_code) + #9 def test_get_ref(self): gvf_feature_line = "chromosome1 DGVa copy_number_loss 77 78 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=.,776614;End_range=786127,.;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=." f_list = gvf_feature_line.split("\t") @@ -73,292 +295,379 @@ def test_get_ref(self): gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) dgva_attribute_dict = read_info_attributes(self.dgva_input_file) gvf_attribute_dict = read_info_attributes(self.gvf_input_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) assembly_file = self.assembly # custom meta-information lines for this VCF file lines_custom_structured = [] lines_custom_unstructured = [] # standard structured meta-information lines for this VCF file - lines_standard_ALT = [] - lines_standard_INFO = [] - lines_standard_FILTER = [] - lines_standard_FORMAT = [] + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary = { + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } # Dictionary for all possible VCF meta-information lines - all_possible_INFO_lines = generate_all_possible_standard_structured_info_lines() - all_possible_ALT_lines = generate_all_possible_standard_structured_alt_lines() - all_possible_FILTER_lines = generate_all_possible_standard_structured_filter_lines() - all_possible_FORMAT_lines = generate_all_possible_standard_structured_format_lines() + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } v = VcfLine(line_object, dgva_attribute_dict, gvf_attribute_dict, + symbolic_allele_dictionary, assembly_file, lines_custom_structured, - lines_standard_ALT, - lines_standard_INFO, - lines_standard_FILTER, - lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines) + standard_lines_dictionary, + all_possible_lines_dictionary) reference_allele = v.get_ref() assert len(reference_allele) != 0 - - def test_generate_vcf_metainformation(self): + #10 + def test_generate_symbolic_allele(self): + gvf_feature_line = "chromosome1 DGVa copy_number_loss 77 81 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=77,78;End_range=80,81;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=." + f_list = gvf_feature_line.split("\t") + line_object = GvfFeatureline(f_list[0], f_list[1], f_list[2], f_list[3], f_list[4], f_list[5], f_list[6], f_list[7], f_list[8]) gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) dgva_attribute_dict = read_info_attributes(self.dgva_input_file) gvf_attribute_dict = read_info_attributes(self.gvf_input_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) + assembly_file = self.assembly # custom meta-information lines for this VCF file lines_custom_structured = [] lines_custom_unstructured = [] # standard structured meta-information lines for this VCF file - lines_standard_ALT = [] - lines_standard_INFO = [] - lines_standard_FILTER = [] - lines_standard_FORMAT = [] + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary = { + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } # Dictionary for all possible VCF meta-information lines - all_possible_INFO_lines = generate_all_possible_standard_structured_info_lines() - all_possible_ALT_lines = generate_all_possible_standard_structured_alt_lines() - all_possible_FILTER_lines = generate_all_possible_standard_structured_filter_lines() - all_possible_FORMAT_lines = generate_all_possible_standard_structured_format_lines() - - vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list, - dgva_attribute_dict, - gvf_attribute_dict, - self.assembly, - lines_custom_structured, - lines_standard_ALT, - lines_standard_INFO, - lines_standard_FILTER, - lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines - ) - - lines_custom_unstructured = ['##fileformat=VCFv4.4','##fileDate=20150715', '##source=DGVa','##source=DGVa', '##genome-build=NCBI GRCz10'] - unique_pragmas_to_add, samples = generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, - gvf_non_essential, list_of_vcf_objects) - assert len(unique_pragmas_to_add) > 1 and len(samples) > 1 + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") - def test_generate_vcf_metainformation(self): + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } + v = VcfLine(line_object, + dgva_attribute_dict, + gvf_attribute_dict, + symbolic_allele_dictionary, + assembly_file, + lines_custom_structured, + standard_lines_dictionary, + all_possible_lines_dictionary) + output_symbolic_allele, self.info, output_lines_standard_ALT, output_lines_standard_INFO = v.generate_symbolic_allele(standard_lines_dictionary, all_possible_lines_dictionary) + assert len(output_symbolic_allele) > 1 + #11 + def test_get_alt(self): + gvf_feature_line = "chromosome1 DGVa copy_number_loss 77 81 . + . ID=1;Name=nssv1412199;Alias=CNV28955;variant_call_so_id=SO:0001743;parent=nsv811094;Start_range=77,78;End_range=80,81;submitter_variant_call_id=CNV28955;sample_name=Wilds2-3;remap_score=.98857;Variant_seq=." + f_list = gvf_feature_line.split("\t") + line_object = GvfFeatureline(f_list[0], f_list[1], f_list[2], f_list[3], f_list[4], f_list[5], f_list[6], f_list[7], f_list[8]) gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) dgva_attribute_dict = read_info_attributes(self.dgva_input_file) gvf_attribute_dict = read_info_attributes(self.gvf_input_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) assembly_file = self.assembly # custom meta-information lines for this VCF file lines_custom_structured = [] lines_custom_unstructured = [] # standard structured meta-information lines for this VCF file - lines_standard_ALT = [] - lines_standard_INFO = [] - lines_standard_FILTER = [] - lines_standard_FORMAT = [] + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary = { + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } # Dictionary for all possible VCF meta-information lines - all_possible_INFO_lines = generate_all_possible_standard_structured_info_lines() - all_possible_ALT_lines = generate_all_possible_standard_structured_alt_lines() - all_possible_FILTER_lines = generate_all_possible_standard_structured_filter_lines() - all_possible_FORMAT_lines = generate_all_possible_standard_structured_format_lines() + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } + v = VcfLine(line_object, + dgva_attribute_dict, + gvf_attribute_dict, + symbolic_allele_dictionary, + assembly_file, + lines_custom_structured, + standard_lines_dictionary, + all_possible_lines_dictionary) + alt_allele = v.get_alt(standard_lines_dictionary, all_possible_lines_dictionary) + assert len(alt_allele) > 0 + #12 + def test_generate_vcf_metainformation(self): + gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) + dgva_attribute_dict = read_info_attributes(self.dgva_input_file) + gvf_attribute_dict = read_info_attributes(self.gvf_input_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) + # custom meta-information lines for this VCF file + lines_custom_structured = [] + lines_custom_unstructured = [] + # standard structured meta-information lines for this VCF file + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary = { + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } + # Dictionary for all possible VCF meta-information lines + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") + # merging + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list, dgva_attribute_dict, gvf_attribute_dict, - assembly_file, + symbolic_allele_dictionary, + self.assembly, lines_custom_structured, - lines_standard_ALT, - lines_standard_INFO, - lines_standard_FILTER, - lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines + standard_lines_dictionary, + all_possible_lines_dictionary ) lines_custom_unstructured = ['##fileformat=VCFv4.4','##fileDate=20150715', '##source=DGVa','##source=DGVa', '##genome-build=NCBI GRCz10'] - unique_pragmas_to_add, samples = generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, - gvf_non_essential, list_of_vcf_objects) - assert len(unique_pragmas_to_add) > 1 and len(samples) > 1 - + unique_pragmas_to_add, sample_names, unique_alt_lines_to_add, unique_info_lines_to_add, unique_filter_lines_to_add, unique_format_lines_to_add = generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, + gvf_non_essential, list_of_vcf_objects, standard_lines_dictionary) + assert len(unique_pragmas_to_add) > 1 and len(sample_names) > 1 + #13 def test_generate_vcf_header_line(self): gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) dgva_attribute_dict = read_info_attributes(self.dgva_input_file) gvf_attribute_dict = read_info_attributes(self.gvf_input_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) # custom meta-information lines for this VCF file lines_custom_structured = [] lines_custom_unstructured = [] # standard structured meta-information lines for this VCF file - lines_standard_ALT = [] - lines_standard_INFO = [] - lines_standard_FILTER = [] - lines_standard_FORMAT = [] + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary = { + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } # Dictionary for all possible VCF meta-information lines - all_possible_INFO_lines = generate_all_possible_standard_structured_info_lines() - all_possible_ALT_lines = generate_all_possible_standard_structured_alt_lines() - all_possible_FILTER_lines = generate_all_possible_standard_structured_filter_lines() - all_possible_FORMAT_lines = generate_all_possible_standard_structured_format_lines() - + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list, dgva_attribute_dict, gvf_attribute_dict, + symbolic_allele_dictionary, self.assembly, lines_custom_structured, - lines_standard_ALT, - lines_standard_INFO, - lines_standard_FILTER, - lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines + standard_lines_dictionary, + all_possible_lines_dictionary ) lines_custom_unstructured = ['##fileformat=VCFv4.4','##fileDate=20150715', '##source=DGVa','##source=DGVa', '##genome-build=NCBI GRCz10'] - unique_pragmas_to_add, samples = generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, - gvf_non_essential, list_of_vcf_objects) + unique_pragmas_to_add, samples, unique_alt_lines_to_add, unique_info_lines_to_add, unique_filter_lines_to_add, unique_format_lines_to_add = generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, gvf_non_essential, list_of_vcf_objects, standard_lines_dictionary) header_fields = generate_vcf_header_line(samples) assert len(header_fields) > 1 - + #14 def test_populate_sample_formats(self): gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) dgva_attribute_dict = read_info_attributes(self.dgva_input_file) gvf_attribute_dict = read_info_attributes(self.gvf_input_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) # custom meta-information lines for this VCF file lines_custom_structured = [] lines_custom_unstructured = [] # standard structured meta-information lines for this VCF file - lines_standard_ALT = [] - lines_standard_INFO = [] - lines_standard_FILTER = [] - lines_standard_FORMAT = [] + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary = { + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } # Dictionary for all possible VCF meta-information lines - all_possible_INFO_lines = generate_all_possible_standard_structured_info_lines() - all_possible_ALT_lines = generate_all_possible_standard_structured_alt_lines() - all_possible_FILTER_lines = generate_all_possible_standard_structured_filter_lines() - all_possible_FORMAT_lines = generate_all_possible_standard_structured_format_lines() + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") + + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list, dgva_attribute_dict, gvf_attribute_dict, + symbolic_allele_dictionary, self.assembly, lines_custom_structured, - lines_standard_ALT, - lines_standard_INFO, - lines_standard_FILTER, - lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines + standard_lines_dictionary, + all_possible_lines_dictionary ) lines_custom_unstructured = ['##fileformat=VCFv4.4','##fileDate=20150715', '##source=DGVa','##source=DGVa', '##genome-build=NCBI GRCz10'] - unique_pragmas_to_add, samples = generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, - gvf_non_essential, list_of_vcf_objects) + unique_pragmas_to_add, samples, unique_alt_lines_to_add, unique_info_lines_to_add, unique_filter_lines_to_add, unique_format_lines_to_add = generate_vcf_metainformation( + lines_custom_unstructured, gvf_pragmas, gvf_non_essential, list_of_vcf_objects, standard_lines_dictionary) sample_name_format_value = populate_sample_formats(samples) assert len(sample_name_format_value) > 1 - - + #15 def test_format_sample_values(self): gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) dgva_attribute_dict = read_info_attributes(self.dgva_input_file) gvf_attribute_dict = read_info_attributes(self.gvf_input_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) # custom meta-information lines for this VCF file lines_custom_structured = [] lines_custom_unstructured = [] # standard structured meta-information lines for this VCF file - lines_standard_ALT = [] - lines_standard_INFO = [] - lines_standard_FILTER = [] - lines_standard_FORMAT = [] + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary = { + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } + # Dictionary for all possible VCF meta-information lines - all_possible_INFO_lines = generate_all_possible_standard_structured_info_lines() - all_possible_ALT_lines = generate_all_possible_standard_structured_alt_lines() - all_possible_FILTER_lines = generate_all_possible_standard_structured_filter_lines() - all_possible_FORMAT_lines = generate_all_possible_standard_structured_format_lines() + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list, dgva_attribute_dict, gvf_attribute_dict, + symbolic_allele_dictionary, self.assembly, lines_custom_structured, - lines_standard_ALT, - lines_standard_INFO, - lines_standard_FILTER, - lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines - ) + standard_lines_dictionary, + all_possible_lines_dictionary) + lines_custom_unstructured = ['##fileformat=VCFv4.4','##fileDate=20150715', '##source=DGVa','##source=DGVa', '##genome-build=NCBI GRCz10'] - unique_pragmas_to_add, samples = generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, - gvf_non_essential, list_of_vcf_objects) + unique_pragmas_to_add, samples, unique_alt_lines_to_add, unique_info_lines_to_add, unique_filter_lines_to_add, unique_format_lines_to_add = generate_vcf_metainformation( + lines_custom_unstructured, gvf_pragmas, gvf_non_essential, list_of_vcf_objects, standard_lines_dictionary) sample_name_format_value = populate_sample_formats(samples) sample_format_values_string = format_sample_values(sample_name_format_value) assert isinstance(sample_format_values_string, str) - + #16 def test_format_vcf_datalines(self): gvf_pragmas, gvf_non_essential, gvf_lines_obj_list = read_in_gvf_file(self.input_file) dgva_attribute_dict = read_info_attributes(self.dgva_input_file) gvf_attribute_dict = read_info_attributes(self.gvf_input_file) + symbolic_allele_dictionary = read_sequence_ontology_symbolic_allele(self.symbolic_allele_file) assembly_file = self.assembly - # custom meta-information lines for this VCF file + # custom meta-information lines for this VCF fileT lines_custom_structured = [] lines_custom_unstructured = [] # standard structured meta-information lines for this VCF file - lines_standard_ALT = [] - lines_standard_INFO = [] - lines_standard_FILTER = [] - lines_standard_FORMAT = [] + lines_standard_alt = [] + lines_standard_info = [] + lines_standard_filter = [] + lines_standard_format = [] + # merging + standard_lines_dictionary = { + "ALT": lines_standard_alt, + "INFO": lines_standard_info, + "FILTER": lines_standard_filter, + "FORMAT": lines_standard_format, + } # Dictionary for all possible VCF meta-information lines - all_possible_INFO_lines = generate_all_possible_standard_structured_info_lines() - all_possible_ALT_lines = generate_all_possible_standard_structured_alt_lines() - all_possible_FILTER_lines = generate_all_possible_standard_structured_filter_lines() - all_possible_FORMAT_lines = generate_all_possible_standard_structured_format_lines() - + all_possible_alt_lines = generate_all_possible_standard_structured_lines("ALT") + all_possible_info_lines = generate_all_possible_standard_structured_lines("INFO") + all_possible_filter_lines = generate_all_possible_standard_structured_lines("FILTER") + all_possible_format_lines = generate_all_possible_standard_structured_lines("FORMAT") + all_possible_lines_dictionary = { + "ALT": all_possible_alt_lines, + "INFO": all_possible_info_lines, + "FILTER": all_possible_filter_lines, + "FORMAT": all_possible_format_lines, + } vcf_data_lines, list_of_vcf_objects = gvf_features_to_vcf_objects(gvf_lines_obj_list, dgva_attribute_dict, gvf_attribute_dict, + symbolic_allele_dictionary, assembly_file, lines_custom_structured, - lines_standard_ALT, - lines_standard_INFO, - lines_standard_FILTER, - lines_standard_FORMAT, - all_possible_ALT_lines, - all_possible_INFO_lines, - all_possible_FILTER_lines, - all_possible_FORMAT_lines - ) + standard_lines_dictionary, + all_possible_lines_dictionary) lines_custom_unstructured = ['##fileformat=VCFv4.4','##fileDate=20150715', '##source=DGVa','##source=DGVa', '##genome-build=NCBI GRCz10'] - unique_pragmas_to_add, samples = generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, - gvf_non_essential, list_of_vcf_objects) + unique_pragmas_to_add, samples, unique_alt_lines_to_add, unique_info_lines_to_add, unique_filter_lines_to_add, unique_format_lines_to_add = generate_vcf_metainformation(lines_custom_unstructured, gvf_pragmas, gvf_non_essential, list_of_vcf_objects, standard_lines_dictionary) formatted_vcf_datalines = format_vcf_datalines(list_of_vcf_objects, samples) assert len(formatted_vcf_datalines) > 1 - + #17 def test_generate_custom_unstructured_metainfomation_line(self): lines_custom_unstructured = ['##fileformat=VCFv4.4', '##fileDate=20150715', '##source=DGVa', '##source=DGVa', '##genome-build=NCBI GRCz10'] - formatted_string = generate_custom_unstructured_metainfomation_line("test_string_key", "test_string_value", lines_custom_unstructured) + formatted_string = generate_custom_unstructured_metainformation_line("test_string_key", "test_string_value", lines_custom_unstructured) assert formatted_string == "##test_string_key=test_string_value" - def test_generate_all_possible_standard_structured_info_lines(self): - all_possible_INFO_lines = generate_all_possible_standard_structured_info_lines() - assert len(all_possible_INFO_lines) > 0 - - def test_generate_all_possible_standard_structured_alt_lines(self): - all_possible_ALT_lines = generate_all_possible_standard_structured_alt_lines() - assert len(all_possible_ALT_lines) > 0 - - #TODO: uncomment this test once the function generate_all_possible_standard_structured_filter_lines has been filled in - - # def test_generate_all_possible_standard_structured_filter_lines(self): - # all_possible_FILTER_lines = generate_all_possible_standard_structured_filter_lines() - # assert len(all_possible_FILTER_lines) > 0 - - def test_generate_all_possible_standard_structured_format_lines(self): - all_possible_FORMAT_lines = generate_all_possible_standard_structured_format_lines() - assert len(all_possible_FORMAT_lines) > 0 - if __name__ == '__main__': unittest.main()