Skip to content

Commit 8b255f0

Browse files
authored
Statistics (#29)
gather counts for GVF and VCF files into a report
1 parent e08d2b5 commit 8b255f0

File tree

9 files changed

+20141
-28
lines changed

9 files changed

+20141
-28
lines changed

convert_gvf_to_vcf/assistingconverter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def convert_gvf_attributes_to_vcf_values(column9_of_gvf,
6969
gvf_attribute_dictionary = get_gvf_attributes(column9_of_gvf)
7070
vcf_info_values = {} # key is info field value; value is value
7171
vcf_format_values = {} # key is format field value; value is value
72-
catching_for_review = []
72+
dropped_gvf_attributes = []
7373
for attrib_key, attrib_value in gvf_attribute_dictionary.items():
7474
if attrib_key in mapping_attribute_dict:
7575
field_values = mapping_attribute_dict[attrib_key]
@@ -96,5 +96,5 @@ def convert_gvf_attributes_to_vcf_values(column9_of_gvf,
9696
logger.warning(f"Unsupported Field: {field}")
9797
else:
9898
logger.info(f"catching attribute keys for review at a later date {attrib_key} {attrib_value}")
99-
catching_for_review.append(attrib_key)
99+
dropped_gvf_attributes.append(attrib_key)
100100
return gvf_attribute_dictionary, vcf_info_values, vcf_format_values
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
"""
2+
The purpose of this file is to calculate the (conversion) statistics of the GVF and VCF file.
3+
"""
4+
import os
5+
from datetime import datetime
6+
import hashlib
7+
from collections import Counter
8+
9+
class FileStatistics:
10+
"""
11+
The responsibility of this class is to accumulate counts and calculate statistics of a GVF to VCF file conversion.
12+
"""
13+
def __init__(self, gvf_file_path, gvf_pragmas, samples):
14+
self.gvf_file_path = gvf_file_path
15+
# file name
16+
self.gvf_file_name = os.path.basename(self.gvf_file_path)
17+
self.gvf_extension = os.path.splitext(self.gvf_file_path)[1]
18+
# file metadata
19+
self.gvf_metadata_stats = os.stat(self.gvf_file_path)
20+
self.gvf_file_size = self.gvf_metadata_stats.st_size
21+
gvf_last_modified_timestamp_raw = self.gvf_metadata_stats.st_mtime
22+
self.gvf_last_modified_timestamp = datetime.fromtimestamp(gvf_last_modified_timestamp_raw)
23+
self.gvf_version = self.find_version(gvf_pragmas)
24+
self.gvf_md5 = self.get_file_md5(self.gvf_file_path)
25+
# biological samples
26+
self.sample_number = len(samples) #
27+
self.vcf_sample_number_count = Counter()
28+
#TODO: add missing samples in VCF - awaiting bug fix
29+
#TODO: number of times a sample is missing in the merged vcf line, count per sample
30+
# chromsomes
31+
self.gvf_chromosome_count = Counter()
32+
self.vcf_chromosome_count = Counter()
33+
# # variants
34+
self.gvf_feature_line_count = 0
35+
self.gvf_sv_count = Counter()
36+
self.vcf_data_line_count = 0 # used in property
37+
self.vcf_number_of_merges = 0
38+
self.vcf_alt_alleles_count = Counter() #used in property
39+
# # attribute mapping
40+
self.vcf_info_counter = Counter() # used in property
41+
self.vcf_format_counter = Counter()
42+
self.vcf_variant_region_SOID = Counter()
43+
self.vcf_variant_call_SOID = Counter()
44+
45+
@property
46+
def vcf_alt_missing(self):
47+
return self.vcf_alt_alleles_count['.']
48+
49+
@property
50+
def vcf_imprecise_variants(self):
51+
return self.vcf_info_counter["IMPRECISE"]
52+
53+
@property
54+
def vcf_precise_variants(self):
55+
return self.vcf_data_line_count - self.vcf_imprecise_variants
56+
57+
@staticmethod
58+
def get_file_md5(path_to_file):
59+
hash_md5 = hashlib.md5()
60+
with open(path_to_file, "rb") as f:
61+
# Read file in 4KB chunks until you get to an empty byte string
62+
for chunk in iter(lambda: f.read(4096), b""):
63+
hash_md5.update(chunk)
64+
md5 = hash_md5.hexdigest()
65+
return md5
66+
67+
@staticmethod
68+
def find_version(header_list):
69+
""" Finds the gvf version in the header.
70+
:params: header_list
71+
:return: gvf version or if not found None
72+
"""
73+
for line in header_list:
74+
if "##gvf-version" in line:
75+
try:
76+
return line.split('=')[1].strip()
77+
except IndexError:
78+
return f"GVF version unknown: {line}"
79+
return None
80+
81+
def __str__(self):
82+
summary_string = (f"======GVF======\n"
83+
f"File name = {self.gvf_file_name}\n"
84+
f"File path = {self.gvf_file_path}\n"
85+
f"File extension = {self.gvf_extension}\n"
86+
f"File size = {self.gvf_file_size} bytes\n"
87+
f"Last mod Timestamp = {self.gvf_last_modified_timestamp}\n"
88+
f"Version = {self.gvf_version}\n"
89+
f"md5 string = {self.gvf_md5}\n"
90+
f"gvf chrom = {self.gvf_chromosome_count}\n"
91+
f"vcf chrom = {self.vcf_chromosome_count}\n"
92+
f"gvf_feature_line_count = {self.gvf_feature_line_count}\n"
93+
f"vcf_data_line_count = {self.vcf_data_line_count}\n"
94+
f"vcf_number_of_merges = {self.vcf_number_of_merges}\n"
95+
f"vcf_alt_allele_count = {self.vcf_alt_alleles_count}\n"
96+
f"vcf_info_counter = {self.vcf_info_counter}\n"
97+
f"vcf_format_counter = {self.vcf_format_counter}\n"
98+
f"vcf_sample_count = {self.vcf_sample_number_count}\n"
99+
f"vcf_alt_missing = {self.vcf_alt_missing}\n"
100+
f"vcf_imprecise_variants = {self.vcf_imprecise_variants}\n"
101+
f"vcf_precise_variants = {self.vcf_precise_variants}\n"
102+
f"=====\n"
103+
)
104+
return summary_string
105+
106+
def print_report(self, file_to_write):
107+
key_width = 30
108+
text_report = f'''
109+
GVF
110+
{"File name":<{key_width}}\t{self.gvf_file_name:>}
111+
{"File path":<{key_width}}\t{self.gvf_file_path:>}
112+
{"File extension":<{key_width}}\t{self.gvf_extension:>}
113+
{"File size":<{key_width}}\t{self.gvf_file_size:>}
114+
{"Last modified":<{key_width}}\t{self.gvf_last_modified_timestamp}
115+
{"Version":<{key_width}}\t{self.gvf_version:>}
116+
{"md5":<{key_width}}\t{self.gvf_md5:>}
117+
118+
{"COUNTS":^{key_width}}
119+
{"GVF Chromosome counts:":<}\n\t\t\t\t{self.gvf_chromosome_count}
120+
{"Number of GVF feature lines:":<{key_width}}\t{self.gvf_feature_line_count:>}
121+
{"Number of samples:":<{key_width}}\t{self.sample_number}
122+
{"Number of structural variants seen:":<{key_width}}\n\t\t\t\t{self.gvf_sv_count}
123+
VCF
124+
{"COUNTS":^{key_width}}
125+
{"VCF Chromosome counts:":<{key_width}}\n\t\t\t\t{self.vcf_chromosome_count}
126+
{"VCF ALT allele counts:":<{key_width}}\n\t\t\t\t{self.vcf_alt_alleles_count}
127+
{"Number of VCF data lines:":<{key_width}}\t{self.vcf_data_line_count:>}
128+
{"Number of VCF data line merges:":<{key_width}}\t{self.vcf_number_of_merges:>}
129+
{"Number of times each VCF sample has been seen:":<{key_width}}\n\t\t\t\t{self.vcf_sample_number_count}
130+
{"Number of times INFO keys seen"}\n\t\t\t\t{self.vcf_info_counter}
131+
{"VCF variant region Sequence Ontology ID counts"}\n\t\t\t\t{self.vcf_variant_region_SOID}
132+
{"VCF variant call Sequence Ontology ID counts"}\n\t\t\t\t{self.vcf_variant_call_SOID}
133+
134+
'''
135+
#
136+
with open(file_to_write, "w") as stats_file:
137+
stats_file.write(text_report)

convert_gvf_to_vcf/convertGVFtoVCF.py

Lines changed: 62 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
import argparse
22
import os
3+
from collections import Counter
34
from ebi_eva_common_pyutils.logger import logging_config as log_cfg
5+
6+
from convert_gvf_to_vcf.conversionstatistics import FileStatistics
47
from convert_gvf_to_vcf.lookup import Lookup
58
from convert_gvf_to_vcf.utils import read_in_gvf_header, read_in_gvf_data
69
from convert_gvf_to_vcf.vcfline import VcfLineBuilder
710

11+
812
logger = log_cfg.get_logger(__name__)
913

1014
# setting up paths to useful directories
@@ -236,7 +240,7 @@ def get_pragma_tokens(pragma_value, first_delimiter, second_delimiter):
236240
return pragma_tokens
237241

238242

239-
def write_header(vcf_output, pragmas_for_vcf, header_lines_per_type, is_missing_format_value, samples):
243+
def write_header(vcf_output, pragmas_for_vcf, header_lines_per_type, header_fields, samples):
240244
logger.info(f"Total number of samples in this VCF: {len(samples)}")
241245
vcf_header_file = vcf_output + '_header'
242246
with open(vcf_header_file, "w") as vcf_header_output:
@@ -250,7 +254,6 @@ def write_header(vcf_output, pragmas_for_vcf, header_lines_per_type, is_missing_
250254

251255
# Part 2 of VCF file: Write the VCF header line.
252256
# Write the header.
253-
header_fields = generate_vcf_header_line(is_missing_format=is_missing_format_value, samples=samples)
254257
vcf_header_output.write(f"{header_fields}\n")
255258
return vcf_header_file
256259

@@ -259,14 +262,13 @@ def convert(gvf_input, vcf_output, assembly):
259262
logger.info("Running the GVF to VCF converter")
260263
logger.info(f"The provided input file is: {gvf_input}")
261264
logger.info(f"The provided output file is: {vcf_output}")
265+
262266
if assembly:
263267
logger.info(f"The provided assembly file is: {assembly}")
264268
assembly_file = os.path.abspath(assembly)
265269
assert os.path.isfile(assembly_file), "Assembly file does not exist"
270+
assert os.path.isfile(gvf_input), "GVF file does not exist"
266271

267-
# Read input file and separate out its components
268-
logger.info(f"Reading in the following GVF header from {gvf_input}")
269-
gvf_pragmas, gvf_pragma_comments = read_in_gvf_header(gvf_input)
270272
# Creating lookup object to store important dictionaries and log what has been stored.
271273
reference_lookup = Lookup(assembly_file)
272274
logger.info("Creating the reference lookup object.")
@@ -275,51 +277,80 @@ def convert(gvf_input, vcf_output, assembly):
275277
logger.info(f"Storing the assembly file: {assembly_file}")
276278
logger.info("Storing the IUPAC ambiguity dictionary.")
277279

280+
# Read input file and separate out its components
281+
logger.info(f"Reading in the following GVF header from {gvf_input}")
282+
gvf_pragmas, gvf_pragma_comments = read_in_gvf_header(gvf_input)
283+
278284
# Preparation work:
279285
# Store the VCF metainformation and ensure preservation of important GVF data.
280286
# This information will be useful when creating the VCF header.
281-
# TODO: refactor function generate_vcf_metainfo
282287
(
283288
pragmas_for_vcf,
284289
samples
285290
) = convert_gvf_pragmas_for_vcf_header(gvf_pragmas, gvf_pragma_comments, reference_lookup)
286-
287-
# TODO: place the all_header_lines_per_type_dict into the reference_lookup.
291+
report = FileStatistics(gvf_file_path=gvf_input, gvf_pragmas=gvf_pragmas, samples=samples)
288292

289293
# Create data structure to store all possible outcomes for header lines (for fields ALT, INFO, FILTER, FORMAT)
290294
all_header_lines_per_type_dict = {
291295
htype: generate_vcf_header_structured_lines(htype, reference_lookup.mapping_attribute_dict) for htype in
292296
["ALT", "INFO", "FILTER", "FORMAT"]
293297
}
298+
294299
vcf_builder = VcfLineBuilder(all_header_lines_per_type_dict, reference_lookup, samples)
295300
is_missing_format_value = True
301+
# VCF dataline generation
296302
# Convert each feature line in the GVF file to a VCF object (stores all the data for a line in the VCF file).
297303
# NOTE: Main Logic lives here.
298304
vcf_data_file = vcf_output + '_data_lines'
299305
with open(vcf_data_file, "w") as open_data_lines:
300306
logger.info("Generating the VCF datalines")
301307
previous_vcf_line = None
302-
for gvf_lines_obj in read_in_gvf_data(gvf_input):
303-
current_vcf_line = vcf_builder.build_vcf_line(gvf_lines_obj)
308+
for gvf_entry in read_in_gvf_data(gvf_input):
309+
# record GVF counts
310+
report.gvf_feature_line_count += 1
311+
report.gvf_chromosome_count[gvf_entry.seqid] += 1
312+
report.gvf_sv_count.update([gvf_entry.feature_type])
313+
# create the VCF line object
314+
current_vcf_line = vcf_builder.build_vcf_line(gvf_entry)
304315
# is_missing_format_value will only be true if all the format field are missing.
305316
is_missing_format_value = is_missing_format_value and current_vcf_line.format_keys == ['.']
306317
# Each GVF feature has been converted to a VCF object so begin comparing and merging the VCF objects.
307318
if previous_vcf_line:
308319
if current_vcf_line == previous_vcf_line:
309320
current_vcf_line.merge(previous_vcf_line, list_of_sample_names=samples)
321+
report.vcf_number_of_merges += 1
310322
else:
311-
open_data_lines.write(str(previous_vcf_line) + "\n")
323+
# TODO: address this in next bug fix
324+
assert current_vcf_line > previous_vcf_line, f"File not sorted.\ncurrent_vcf_line.pos {current_vcf_line.pos} is smaller than previous_vcf_line.pos {previous_vcf_line.pos}. See the following line:\n{str(gvf_entry)}"
325+
record_vcf_entry(open_data_lines, previous_vcf_line, report)
312326
previous_vcf_line = current_vcf_line
327+
# Process the final previous_vcf_line
313328
if previous_vcf_line:
314-
open_data_lines.write(str(previous_vcf_line) + "\n")
329+
record_vcf_entry(open_data_lines, previous_vcf_line, report)
315330
else:
316331
logger.warning("No feature lines were found for this GVF file.")
317332

333+
# VCF header generation
318334
header_lines_per_type = vcf_builder.build_vcf_header()
319-
vcf_header_file = write_header(vcf_output, pragmas_for_vcf, header_lines_per_type,
320-
is_missing_format_value, samples)
335+
header_fields = generate_vcf_header_line(is_missing_format=is_missing_format_value, samples=samples)
336+
vcf_header_file = write_header(vcf_output, pragmas_for_vcf, header_lines_per_type, header_fields, samples)
321337

322338
logger.info(f"Combining the header and data lines to the following VCF output: {vcf_output}")
339+
construct_vcf_output(vcf_header_file, vcf_data_file, vcf_output)
340+
341+
logger.info("Remove the temporary files")
342+
cleanup_temp_files([vcf_header_file, vcf_data_file])
343+
344+
logger.info("Printing the summary of conversion report.")
345+
vcf_output_directory = os.path.dirname(vcf_output)
346+
stats_summary_file = os.path.join(vcf_output_directory, "summary_stats.txt")
347+
report.print_report(stats_summary_file)
348+
349+
logger.info("GVF to VCF conversion complete")
350+
351+
352+
#helper functions for convert
353+
def construct_vcf_output(vcf_header_file, vcf_data_file, vcf_output):
323354
with open(vcf_output, "w") as vcf_output:
324355
with open(vcf_header_file, "r") as vcf_header_fh:
325356
for line in vcf_header_fh:
@@ -328,12 +359,23 @@ def convert(gvf_input, vcf_output, assembly):
328359
for line in vcf_data_fh:
329360
vcf_output.write(line)
330361
vcf_output.close()
331-
logger.info("Remove the temporary files")
332-
if os.path.exists(vcf_header_file):
333-
os.remove(vcf_header_file)
334-
if os.path.exists(vcf_data_file):
335-
os.remove(vcf_data_file)
336-
logger.info("GVF to VCF conversion complete")
362+
363+
def record_vcf_entry(open_data_lines, previous_vcf_line, report):
364+
# write the VCF line and record counts (use update when iterable)
365+
open_data_lines.write(str(previous_vcf_line) + "\n")
366+
report.vcf_data_line_count += 1
367+
report.vcf_chromosome_count[previous_vcf_line.chrom] += 1
368+
report.vcf_alt_alleles_count[previous_vcf_line.alt] += 1
369+
report.vcf_info_counter.update(previous_vcf_line.info_dict.keys())
370+
report.vcf_variant_region_SOID.update([previous_vcf_line.info_dict.get("VARREGSOID")])
371+
report.vcf_variant_call_SOID.update([previous_vcf_line.info_dict.get("VARCALLSOID")])
372+
report.vcf_sample_number_count.update(previous_vcf_line.order_sample_names)
373+
report.vcf_format_counter.update(previous_vcf_line.vcf_values_for_format)
374+
375+
def cleanup_temp_files(list_of_temp_files):
376+
for temp_file in list_of_temp_files:
377+
if os.path.exists(temp_file):
378+
os.remove(temp_file)
337379

338380
def main():
339381
# Parse command line arguments

convert_gvf_to_vcf/vcfline.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,16 @@ def __eq__(self, other_vcf_line):
479479
return (self.chrom == other_vcf_line.chrom) and (self.pos == other_vcf_line.pos) and (self.ref == other_vcf_line.ref)
480480
return False
481481

482+
def __gt__(self, other_vcf_line):
483+
""" This checks ordering by pos within the SAME chromosome for the merging algorithm.
484+
Order is not enforced between different chromosomes to avoid cross-chromosome ordering.
485+
:param: other_vcf_line: VcfLine object to compare against.
486+
:return: boolean - True if position comes after the previous vcf line on the same chromosome.
487+
"""
488+
if isinstance(other_vcf_line, VcfLine):
489+
return (self.chrom != other_vcf_line.chrom) or (self.pos > other_vcf_line.pos)
490+
return False
491+
482492
@property
483493
def format_keys(self):
484494
set_of_format_key = set([format_key

0 commit comments

Comments
 (0)