-
Notifications
You must be signed in to change notification settings - Fork 88
Expand file tree
/
Copy pathbasic_stats.py
More file actions
376 lines (327 loc) · 19.2 KB
/
basic_stats.py
File metadata and controls
376 lines (327 loc) · 19.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
############################################################################
# Copyright (c) 2015-2022 Saint Petersburg State University
# Copyright (c) 2011-2015 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################
from __future__ import with_statement
from __future__ import division
import os
import re
from os.path import join
from quast_libs import fastaparser, qconfig, qutils, reporting, plotter
from quast_libs.circos import set_window_size
from quast_libs.log import get_logger
from quast_libs.diputils import length_of_haplotypes
logger = get_logger(qconfig.LOGGER_DEFAULT_NAME)
MIN_HISTOGRAM_POINTS = 5
MIN_GC_WINDOW_SIZE = qconfig.GC_window_size // 2
def GC_content(contigs_fpath, skip=False):
"""
Returns percent of GC for assembly and GC distribution: (list of GC%, list of # windows)
"""
total_GC_amount = 0
total_contig_length = 0
GC_contigs_bin_num = int(100 / qconfig.GC_contig_bin_size) + 1
GC_contigs_distribution_x = [i * qconfig.GC_contig_bin_size for i in range(0, GC_contigs_bin_num)] # list of X-coordinates, i.e. GC %
GC_contigs_distribution_y = [0] * GC_contigs_bin_num # list of Y-coordinates, i.e. # contigs with GC % = x
GC_bin_num = int(100 / qconfig.GC_bin_size) + 1
GC_distribution_x = [i * qconfig.GC_bin_size for i in range(0, GC_bin_num)] # list of X-coordinates, i.e. GC %
GC_distribution_y = [0] * GC_bin_num # list of Y-coordinates, i.e. # windows with GC % = x
total_GC = None
if skip:
return total_GC, (GC_distribution_x, GC_distribution_y), (GC_contigs_distribution_x, GC_contigs_distribution_y)
for name, seq_full in fastaparser.read_fasta(contigs_fpath): # in tuples: (name, seq)
contig_ACGT_len = len(seq_full) - seq_full.count("N")
if not contig_ACGT_len:
continue
contig_GC_len = seq_full.count("G") + seq_full.count("C")
contig_GC_percent = 100.0 * contig_GC_len / contig_ACGT_len
GC_contigs_distribution_y[int(contig_GC_percent // qconfig.GC_contig_bin_size)] += 1
n = qconfig.GC_window_size
# non-overlapping windows
for seq in (seq_full[i:i+n] for i in range(0, len(seq_full), n)):
GC_percent = get_GC_percent(seq)
if GC_percent is not None:
GC_distribution_y[int(int(GC_percent / qconfig.GC_bin_size) * qconfig.GC_bin_size)] += 1
total_GC_amount += contig_GC_len
total_contig_length += contig_ACGT_len
if total_contig_length == 0:
total_GC = None
else:
total_GC = total_GC_amount * 100.0 / total_contig_length
return total_GC, (GC_distribution_x, GC_distribution_y), (GC_contigs_distribution_x, GC_contigs_distribution_y)
def get_GC_percent(seq):
if len(seq) < MIN_GC_WINDOW_SIZE:
return None
ACGT_len = len(seq) - seq.count("N")
# skip block if it has less than half of ACGT letters (it also helps with "ends of contigs")
if ACGT_len < len(seq) // 2:
return None
GC_len = seq.count("G") + seq.count("C")
GC_percent = 100 * GC_len // ACGT_len
return GC_percent
def save_icarus_GC(ref_fpath, gc_fpath):
chr_index = 0
window_size = qconfig.GC_window_size_large if qconfig.large_genome else qconfig.GC_window_size # non-overlapping windows
with open(gc_fpath, 'w') as out_f:
for name, seq_full in fastaparser.read_fasta(ref_fpath):
out_f.write('#' + name + ' ' + str(chr_index) + '\n')
for i in range(0, len(seq_full), window_size):
seq = seq_full[i:i + window_size]
GC_percent = get_GC_percent(seq)
if GC_percent is not None:
out_f.write(str(chr_index) + ' ' + str(GC_percent) + '\n')
def save_circos_GC(ref_fpath, reference_length, gc_fpath):
window_size = set_window_size(reference_length)
with open(gc_fpath, 'w') as out_f:
for name, seq_full in fastaparser.read_fasta(ref_fpath):
for i in range(0, len(seq_full), window_size):
seq = seq_full[i:i + window_size]
GC_percent = get_GC_percent(seq)
if GC_percent is not None:
out_f.write('\t'.join([name, str(i), str(i + len(seq)), str(GC_percent) + '\n']))
def binning_coverage(cov_values, nums_contigs):
min_bins_cnt = 5
bin_sizes = []
low_thresholds = []
high_thresholds = []
cov_by_bins = []
max_cov = max(len(v) for v in cov_values)
for values, num_contigs in zip(cov_values, nums_contigs):
assembly_len = sum(values)
bases_by_cov = []
for coverage, bases in enumerate(values):
bases_by_cov.extend([coverage] * bases)
q1 = bases_by_cov[assembly_len // 4]
q2 = bases_by_cov[assembly_len // 2]
q3 = bases_by_cov[assembly_len * 3 // 4]
iqr = q3 - q1
low_thresholds.append(int(q2 - 1.5 * iqr))
high_thresholds.append(int(q2 + 1.5 * iqr))
bin_sizes.append(int(2 * iqr / num_contigs ** (1.0 / 3)))
bin_size = max(min(bin_sizes), 1)
low_threshold = max(min(low_thresholds), 0)
high_threshold = min(max(high_thresholds), max_cov)
if (high_threshold - low_threshold) // bin_size < min_bins_cnt and bin_size > 1:
bin_size = max((high_threshold - low_threshold) // min_bins_cnt, 1)
low_threshold -= low_threshold % bin_size
high_threshold -= high_threshold % bin_size
max_points = (high_threshold // bin_size) + 1 # add last bin
if high_threshold - low_threshold < MIN_HISTOGRAM_POINTS:
low_threshold -= MIN_HISTOGRAM_POINTS // 2
high_threshold += MIN_HISTOGRAM_POINTS // 2
max_points = (high_threshold // bin_size) + 1
offset = 0
if low_threshold > bin_size: # add first bin
offset = low_threshold // bin_size - 1
max_points -= offset
else:
low_threshold = 0
for index, values in enumerate(cov_values):
cov_by_bins.append([0] * int(max_points))
for coverage, bases in enumerate(values):
bin_idx = coverage // bin_size - offset
if coverage < low_threshold:
bin_idx = 0
elif coverage >= high_threshold:
bin_idx = max_points - 1
cov_by_bins[int(index)][int(bin_idx)] += bases
return cov_by_bins, bin_size, low_threshold, high_threshold, max_cov
def draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath):
total_len = dict()
contigs_dict = dict()
contigs_with_coverage = [contigs_fpath for contigs_fpath in contigs_fpaths if coverage_dict[contigs_fpath]]
for contigs_fpath in contigs_fpaths:
total_len[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.TOTALLEN)
contigs_dict[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.CONTIGS)
cov_values = [coverage_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]
num_contigs = [contigs_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]
common_coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage(cov_values, num_contigs)
histogram_title = 'Coverage histogram (bin size: ' + str(bin_size) + 'x)'
plotter.coverage_histogram(contigs_with_coverage, common_coverage_values, output_dirpath + '/coverage_histogram',
histogram_title, bin_size=bin_size, max_cov=max_cov, low_threshold=low_threshold, high_threshold=high_threshold)
for contigs_fpath in contigs_with_coverage:
coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage([coverage_dict[contigs_fpath]],
[contigs_dict[contigs_fpath]])
label = qutils.label_from_fpath(contigs_fpath)
corr_label = qutils.label_from_fpath_for_fname(contigs_fpath)
histogram_title = label + ' coverage histogram (bin size: ' + str(bin_size) + 'x)'
histogram_fpath = os.path.join(output_dirpath, corr_label + '_coverage_histogram')
plotter.coverage_histogram([contigs_fpath], coverage_values, histogram_fpath,
histogram_title, draw_bars=True, bin_size=bin_size, max_cov=max_cov,
low_threshold=low_threshold, high_threshold=high_threshold)
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
logger.print_timestamp()
logger.main_info("Running Basic statistics processor...")
if not os.path.isdir(output_dirpath):
os.mkdir(output_dirpath)
reference_length = None
reference_lengths = []
reference_fragments = None
icarus_gc_fpath = None
circos_gc_fpath = None
if ref_fpath:
reference_lengths = sorted(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(), reverse=True)
reference_fragments = len(reference_lengths)
reference_length = sum(reference_lengths)
reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(ref_fpath)
if qconfig.create_icarus_html or qconfig.draw_plots:
icarus_gc_fpath = join(output_dirpath, 'gc.icarus.txt')
save_icarus_GC(ref_fpath, icarus_gc_fpath)
if qconfig.draw_circos:
circos_gc_fpath = join(output_dirpath, 'gc.circos.txt')
save_circos_GC(ref_fpath, reference_length, circos_gc_fpath)
logger.info(' Reference genome:')
logger.info(' ' + os.path.basename(ref_fpath) + ', length = ' + str(reference_length) +
', num fragments = ' + str(reference_fragments) + ', GC % = ' +
'%.2f' % reference_GC if reference_GC is not None else 'undefined')
if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
logger.warning(' Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).')
elif qconfig.estimated_reference_size:
reference_length = qconfig.estimated_reference_size
reference_lengths = [reference_length]
logger.info(' Estimated reference length = ' + str(reference_length))
logger.info(' Contig files: ')
lists_of_lengths = []
numbers_of_Ns = []
coverage_dict = dict()
cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
for id, contigs_fpath in enumerate(contigs_fpaths):
coverage_dict[contigs_fpath] = []
assembly_label = qutils.label_from_fpath(contigs_fpath)
logger.info(' ' + qutils.index_to_str(id) + assembly_label)
# lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath))
list_of_length = []
number_of_Ns = 0
for (name, seq) in fastaparser.read_fasta(contigs_fpath):
list_of_length.append(len(seq))
number_of_Ns += seq.count('N')
if cov_pattern.findall(name):
cov = int(float(cov_pattern.findall(name)[0]))
if len(coverage_dict[contigs_fpath]) <= cov:
coverage_dict[contigs_fpath] += [0] * (cov - len(coverage_dict[contigs_fpath]) + 1)
coverage_dict[contigs_fpath][cov] += len(seq)
lists_of_lengths.append(list_of_length)
numbers_of_Ns.append(number_of_Ns)
lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths]
num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])
multiplicator = 1
if num_contigs >= (qconfig.max_points * 2):
import math
multiplicator = int(num_contigs / qconfig.max_points)
max_points = num_contigs // multiplicator
corr_lists_of_lengths = [[sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)]) for i in range(1, max_points)
if (i * multiplicator) < len(list_of_length)] for list_of_length in lists_of_lengths]
if len(reference_lengths) > 1:
reference_lengths = [sum(reference_lengths[((i - 1) * multiplicator):(i * multiplicator)])
if (i * multiplicator) < len(reference_lengths) else
sum(reference_lengths[((i - 1) * multiplicator):])
for i in range(1, max_points)] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
for num_list in range(len(corr_lists_of_lengths)):
last_index = len(corr_lists_of_lengths[num_list])
corr_lists_of_lengths[num_list].append(sum(lists_of_lengths[num_list][last_index * multiplicator:]))
else:
corr_lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths]
if reference_lengths:
# Saving for an HTML report
if qconfig.html_report:
from quast_libs.html_saver import html_saver
html_saver.save_reference_lengths(results_dir, reference_lengths)
if qconfig.html_report:
from quast_libs.html_saver import html_saver
html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
html_saver.save_tick_x(results_dir, multiplicator)
########################################################################
logger.info(' Calculating N50 and L50...')
list_of_GC_distributions = []
list_of_GC_contigs_distributions = []
largest_contig = 0
from . import N50
for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
report = reporting.get(contigs_fpath)
n50, l50 = N50.N50_and_L50(lengths_list)
auN = N50.au_metric(lengths_list)
ng50, lg50 = None, None
if reference_length:
ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
auNG = N50.au_metric(lengths_list, reference_length)
nx, lx = N50.N50_and_L50(lengths_list, qconfig.x_for_additional_Nx)
ngx, lgx = None, None
if reference_length:
ngx, lgx = N50.NG50_and_LG50(lengths_list, reference_length, qconfig.x_for_additional_Nx)
total_length = sum(lengths_list)
total_GC, GC_distribution, GC_contigs_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
list_of_GC_distributions.append(GC_distribution)
list_of_GC_contigs_distributions.append(GC_contigs_distribution)
logger.info(' ' + qutils.index_to_str(id) +
qutils.label_from_fpath(contigs_fpath) + \
', N50 = ' + str(n50) + \
', L50 = ' + str(l50) + \
', auN = ' + ('%.1f' % auN if auN is not None else None) + \
', Total length = ' + str(total_length) + \
', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') + \
', # N\'s per 100 kbp = ' + ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)) if total_length != 0 else 'undefined')
report.add_field(reporting.Fields.N50, n50)
report.add_field(reporting.Fields.L50, l50)
report.add_field(reporting.Fields.auN, ('%.1f' % auN if auN is not None else None))
if reference_length and not qconfig.is_combined_ref:
report.add_field(reporting.Fields.NG50, ng50)
report.add_field(reporting.Fields.LG50, lg50)
report.add_field(reporting.Fields.auNG, ('%.1f' % auNG if auNG is not None else None))
report.add_field(reporting.Fields.Nx, nx)
report.add_field(reporting.Fields.Lx, lx)
if reference_length and not qconfig.is_combined_ref:
report.add_field(reporting.Fields.NGx, ngx)
report.add_field(reporting.Fields.LGx, lgx)
report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
if lengths_list:
report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
largest_contig = max(largest_contig, max(lengths_list))
report.add_field(reporting.Fields.TOTALLEN, total_length)
if not qconfig.is_combined_ref:
report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
if ref_fpath:
report.add_field(reporting.Fields.REFLEN, int(reference_length))
report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
if qconfig.ploid_mode:
report.add_field(reporting.Fields.REFLEN_HAPLOTYPES,
[int(l) for l in length_of_haplotypes.values()])
if not qconfig.is_combined_ref:
report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None))
elif reference_length:
report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))
import math
qconfig.min_difference = math.ceil((largest_contig / 1000) / 600) # divide on height of plot
list_of_GC_distributions_with_ref = list_of_GC_distributions
reference_index = None
if ref_fpath:
reference_index = len(list_of_GC_distributions_with_ref)
list_of_GC_distributions_with_ref.append(reference_GC_distribution)
if qconfig.html_report and not qconfig.no_gc:
from quast_libs.html_saver import html_saver
html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref, list_of_GC_contigs_distributions, reference_index)
########################################################################
# Drawing Nx and NGx plots...
plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'Nx_plot'), 'Nx', [])
if reference_length and not qconfig.is_combined_ref:
plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'NGx_plot'), 'NGx',
[reference_length for i in range(len(contigs_fpaths))])
if qconfig.draw_plots:
########################################################################import plotter
# Drawing cumulative plot...
plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'cumulative_plot'), 'Cumulative length')
if not qconfig.no_gc:
########################################################################
# Drawing GC content plot...
plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, join(output_dirpath, 'GC_content_plot'))
for contigs_fpath, GC_distribution in zip(contigs_fpaths, list_of_GC_contigs_distributions):
plotter.contigs_GC_content_plot(contigs_fpath, GC_distribution,
join(output_dirpath, qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot'))
if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)
logger.main_info('Done.')
return icarus_gc_fpath, circos_gc_fpath