Skip to content

Commit 1ad5ffd

Browse files
clean-hairpins.cpp: Adding a struct to keep track of summary statistics and the functionality to output the histogram of hairpin matchig for all reads into a file.
1 parent e545296 commit 1ad5ffd

File tree

1 file changed

+88
-38
lines changed

1 file changed

+88
-38
lines changed

src/utils/clean-hairpins.cpp

Lines changed: 88 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -152,15 +152,63 @@ check_hairpins(const size_t name_suffix_len,
152152
return static_cast<double>(n_bad_reads)/n_reads;
153153
}
154154

155+
struct hp_summary {
155156

157+
// n_reads is the total number of read pairs in the input fastq
158+
// files.
159+
uint64_t n_reads{};
160+
161+
// n_hairpin_reads is the number of read pairs identified as being
162+
// hairpins using the criteria in the "cutoff" variable.
163+
uint64_t n_hairpin_reads{};
164+
165+
// sum_percent_match_good is the sum of the percent matches between
166+
// the read ends for reads that do not meet the criteria for hairpin.
167+
double sum_percent_match_good{};
168+
169+
// sum_percent_match_good is the sum of the percent matches between
170+
// the read ends for reads that meet the criteria for hairpin.
171+
double sum_percent_match_bad{};
172+
173+
// cutoff is the fraction of matches between the two ends of the
174+
// read when matching under the assumption that the ends are from a
175+
// hairpin.
176+
double cutoff{};
177+
178+
// mean_percent_match_non_hairpin is the ratio of the
179+
// sum_percent_match_good over the total non-hairpin reads.
180+
double mean_percent_match_non_hairpin{};
181+
182+
// mean_percent_match_hairpin is the ratio of the
183+
// sum_percent_match_bad over the total hairpin reads.
184+
double mean_percent_match_hairpin{};
185+
186+
auto assign_values() -> void {
187+
mean_percent_match_non_hairpin =
188+
sum_percent_match_good / (n_reads - n_hairpin_reads);
189+
mean_percent_match_hairpin = sum_percent_match_bad / n_hairpin_reads;
190+
}
191+
192+
auto tostring() const -> string {
193+
std::ostringstream oss;
194+
oss << "total_reads_pairs: " << n_reads << '\n'
195+
<< "hairpin_read_pairs: " << n_hairpin_reads << '\n'
196+
<< "hairpin_cutoff: " << cutoff << '\n'
197+
<< "sum_percent_match_good: " << sum_percent_match_good << '\n'
198+
<< "mean_percent_match_non_hairpin: " << mean_percent_match_non_hairpin << '\n'
199+
<< "sum_percent_match_bad: " << sum_percent_match_bad << '\n'
200+
<< "mean_percent_match_hairpin: " << mean_percent_match_hairpin << '\n';
201+
return oss.str();
202+
}
203+
};
156204

157205
int
158206
main_clean_hairpins(int argc, const char **argv) {
159-
160207
try {
161208

162209
string outfile;
163210
string stat_outfile;
211+
string hist_outfile;
164212

165213
double cutoff = 0.95;
166214
size_t name_suffix_len = 0;
@@ -175,22 +223,23 @@ main_clean_hairpins(int argc, const char **argv) {
175223
OptionParser opt_parse(strip_path(argv[0]),
176224
"fix and stat invdup/hairping reads",
177225
"<end1-fastq> <end2-fastq>");
178-
opt_parse.add_opt("output", 'o',
179-
"output filename", true, outfile);
180-
opt_parse.add_opt("stat", 's',
181-
"stats output filename", true, stat_outfile);
182-
opt_parse.add_opt("hairpin", 'h',
183-
"max hairpin rate", false, max_hairpin_rate);
184-
opt_parse.add_opt("check", '\0',
185-
"check for hairpin contamination", false, check_first);
186-
opt_parse.add_opt("nreads", 'n',
187-
"number of reads in initial check",
188-
false, reads_to_check);
226+
opt_parse.add_opt("output", 'o', "output filename", true, outfile);
227+
opt_parse.add_opt("stat", 's', "stats output filename", true, stat_outfile);
228+
opt_parse.add_opt("hairpin", 'H', "max hairpin rate", false,
229+
max_hairpin_rate);
230+
opt_parse.add_opt("check", '\0', "check for hairpin contamination", false,
231+
check_first);
232+
opt_parse.add_opt("nreads", 'n', "number of reads in initial check", false,
233+
reads_to_check);
189234
opt_parse.add_opt("cutoff", 'c',
190-
"cutoff for calling an invdup (default: 0.95)",
191-
false, cutoff);
192-
opt_parse.add_opt("ignore", 'i', "length of read name suffix "
193-
"to ignore when matching", false, name_suffix_len);
235+
"cutoff for calling an invdup (default: 0.95)", false,
236+
cutoff);
237+
opt_parse.add_opt("ignore", 'i',
238+
"length of read name suffix "
239+
"to ignore when matching",
240+
false, name_suffix_len);
241+
opt_parse.add_opt("hist", '\0', "write a histogram of hairpin matches here",
242+
true, hist_outfile);
194243
opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
195244
vector<string> leftover_args;
196245
opt_parse.parse(argc, argv, leftover_args);
@@ -228,6 +277,8 @@ main_clean_hairpins(int argc, const char **argv) {
228277
}
229278
}
230279

280+
hp_summary hps;
281+
hps.cutoff = cutoff;
231282
vector<double> hist(20, 0.0);
232283

233284
if (!check_first || hairpin_fraction > max_hairpin_rate) {
@@ -246,13 +297,9 @@ main_clean_hairpins(int argc, const char **argv) {
246297
if (!out)
247298
throw runtime_error("cannot open output file: " + outfile);
248299

249-
size_t n_reads = 0;
250-
size_t n_bad_reads = 0;
251-
double sum_percent_match_good = 0.0, sum_percent_match_bad = 0.0;
252-
253300
FASTQRecord end_one, end_two;
254301
while (in1 >> end_one && in2 >> end_two) {
255-
++n_reads;
302+
hps.n_reads++;
256303

257304
// two reads should be in paired-ends
258305
if (!mates(name_suffix_len, end_one, end_two))
@@ -267,38 +314,41 @@ main_clean_hairpins(int argc, const char **argv) {
267314
const double min_len = min(end_one.seq.length(), end_two.seq.length());
268315
const double percent_match = sim/min_len;
269316

317+
// ADS: need a bitter way to get this bin identifier
270318
const int hist_bin = hist.size()*((sim - 0.001)/(min_len + 0.001));
271319
hist[hist_bin]++;
272320

273321
if (percent_match > cutoff) {
274322

275-
sum_percent_match_bad += percent_match;
276-
++n_bad_reads;
323+
hps.sum_percent_match_bad += percent_match;
324+
hps.n_hairpin_reads++;
277325

278326
end_two.seq.clear();
279327
end_two.score.clear();
280328
}
281329
else
282-
sum_percent_match_good += percent_match;
330+
hps.sum_percent_match_good += percent_match;
283331

284332
out << end_two << '\n';
285333
}
286334

287335
std::ofstream stat_out(stat_outfile);
288-
stat_out << "total_reads_pairs: " << n_reads << endl
289-
<< "hairpin_read_pairs: " << n_bad_reads << endl
290-
<< "hairpin_cutoff: " << cutoff << endl
291-
<< "mean_percent_match_non_hairpin: "
292-
<< sum_percent_match_good/(n_reads - n_bad_reads) << endl
293-
<< "mean_percent_match_hairpin: "
294-
<< sum_percent_match_bad/n_bad_reads << endl;
295-
296-
const auto total = accumulate(cbegin(hist), cend(hist), 0.0);
297-
transform(cbegin(hist), cend(hist), begin(hist),
298-
[&total](const double t) {return t/total;});
299-
for (auto i = 0; i < std::size(hist); ++i)
300-
cout << i << '\t' << std::setprecision(3) << hist[i] << endl;
301-
cout << endl;
336+
if (!stat_out)
337+
throw runtime_error("failed to open file: " + stat_outfile);
338+
hps.assign_values();
339+
stat_out << hps.tostring();
340+
341+
if (!hist_outfile.empty()) {
342+
std::ofstream hist_out(hist_outfile);
343+
if (!hist_out)
344+
throw runtime_error("failed to open file: " + hist_outfile);
345+
const auto total = accumulate(cbegin(hist), cend(hist), 0.0);
346+
transform(cbegin(hist), cend(hist), begin(hist),
347+
[&total](const double t) { return t / total; });
348+
hist_out.precision(3);
349+
for (auto i = 0u; i < std::size(hist); ++i)
350+
hist_out << i << '\t' << hist[i] << '\n';
351+
}
302352
}
303353
}
304354
catch (const runtime_error &e) {

0 commit comments

Comments
 (0)