Skip to content

Commit 00821e0

Browse files
Merge pull request #145 from smithlabcode/clean-hairpins-histogram
Clean hairpins histogram
2 parents ca7c708 + 1ad5ffd commit 00821e0

File tree

1 file changed

+93
-31
lines changed

1 file changed

+93
-31
lines changed

src/utils/clean-hairpins.cpp

Lines changed: 93 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -152,15 +152,63 @@ check_hairpins(const size_t name_suffix_len,
152152
return static_cast<double>(n_bad_reads)/n_reads;
153153
}
154154

155+
struct hp_summary {
155156

157+
// n_reads is the total number of read pairs in the input fastq
158+
// files.
159+
uint64_t n_reads{};
160+
161+
// n_hairpin_reads is the number of read pairs identified as being
162+
// hairpins using the criteria in the "cutoff" variable.
163+
uint64_t n_hairpin_reads{};
164+
165+
// sum_percent_match_good is the sum of the percent matches between
166+
// the read ends for reads that do not meet the criteria for hairpin.
167+
double sum_percent_match_good{};
168+
169+
// sum_percent_match_good is the sum of the percent matches between
170+
// the read ends for reads that meet the criteria for hairpin.
171+
double sum_percent_match_bad{};
172+
173+
// cutoff is the fraction of matches between the two ends of the
174+
// read when matching under the assumption that the ends are from a
175+
// hairpin.
176+
double cutoff{};
177+
178+
// mean_percent_match_non_hairpin is the ratio of the
179+
// sum_percent_match_good over the total non-hairpin reads.
180+
double mean_percent_match_non_hairpin{};
181+
182+
// mean_percent_match_hairpin is the ratio of the
183+
// sum_percent_match_bad over the total hairpin reads.
184+
double mean_percent_match_hairpin{};
185+
186+
auto assign_values() -> void {
187+
mean_percent_match_non_hairpin =
188+
sum_percent_match_good / (n_reads - n_hairpin_reads);
189+
mean_percent_match_hairpin = sum_percent_match_bad / n_hairpin_reads;
190+
}
191+
192+
auto tostring() const -> string {
193+
std::ostringstream oss;
194+
oss << "total_reads_pairs: " << n_reads << '\n'
195+
<< "hairpin_read_pairs: " << n_hairpin_reads << '\n'
196+
<< "hairpin_cutoff: " << cutoff << '\n'
197+
<< "sum_percent_match_good: " << sum_percent_match_good << '\n'
198+
<< "mean_percent_match_non_hairpin: " << mean_percent_match_non_hairpin << '\n'
199+
<< "sum_percent_match_bad: " << sum_percent_match_bad << '\n'
200+
<< "mean_percent_match_hairpin: " << mean_percent_match_hairpin << '\n';
201+
return oss.str();
202+
}
203+
};
156204

157205
int
158206
main_clean_hairpins(int argc, const char **argv) {
159-
160207
try {
161208

162209
string outfile;
163210
string stat_outfile;
211+
string hist_outfile;
164212

165213
double cutoff = 0.95;
166214
size_t name_suffix_len = 0;
@@ -175,22 +223,23 @@ main_clean_hairpins(int argc, const char **argv) {
175223
OptionParser opt_parse(strip_path(argv[0]),
176224
"fix and stat invdup/hairping reads",
177225
"<end1-fastq> <end2-fastq>");
178-
opt_parse.add_opt("output", 'o',
179-
"output filename", true, outfile);
180-
opt_parse.add_opt("stat", 's',
181-
"stats output filename", true, stat_outfile);
182-
opt_parse.add_opt("hairpin", 'h',
183-
"max hairpin rate", false, max_hairpin_rate);
184-
opt_parse.add_opt("check", '\0',
185-
"check for hairpin contamination", false, check_first);
186-
opt_parse.add_opt("nreads", 'n',
187-
"number of reads in initial check",
188-
false, reads_to_check);
226+
opt_parse.add_opt("output", 'o', "output filename", true, outfile);
227+
opt_parse.add_opt("stat", 's', "stats output filename", true, stat_outfile);
228+
opt_parse.add_opt("hairpin", 'H', "max hairpin rate", false,
229+
max_hairpin_rate);
230+
opt_parse.add_opt("check", '\0', "check for hairpin contamination", false,
231+
check_first);
232+
opt_parse.add_opt("nreads", 'n', "number of reads in initial check", false,
233+
reads_to_check);
189234
opt_parse.add_opt("cutoff", 'c',
190-
"cutoff for calling an invdup (default: 0.95)",
191-
false, cutoff);
192-
opt_parse.add_opt("ignore", 'i', "length of read name suffix "
193-
"to ignore when matching", false, name_suffix_len);
235+
"cutoff for calling an invdup (default: 0.95)", false,
236+
cutoff);
237+
opt_parse.add_opt("ignore", 'i',
238+
"length of read name suffix "
239+
"to ignore when matching",
240+
false, name_suffix_len);
241+
opt_parse.add_opt("hist", '\0', "write a histogram of hairpin matches here",
242+
true, hist_outfile);
194243
opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
195244
vector<string> leftover_args;
196245
opt_parse.parse(argc, argv, leftover_args);
@@ -228,6 +277,10 @@ main_clean_hairpins(int argc, const char **argv) {
228277
}
229278
}
230279

280+
hp_summary hps;
281+
hps.cutoff = cutoff;
282+
vector<double> hist(20, 0.0);
283+
231284
if (!check_first || hairpin_fraction > max_hairpin_rate) {
232285

233286
// Input: paired-end reads with end1 and end2
@@ -244,13 +297,9 @@ main_clean_hairpins(int argc, const char **argv) {
244297
if (!out)
245298
throw runtime_error("cannot open output file: " + outfile);
246299

247-
size_t n_reads = 0;
248-
size_t n_bad_reads = 0;
249-
double sum_percent_match_good = 0.0, sum_percent_match_bad = 0.0;
250-
251300
FASTQRecord end_one, end_two;
252301
while (in1 >> end_one && in2 >> end_two) {
253-
++n_reads;
302+
hps.n_reads++;
254303

255304
// two reads should be in paired-ends
256305
if (!mates(name_suffix_len, end_one, end_two))
@@ -265,28 +314,41 @@ main_clean_hairpins(int argc, const char **argv) {
265314
const double min_len = min(end_one.seq.length(), end_two.seq.length());
266315
const double percent_match = sim/min_len;
267316

317+
// ADS: need a bitter way to get this bin identifier
318+
const int hist_bin = hist.size()*((sim - 0.001)/(min_len + 0.001));
319+
hist[hist_bin]++;
320+
268321
if (percent_match > cutoff) {
269322

270-
sum_percent_match_bad += percent_match;
271-
++n_bad_reads;
323+
hps.sum_percent_match_bad += percent_match;
324+
hps.n_hairpin_reads++;
272325

273326
end_two.seq.clear();
274327
end_two.score.clear();
275328
}
276329
else
277-
sum_percent_match_good += percent_match;
330+
hps.sum_percent_match_good += percent_match;
278331

279332
out << end_two << '\n';
280333
}
281334

282335
std::ofstream stat_out(stat_outfile);
283-
stat_out << "total_reads_pairs: " << n_reads << endl
284-
<< "hairpin_read_pairs: " << n_bad_reads << endl
285-
<< "hairpin_cutoff: " << cutoff << endl
286-
<< "mean_percent_match_non_hairpin: "
287-
<< sum_percent_match_good/(n_reads - n_bad_reads) << endl
288-
<< "mean_percent_match_hairpin: "
289-
<< sum_percent_match_bad/n_bad_reads << endl;
336+
if (!stat_out)
337+
throw runtime_error("failed to open file: " + stat_outfile);
338+
hps.assign_values();
339+
stat_out << hps.tostring();
340+
341+
if (!hist_outfile.empty()) {
342+
std::ofstream hist_out(hist_outfile);
343+
if (!hist_out)
344+
throw runtime_error("failed to open file: " + hist_outfile);
345+
const auto total = accumulate(cbegin(hist), cend(hist), 0.0);
346+
transform(cbegin(hist), cend(hist), begin(hist),
347+
[&total](const double t) { return t / total; });
348+
hist_out.precision(3);
349+
for (auto i = 0u; i < std::size(hist); ++i)
350+
hist_out << i << '\t' << hist[i] << '\n';
351+
}
290352
}
291353
}
292354
catch (const runtime_error &e) {

0 commit comments

Comments
 (0)