@@ -152,15 +152,63 @@ check_hairpins(const size_t name_suffix_len,
152152 return static_cast <double >(n_bad_reads)/n_reads;
153153}
154154
155+ struct hp_summary {
155156
157+ // n_reads is the total number of read pairs in the input fastq
158+ // files.
159+ uint64_t n_reads{};
160+
161+ // n_hairpin_reads is the number of read pairs identified as being
162+ // hairpins using the criteria in the "cutoff" variable.
163+ uint64_t n_hairpin_reads{};
164+
165+ // sum_percent_match_good is the sum of the percent matches between
166+ // the read ends for reads that do not meet the criteria for hairpin.
167+ double sum_percent_match_good{};
168+
169+ // sum_percent_match_good is the sum of the percent matches between
170+ // the read ends for reads that meet the criteria for hairpin.
171+ double sum_percent_match_bad{};
172+
173+ // cutoff is the fraction of matches between the two ends of the
174+ // read when matching under the assumption that the ends are from a
175+ // hairpin.
176+ double cutoff{};
177+
178+ // mean_percent_match_non_hairpin is the ratio of the
179+ // sum_percent_match_good over the total non-hairpin reads.
180+ double mean_percent_match_non_hairpin{};
181+
182+ // mean_percent_match_hairpin is the ratio of the
183+ // sum_percent_match_bad over the total hairpin reads.
184+ double mean_percent_match_hairpin{};
185+
186+ auto assign_values () -> void {
187+ mean_percent_match_non_hairpin =
188+ sum_percent_match_good / (n_reads - n_hairpin_reads);
189+ mean_percent_match_hairpin = sum_percent_match_bad / n_hairpin_reads;
190+ }
191+
192+ auto tostring () const -> string {
193+ std::ostringstream oss;
194+ oss << " total_reads_pairs: " << n_reads << ' \n '
195+ << " hairpin_read_pairs: " << n_hairpin_reads << ' \n '
196+ << " hairpin_cutoff: " << cutoff << ' \n '
197+ << " sum_percent_match_good: " << sum_percent_match_good << ' \n '
198+ << " mean_percent_match_non_hairpin: " << mean_percent_match_non_hairpin << ' \n '
199+ << " sum_percent_match_bad: " << sum_percent_match_bad << ' \n '
200+ << " mean_percent_match_hairpin: " << mean_percent_match_hairpin << ' \n ' ;
201+ return oss.str ();
202+ }
203+ };
156204
157205int
158206main_clean_hairpins (int argc, const char **argv) {
159-
160207 try {
161208
162209 string outfile;
163210 string stat_outfile;
211+ string hist_outfile;
164212
165213 double cutoff = 0.95 ;
166214 size_t name_suffix_len = 0 ;
@@ -175,22 +223,23 @@ main_clean_hairpins(int argc, const char **argv) {
175223 OptionParser opt_parse (strip_path (argv[0 ]),
176224 " fix and stat invdup/hairping reads" ,
177225 " <end1-fastq> <end2-fastq>" );
178- opt_parse.add_opt (" output" , ' o' ,
179- " output filename" , true , outfile);
180- opt_parse.add_opt (" stat" , ' s' ,
181- " stats output filename" , true , stat_outfile);
182- opt_parse.add_opt (" hairpin" , ' h' ,
183- " max hairpin rate" , false , max_hairpin_rate);
184- opt_parse.add_opt (" check" , ' \0 ' ,
185- " check for hairpin contamination" , false , check_first);
186- opt_parse.add_opt (" nreads" , ' n' ,
187- " number of reads in initial check" ,
188- false , reads_to_check);
226+ opt_parse.add_opt (" output" , ' o' , " output filename" , true , outfile);
227+ opt_parse.add_opt (" stat" , ' s' , " stats output filename" , true , stat_outfile);
228+ opt_parse.add_opt (" hairpin" , ' H' , " max hairpin rate" , false ,
229+ max_hairpin_rate);
230+ opt_parse.add_opt (" check" , ' \0 ' , " check for hairpin contamination" , false ,
231+ check_first);
232+ opt_parse.add_opt (" nreads" , ' n' , " number of reads in initial check" , false ,
233+ reads_to_check);
189234 opt_parse.add_opt (" cutoff" , ' c' ,
190- " cutoff for calling an invdup (default: 0.95)" ,
191- false , cutoff);
192- opt_parse.add_opt (" ignore" , ' i' , " length of read name suffix "
193- " to ignore when matching" , false , name_suffix_len);
235+ " cutoff for calling an invdup (default: 0.95)" , false ,
236+ cutoff);
237+ opt_parse.add_opt (" ignore" , ' i' ,
238+ " length of read name suffix "
239+ " to ignore when matching" ,
240+ false , name_suffix_len);
241+ opt_parse.add_opt (" hist" , ' \0 ' , " write a histogram of hairpin matches here" ,
242+ true , hist_outfile);
194243 opt_parse.add_opt (" verbose" , ' v' , " print more run info" , false , VERBOSE);
195244 vector<string> leftover_args;
196245 opt_parse.parse (argc, argv, leftover_args);
@@ -228,6 +277,10 @@ main_clean_hairpins(int argc, const char **argv) {
228277 }
229278 }
230279
280+ hp_summary hps;
281+ hps.cutoff = cutoff;
282+ vector<double > hist (20 , 0.0 );
283+
231284 if (!check_first || hairpin_fraction > max_hairpin_rate) {
232285
233286 // Input: paired-end reads with end1 and end2
@@ -244,13 +297,9 @@ main_clean_hairpins(int argc, const char **argv) {
244297 if (!out)
245298 throw runtime_error (" cannot open output file: " + outfile);
246299
247- size_t n_reads = 0 ;
248- size_t n_bad_reads = 0 ;
249- double sum_percent_match_good = 0.0 , sum_percent_match_bad = 0.0 ;
250-
251300 FASTQRecord end_one, end_two;
252301 while (in1 >> end_one && in2 >> end_two) {
253- ++n_reads ;
302+ hps. n_reads ++ ;
254303
255304 // two reads should be in paired-ends
256305 if (!mates (name_suffix_len, end_one, end_two))
@@ -265,28 +314,41 @@ main_clean_hairpins(int argc, const char **argv) {
265314 const double min_len = min (end_one.seq .length (), end_two.seq .length ());
266315 const double percent_match = sim/min_len;
267316
317+ // ADS: need a bitter way to get this bin identifier
318+ const int hist_bin = hist.size ()*((sim - 0.001 )/(min_len + 0.001 ));
319+ hist[hist_bin]++;
320+
268321 if (percent_match > cutoff) {
269322
270- sum_percent_match_bad += percent_match;
271- ++n_bad_reads ;
323+ hps. sum_percent_match_bad += percent_match;
324+ hps. n_hairpin_reads ++ ;
272325
273326 end_two.seq .clear ();
274327 end_two.score .clear ();
275328 }
276329 else
277- sum_percent_match_good += percent_match;
330+ hps. sum_percent_match_good += percent_match;
278331
279332 out << end_two << ' \n ' ;
280333 }
281334
282335 std::ofstream stat_out (stat_outfile);
283- stat_out << " total_reads_pairs: " << n_reads << endl
284- << " hairpin_read_pairs: " << n_bad_reads << endl
285- << " hairpin_cutoff: " << cutoff << endl
286- << " mean_percent_match_non_hairpin: "
287- << sum_percent_match_good/(n_reads - n_bad_reads) << endl
288- << " mean_percent_match_hairpin: "
289- << sum_percent_match_bad/n_bad_reads << endl;
336+ if (!stat_out)
337+ throw runtime_error (" failed to open file: " + stat_outfile);
338+ hps.assign_values ();
339+ stat_out << hps.tostring ();
340+
341+ if (!hist_outfile.empty ()) {
342+ std::ofstream hist_out (hist_outfile);
343+ if (!hist_out)
344+ throw runtime_error (" failed to open file: " + hist_outfile);
345+ const auto total = accumulate (cbegin (hist), cend (hist), 0.0 );
346+ transform (cbegin (hist), cend (hist), begin (hist),
347+ [&total](const double t) { return t / total; });
348+ hist_out.precision (3 );
349+ for (auto i = 0u ; i < std::size (hist); ++i)
350+ hist_out << i << ' \t ' << hist[i] << ' \n ' ;
351+ }
290352 }
291353 }
292354 catch (const runtime_error &e) {
0 commit comments