@@ -77,25 +77,66 @@ struct rd_stats { // keep track of good bases/reads in and out
7777 }
7878};
7979
80+
81+ struct uniq_summary {
82+ uniq_summary (const rd_stats &rs_in, const rd_stats &rs_out,
83+ const size_t reads_duped) {
84+ total_reads = rs_in.reads ;
85+ total_bases = rs_in.bases ;
86+ unique_reads = rs_out.reads ;
87+ unique_read_bases = rs_out.bases ;
88+ reads_removed = rs_in.reads - rs_out.reads ;
89+ non_duplicate_fraction = static_cast <double >(rs_out.reads - reads_duped) /
90+ std::max (1ul , rs_in.reads );
91+ duplication_rate = static_cast <double >(reads_removed + reads_duped) /
92+ std::max (1ul , reads_duped);
93+ duplicate_reads = reads_duped;
94+ }
95+
96+ // total_reads is the number of input reads
97+ size_t total_reads{};
98+ // total_bases is the total number of input bases
99+ size_t total_bases{};
100+ // unique_reads is the number of unique reads
101+ size_t unique_reads{};
102+ // unique_read_bases is the total number of bases for the unique reads
103+ size_t unique_read_bases{};
104+ // non_duplicate_fraction is the ratio of the number of unique reads with
105+ // no duplicates to that of the input reads
106+ double non_duplicate_fraction{};
107+ // duplicate_reads is the number of unique reads with at least one duplicate
108+ size_t duplicate_reads{};
109+ // reads_removed is the number of duplicate reads that have been removed
110+ size_t reads_removed{};
111+ // duplication_rate is the average number of duplicates for the reads with
112+ // at least one duplicate (>1 by definition)
113+ double duplication_rate{};
114+
115+ string tostring () {
116+ std::ostringstream oss;
117+ oss << " total_reads: " << total_reads << endl
118+ << " total_bases: " << total_bases << endl
119+ << " unique_reads: " << unique_reads << endl
120+ << " unique_read_bases: " << unique_read_bases << endl
121+ << " non_duplicate_fraction: " << non_duplicate_fraction << endl
122+ << " duplicate_reads: " << duplicate_reads << endl
123+ << " reads_removed: " << reads_removed << endl
124+ << " duplication_rate: " << duplication_rate;
125+
126+ return oss.str ();
127+ }
128+ };
129+
130+
131+
80132static void
81133write_stats_output (const rd_stats &rs_in, const rd_stats &rs_out,
82134 const size_t reads_duped, const string &statfile) {
83135 if (!statfile.empty ()) {
84- const size_t reads_removed = rs_in.reads - rs_out.reads ;
85- const double non_dup_frac =
86- (rs_out.reads - reads_duped) / static_cast <double >(rs_in.reads );
87- const double dup_rate =
88- (reads_removed + reads_duped) / static_cast <double >(reads_duped);
136+ uniq_summary summary (rs_in, rs_out, reads_duped);
89137 ofstream out_stat (statfile);
90138 if (!out_stat) throw runtime_error (" bad stats output file" );
91- out_stat << " total_reads: " << rs_in.reads << endl
92- << " total_bases: " << rs_in.bases << endl
93- << " unique_reads: " << rs_out.reads << endl
94- << " unique_read_bases: " << rs_out.bases << endl
95- << " non_duplicate_fraction: " << non_dup_frac << endl
96- << " duplicate_reads: " << reads_duped << endl
97- << " reads_removed: " << reads_removed << endl
98- << " duplication_rate: " << dup_rate << endl;
139+ out_stat << summary.tostring () << endl;
99140 }
100141}
101142
@@ -245,7 +286,7 @@ main_uniq(int argc, const char **argv) {
245286 " sorted mapped reads" ,
246287 " <in-file> [out-file]" , 2 );
247288 opt_parse.add_opt (" threads" , ' t' , " number of threads" , false , n_threads);
248- opt_parse.add_opt (" stats " , ' S' , " statistics output file" , false , statfile);
289+ opt_parse.add_opt (" summary " , ' S' , " statistics output file" , false , statfile);
249290 opt_parse.add_opt (" add-count" , ' a' , " add duplicate counts to reads" , false ,
250291 add_dup_count);
251292 opt_parse.add_opt (" hist" , ' \0 ' ,
0 commit comments