@@ -145,6 +145,12 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
145145 seq_vec = std::move (ind_vec);
146146 }
147147
148+ void add_sequence (sequence_stats const & seq)
149+ {
150+ seq_vec.push_back (seq.ind );
151+ len += seq.len ;
152+ }
153+
148154 std::string unique_id ()
149155 {
150156 std::string str_id{};
@@ -175,8 +181,6 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
175181
176182 inline bool operator () (segment_stats const & left, segment_stats const & right)
177183 {
178- if (left.seq_vec .size () > 1 || right.seq_vec .size () > 1 )
179- throw std::runtime_error (" Can't order sets of sets of sequences." );
180184 return (left.seq_vec [0 ] < right.seq_vec [0 ]);
181185 }
182186 };
@@ -265,15 +269,35 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
265269 * @param overlap Length of overlap between adjacent segments.
266270 * @param seq_it Iterator to first sequence of sufficient length.
267271 */
268- template <typename it_t >
269- void make_exactly_n_segments (size_t const & n, size_t const & overlap, it_t & seq_it)
272+ void make_exactly_n_segments (uint32_t const & n, size_t const & overlap)
270273 {
271- size_t remaining_db_len = total_len;
272- for (auto it = seq_it; it != sequences.end (); it++)
274+ uint64_t remaining_db_len = total_len;
275+ size_t len_lower_bound = default_seg_len / 10 ;
276+ auto first_long_seq = std::find_if (sequences.begin (), sequences.end (), [&](auto const & seq){return (seq.len > len_lower_bound);});
277+
278+ segment_stats short_sequences{};
279+ short_sequences.start = 0 ;
280+ short_sequences.id = segments.size ();
281+
282+ uint64_t short_len{0 };
283+ for (auto it = sequences.begin (); it < sequences.end (); it++)
273284 {
274285 auto seq = *it;
275286 size_t start = 0 ;
276- if (seq.len <= default_seg_len * 1.5 )
287+ if (seq.len < len_lower_bound)
288+ {
289+ short_len += seq.len ;
290+ short_sequences.add_sequence (seq);
291+ if ((short_len >= default_seg_len) ||
292+ ((it + 1 ) == first_long_seq))
293+ {
294+ segments.push_back (short_sequences);
295+ short_sequences.seq_vec .clear ();
296+ remaining_db_len -= short_len;
297+ short_len = 0 ;
298+ }
299+ }
300+ else if (seq.len <= default_seg_len * 1.5 )
277301 {
278302 // database sequence is single segment
279303 add_segment (seq.ind , start, seq.len );
@@ -319,10 +343,9 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
319343 * @param overlap Length of overlap between adjacent segments.
320344 * @param seq_it Iterator to first sequence of sufficient length.
321345 */
322- template <typename it_t >
323- void make_equal_length_segments (size_t const & overlap, it_t & seq_it)
346+ void make_equal_length_segments (size_t const & overlap)
324347 {
325- for (auto it = seq_it ; it != sequences.end (); it++)
348+ for (auto it = sequences. begin () ; it != sequences.end (); it++)
326349 {
327350 auto seq = *it;
328351 size_t start = 0 ;
@@ -371,28 +394,16 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
371394 std::to_string (arguments.pattern_size ) + " bp.\n Decrease the overlap or the number of segments." );
372395 }
373396
374- size_t len_lower_bound = default_seg_len / 10 ;
375- // Check how many sequences are discarded for being too short
376- // !TODO: gather short sequences
377- auto first_long_seq = std::find_if (sequences.begin (), sequences.end (), [&](auto const & seq){return (seq.len > len_lower_bound);});
378- size_t discarded_short_sequences = first_long_seq - sequences.begin ();
379-
380- for (auto it = sequences.begin (); it < first_long_seq; it++)
381- {
382- seqan3::debug_stream << " Sequence: " << (*it).id << " is too short and will be skipped.\n " ;
383- total_len -= (*it).len ;
384- }
385-
386- if (arguments.seg_count < (sequences.size () - discarded_short_sequences))
397+ if (arguments.seg_count < sequences.size ())
387398 {
388399 throw std::runtime_error (" Can not split " + std::to_string (sequences.size ()) +
389400 " sequences into " + std::to_string (arguments.seg_count ) + " segments." );
390401 }
391402
392403 if constexpr (std::is_same<arg_t , build_arguments>::value)
393- make_exactly_n_segments (arguments.seg_count , arguments.pattern_size , first_long_seq );
404+ make_exactly_n_segments (arguments.seg_count , arguments.pattern_size );
394405 else
395- make_equal_length_segments (arguments.pattern_size , first_long_seq );
406+ make_equal_length_segments (arguments.pattern_size );
396407
397408 std::stable_sort (sequences.begin (), sequences.end (), fasta_order ());
398409 std::stable_sort (segments.begin (), segments.end (), fasta_order ());
@@ -413,8 +424,7 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
413424 std::to_string (arguments.pattern_size ) + " bp.\n Decrease the overlap or the number of segments." );
414425 }
415426
416- auto seq_begin = sequences.begin ();
417- make_equal_length_segments (arguments.pattern_size , seq_begin);
427+ make_equal_length_segments (arguments.pattern_size );
418428 seg_count = segments.size ();
419429
420430 std::stable_sort (sequences.begin (), sequences.end (), fasta_order ());
@@ -561,7 +571,7 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
561571 segment_stats seg = segments[seg_id];
562572 out_str << seg_id << ' \t ' ;
563573 for (size_t ind : seg.seq_vec )
564- out_str << ind << ' \t ' ;
574+ out_str << ind << ' ; ' ;
565575 out_str << seg.start << ' \t ' << seg.len << ' \n ' ;
566576 }
567577
0 commit comments