Skip to content

Commit f9861d4

Browse files
authored
Gather scaffolds (#154)
* Gather short scaffolds into a single bin * Count all minimisers by default
2 parents e4f5aa1 + 1c24f82 commit f9861d4

File tree

8 files changed

+51
-36
lines changed

8 files changed

+51
-36
lines changed

include/valik/shared.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,8 @@ struct build_arguments final : public split_arguments
9393
bool manual_parameters{false};
9494
bool input_is_minimiser{false};
9595

96-
uint8_t kmer_count_min_cutoff{2};
97-
uint8_t kmer_count_max_cutoff{64};
96+
uint8_t kmer_count_min_cutoff{0};
97+
uint8_t kmer_count_max_cutoff{254};
9898
bool use_filesize_dependent_cutoff{false};
9999

100100
std::filesystem::path ref_meta_path{};

include/valik/split/metadata.hpp

Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,12 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
145145
seq_vec = std::move(ind_vec);
146146
}
147147

148+
void add_sequence(sequence_stats const & seq)
149+
{
150+
seq_vec.push_back(seq.ind);
151+
len += seq.len;
152+
}
153+
148154
std::string unique_id()
149155
{
150156
std::string str_id{};
@@ -175,8 +181,6 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
175181

176182
inline bool operator() (segment_stats const & left, segment_stats const & right)
177183
{
178-
if (left.seq_vec.size() > 1 || right.seq_vec.size() > 1)
179-
throw std::runtime_error("Can't order sets of sets of sequences.");
180184
return (left.seq_vec[0] < right.seq_vec[0]);
181185
}
182186
};
@@ -265,15 +269,35 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
265269
* @param overlap Length of overlap between adjacent segments.
266270
* @param seq_it Iterator to first sequence of sufficient length.
267271
*/
268-
template <typename it_t>
269-
void make_exactly_n_segments(size_t const & n, size_t const & overlap, it_t & seq_it)
272+
void make_exactly_n_segments(uint32_t const & n, size_t const & overlap)
270273
{
271-
size_t remaining_db_len = total_len;
272-
for (auto it = seq_it; it != sequences.end(); it++)
274+
uint64_t remaining_db_len = total_len;
275+
size_t len_lower_bound = default_seg_len / 10;
276+
auto first_long_seq = std::find_if(sequences.begin(), sequences.end(), [&](auto const & seq){return (seq.len > len_lower_bound);});
277+
278+
segment_stats short_sequences{};
279+
short_sequences.start = 0;
280+
short_sequences.id = segments.size();
281+
282+
uint64_t short_len{0};
283+
for (auto it = sequences.begin(); it < sequences.end(); it++)
273284
{
274285
auto seq = *it;
275286
size_t start = 0;
276-
if (seq.len <= default_seg_len * 1.5)
287+
if (seq.len < len_lower_bound)
288+
{
289+
short_len += seq.len;
290+
short_sequences.add_sequence(seq);
291+
if ((short_len >= default_seg_len) ||
292+
((it + 1) == first_long_seq))
293+
{
294+
segments.push_back(short_sequences);
295+
short_sequences.seq_vec.clear();
296+
remaining_db_len -= short_len;
297+
short_len = 0;
298+
}
299+
}
300+
else if (seq.len <= default_seg_len * 1.5)
277301
{
278302
// database sequence is single segment
279303
add_segment(seq.ind, start, seq.len);
@@ -319,10 +343,9 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
319343
* @param overlap Length of overlap between adjacent segments.
320344
* @param seq_it Iterator to first sequence of sufficient length.
321345
*/
322-
template <typename it_t>
323-
void make_equal_length_segments(size_t const & overlap, it_t & seq_it)
346+
void make_equal_length_segments(size_t const & overlap)
324347
{
325-
for (auto it = seq_it; it != sequences.end(); it++)
348+
for (auto it = sequences.begin(); it != sequences.end(); it++)
326349
{
327350
auto seq = *it;
328351
size_t start = 0;
@@ -371,28 +394,16 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
371394
std::to_string(arguments.pattern_size) + "bp.\nDecrease the overlap or the number of segments.");
372395
}
373396

374-
size_t len_lower_bound = default_seg_len / 10;
375-
// Check how many sequences are discarded for being too short
376-
//!TODO: gather short sequences
377-
auto first_long_seq = std::find_if(sequences.begin(), sequences.end(), [&](auto const & seq){return (seq.len > len_lower_bound);});
378-
size_t discarded_short_sequences = first_long_seq - sequences.begin();
379-
380-
for (auto it = sequences.begin(); it < first_long_seq; it++)
381-
{
382-
seqan3::debug_stream << "Sequence: " << (*it).id << " is too short and will be skipped.\n";
383-
total_len -= (*it).len;
384-
}
385-
386-
if (arguments.seg_count < (sequences.size() - discarded_short_sequences))
397+
if (arguments.seg_count < sequences.size())
387398
{
388399
throw std::runtime_error("Can not split " + std::to_string(sequences.size()) +
389400
" sequences into " + std::to_string(arguments.seg_count) + " segments.");
390401
}
391402

392403
if constexpr (std::is_same<arg_t, build_arguments>::value)
393-
make_exactly_n_segments(arguments.seg_count, arguments.pattern_size, first_long_seq);
404+
make_exactly_n_segments(arguments.seg_count, arguments.pattern_size);
394405
else
395-
make_equal_length_segments(arguments.pattern_size, first_long_seq);
406+
make_equal_length_segments(arguments.pattern_size);
396407

397408
std::stable_sort(sequences.begin(), sequences.end(), fasta_order());
398409
std::stable_sort(segments.begin(), segments.end(), fasta_order());
@@ -413,8 +424,7 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
413424
std::to_string(arguments.pattern_size) + "bp.\nDecrease the overlap or the number of segments.");
414425
}
415426

416-
auto seq_begin = sequences.begin();
417-
make_equal_length_segments(arguments.pattern_size, seq_begin);
427+
make_equal_length_segments(arguments.pattern_size);
418428
seg_count = segments.size();
419429

420430
std::stable_sort(sequences.begin(), sequences.end(), fasta_order());
@@ -561,7 +571,7 @@ SEQAN3_WORKAROUND_GCC_BOGUS_MEMCPY_STOP
561571
segment_stats seg = segments[seg_id];
562572
out_str << seg_id << '\t';
563573
for (size_t ind : seg.seq_vec)
564-
out_str << ind << '\t';
574+
out_str << ind << ';';
565575
out_str << seg.start << '\t' << seg.len << '\n';
566576
}
567577

test/api/valik/split/metadata_test.cpp

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ TEST_F(split_options, split_various_length)
9494

9595
try
9696
{
97-
for (uint32_t b : std::vector<uint32_t>{4, 16})
97+
for (uint32_t b : std::vector<uint32_t>{5, 16})
9898
{
9999
arguments.seg_count = b;
100100
auto expected_segments = valik::metadata(segment_metadata_path(arguments.pattern_size, arguments.seg_count));
@@ -119,18 +119,23 @@ TEST_F(split_options, split_few_long)
119119
arguments.manual_parameters = true;
120120
arguments.pattern_size = 20;
121121
arguments.bin_path.emplace_back(data("various_chromosome_lengths.fasta"));
122+
const uint8_t record_count{5};
122123

123124
try
124125
{
125-
for (uint32_t b : std::vector<uint32_t>{4, 12, 19})
126+
for (uint32_t b : std::vector<uint32_t>{5, 12, 19})
126127
{
127128
arguments.seg_count = b;
128129
valik::metadata meta(arguments);
129-
130-
if (arguments.seg_count > meta.seq_count) // one-to-many pairing of sequences and segments
130+
std::unordered_set<size_t> sequenes_ids{};
131+
for (auto & seg : meta.segments)
131132
{
132-
EXPECT_GE(0.2f, meta.segment_length_cv()); // create segments of roughly equal length
133+
for (auto & seq_id : seg.seq_vec)
134+
{
135+
sequenes_ids.insert(seq_id);
136+
}
133137
}
138+
EXPECT_EQ(sequenes_ids.size(), record_count); // all sequences should be used
134139
}
135140
}
136141
catch( const std::runtime_error& e )

test/data/split/api_test_output.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ set -Eeuo pipefail
77
seg_input="various_chromosome_lengths.fasta"
88
for p in 20
99
do
10-
for b in 4 16
10+
for b in 5 16
1111
do
1212
echo "Splitting the genome into $b segments that overlap by $p"
1313
seg_meta="multi/"$p"overlap"$b"bins.index"
0 Bytes
Binary file not shown.
0 Bytes
Binary file not shown.
101 KB
Binary file not shown.

0 commit comments

Comments
 (0)