Skip to content

Commit c1a6dff

Browse files
authored
Merge pull request #5 from eseiler/misc/contrib_minhash
Use alternative minimiser hash implementation
2 parents 2687872 + 477daa2 commit c1a6dff

File tree

5 files changed

+347
-73
lines changed

5 files changed

+347
-73
lines changed
Lines changed: 340 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,340 @@
1+
// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin
2+
// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik
3+
// SPDX-License-Identifier: BSD-3-Clause
4+
5+
#pragma once
6+
7+
#include <algorithm>
8+
#include <deque>
9+
10+
#include <seqan3/alphabet/nucleotide/dna4.hpp>
11+
12+
#include <hibf/contrib/std/detail/adaptor_from_functor.hpp>
13+
14+
// Same as seqan3::views::minimiser_hash, but optimized for dna4, with integrated adjust_seed
15+
16+
namespace contrib
17+
{
18+
19+
struct minimiser_hash_parameters
20+
{
21+
uint8_t kmer_size{};
22+
uint32_t window_size{};
23+
};
24+
25+
} // namespace contrib
26+
27+
namespace contrib::detail
28+
{
29+
30+
template <std::ranges::view range_t>
31+
requires std::ranges::input_range<range_t> && std::ranges::sized_range<range_t>
32+
class minimiser_hash : public std::ranges::view_interface<minimiser_hash<range_t>>
33+
{
34+
private:
35+
range_t range{};
36+
minimiser_hash_parameters params{};
37+
38+
template <bool range_is_const>
39+
class basic_iterator;
40+
41+
public:
42+
minimiser_hash()
43+
requires std::default_initializable<range_t>
44+
= default;
45+
minimiser_hash(minimiser_hash const & rhs) = default;
46+
minimiser_hash(minimiser_hash && rhs) = default;
47+
minimiser_hash & operator=(minimiser_hash const & rhs) = default;
48+
minimiser_hash & operator=(minimiser_hash && rhs) = default;
49+
~minimiser_hash() = default;
50+
51+
explicit minimiser_hash(range_t range, minimiser_hash_parameters params) :
52+
range{std::move(range)},
53+
params{std::move(params)}
54+
{}
55+
56+
basic_iterator<false> begin()
57+
{
58+
return {std::ranges::begin(range), std::ranges::size(range), params};
59+
}
60+
61+
basic_iterator<true> begin() const
62+
requires std::ranges::view<range_t const> && std::ranges::input_range<range_t const>
63+
{
64+
return {std::ranges::begin(range), std::ranges::size(range), params};
65+
}
66+
67+
auto end() noexcept
68+
{
69+
return std::default_sentinel;
70+
}
71+
72+
auto end() const noexcept
73+
requires std::ranges::view<range_t const> && std::ranges::input_range<range_t const>
74+
{
75+
return std::default_sentinel;
76+
}
77+
};
78+
79+
template <std::ranges::view range_t>
80+
requires std::ranges::input_range<range_t> && std::ranges::sized_range<range_t>
81+
template <bool range_is_const>
82+
class minimiser_hash<range_t>::basic_iterator
83+
{
84+
private:
85+
template <bool>
86+
friend class basic_iterator;
87+
88+
using maybe_const_range_t = std::conditional_t<range_is_const, range_t const, range_t>;
89+
using range_iterator_t = std::ranges::iterator_t<maybe_const_range_t>;
90+
91+
public:
92+
using difference_type = std::ranges::range_difference_t<maybe_const_range_t>;
93+
using value_type = uint64_t;
94+
using pointer = void;
95+
using reference = value_type;
96+
using iterator_category = std::conditional_t<std::ranges::forward_range<maybe_const_range_t>,
97+
std::forward_iterator_tag,
98+
std::input_iterator_tag>;
99+
using iterator_concept = iterator_category;
100+
101+
private:
102+
range_iterator_t range_it{};
103+
104+
uint64_t kmer_mask{};
105+
uint64_t seed{};
106+
107+
uint64_t kmer_value{};
108+
uint64_t kmer_value_rev{};
109+
size_t minimiser_position{};
110+
111+
size_t range_size{};
112+
size_t range_position{};
113+
114+
value_type minimiser_value{};
115+
116+
int kmer_rev_shift{};
117+
118+
std::deque<uint64_t> kmer_values_in_window{};
119+
120+
static inline constexpr uint64_t compute_mask(uint8_t const kmer_size)
121+
{
122+
assert(kmer_size > 0u);
123+
assert(kmer_size <= 32u);
124+
125+
if (kmer_size == 32u)
126+
return std::numeric_limits<uint64_t>::max();
127+
else
128+
return (uint64_t{1u} << (2u * kmer_size)) - 1u;
129+
}
130+
131+
static inline constexpr uint64_t compute_seed(uint8_t const kmer_size)
132+
{
133+
assert(kmer_size > 0u);
134+
assert(kmer_size <= 32u);
135+
136+
return uint64_t{0x8F3F73B5CF1C9ADEULL} >> (64u - 2u * kmer_size);
137+
}
138+
139+
public:
140+
basic_iterator() = default;
141+
basic_iterator(basic_iterator const &) = default;
142+
basic_iterator(basic_iterator &&) = default;
143+
basic_iterator & operator=(basic_iterator const &) = default;
144+
basic_iterator & operator=(basic_iterator &&) = default;
145+
~basic_iterator() = default;
146+
147+
basic_iterator(basic_iterator<!range_is_const> const & it)
148+
requires range_is_const
149+
:
150+
range_it{it.range_it},
151+
kmer_mask{it.kmer_mask},
152+
seed{it.seed},
153+
kmer_value{it.kmer_value},
154+
minimiser_position{it.minimiser_position},
155+
range_size{it.range_size},
156+
range_position{it.range_position},
157+
minimiser_value{it.minimiser_value},
158+
kmer_rev_shift{it.kmer_rev_shift},
159+
kmer_values_in_window{it.kmer_values_in_window}
160+
{}
161+
162+
basic_iterator(range_iterator_t range_iterator, size_t const range_size, minimiser_hash_parameters const & params) :
163+
range_it{std::move(range_iterator)},
164+
kmer_mask{compute_mask(params.kmer_size)},
165+
seed{compute_seed(params.kmer_size)},
166+
range_size{range_size},
167+
kmer_rev_shift{2 * static_cast<int>(params.kmer_size - 1)}
168+
{
169+
if (range_size < params.window_size)
170+
range_position = range_size;
171+
else
172+
init(params);
173+
}
174+
175+
friend bool operator==(basic_iterator const & lhs, basic_iterator const & rhs)
176+
{
177+
return lhs.range_it == rhs.range_it;
178+
}
179+
180+
friend bool operator==(basic_iterator const & lhs, std::default_sentinel_t const &)
181+
{
182+
return lhs.range_position == lhs.range_size;
183+
}
184+
185+
basic_iterator & operator++() noexcept
186+
{
187+
while (!next_minimiser_is_new())
188+
{}
189+
return *this;
190+
}
191+
192+
basic_iterator operator++(int) noexcept
193+
{
194+
basic_iterator tmp{*this};
195+
while (!next_minimiser_is_new())
196+
{}
197+
return tmp;
198+
}
199+
200+
value_type operator*() const noexcept
201+
{
202+
return minimiser_value;
203+
}
204+
205+
private:
206+
enum class pop_first : bool
207+
{
208+
no,
209+
yes
210+
};
211+
212+
void rolling_hash()
213+
requires std::same_as<std::ranges::range_value_t<range_t>, uint8_t>
214+
{
215+
uint64_t const new_rank = *range_it;
216+
217+
kmer_value <<= 2;
218+
kmer_value |= new_rank;
219+
kmer_value &= kmer_mask;
220+
221+
kmer_value_rev >>= 2;
222+
kmer_value_rev |= (new_rank ^ 0b11) << kmer_rev_shift;
223+
}
224+
225+
void rolling_hash()
226+
requires std::same_as<std::ranges::range_value_t<range_t>, seqan3::dna4>
227+
{
228+
uint64_t const new_rank = range_it->to_rank();
229+
230+
kmer_value <<= 2;
231+
kmer_value |= new_rank;
232+
kmer_value &= kmer_mask;
233+
234+
kmer_value_rev >>= 2;
235+
kmer_value_rev |= (new_rank ^ 0b11) << kmer_rev_shift;
236+
}
237+
238+
template <pop_first pop>
239+
void next_window()
240+
{
241+
++range_position;
242+
++range_it;
243+
244+
rolling_hash();
245+
246+
if constexpr (pop == pop_first::yes)
247+
kmer_values_in_window.pop_front();
248+
249+
kmer_values_in_window.push_back(std::min<uint64_t>(kmer_value ^ seed, kmer_value_rev ^ seed));
250+
}
251+
252+
void find_minimiser_in_window()
253+
{
254+
auto minimiser_it = std::ranges::min_element(kmer_values_in_window, std::less_equal<uint64_t>{});
255+
minimiser_value = *minimiser_it;
256+
minimiser_position = std::distance(std::begin(kmer_values_in_window), minimiser_it);
257+
}
258+
259+
void init(minimiser_hash_parameters const & params)
260+
{
261+
// range_it is already at the beginning of the range
262+
rolling_hash();
263+
kmer_values_in_window.push_back(std::min<uint64_t>(kmer_value ^ seed, kmer_value_rev ^ seed));
264+
265+
// After this loop, `kmer_values_in_window` contains the first kmer value of the window.
266+
for (size_t i = 1u; i < params.kmer_size; ++i)
267+
next_window<pop_first::yes>();
268+
269+
// After this loop, `kmer_values_in_window` contains all kmer values of the window.
270+
for (size_t i = params.kmer_size; i < params.window_size; ++i)
271+
next_window<pop_first::no>();
272+
273+
find_minimiser_in_window();
274+
}
275+
276+
bool next_minimiser_is_new()
277+
{
278+
// If we reached the end of the range, we are done.
279+
if (range_position + 1 == range_size)
280+
return ++range_position; // Return true, but also increment range_position
281+
282+
next_window<pop_first::yes>();
283+
284+
// The minimiser left the window.
285+
if (minimiser_position == 0)
286+
{
287+
find_minimiser_in_window();
288+
return true;
289+
}
290+
291+
// Update minimiser if the new kmer value is smaller than the current minimiser.
292+
if (uint64_t new_kmer_value = kmer_values_in_window.back(); new_kmer_value < minimiser_value)
293+
{
294+
minimiser_value = new_kmer_value;
295+
minimiser_position = kmer_values_in_window.size() - 1u;
296+
return true;
297+
}
298+
299+
--minimiser_position;
300+
return false;
301+
}
302+
};
303+
304+
template <std::ranges::viewable_range rng_t>
305+
minimiser_hash(rng_t &&, minimiser_hash_parameters &&) -> minimiser_hash<std::views::all_t<rng_t>>;
306+
307+
struct minimiser_hash_fn
308+
{
309+
constexpr auto operator()(minimiser_hash_parameters params) const
310+
{
311+
return seqan::stl::detail::adaptor_from_functor{*this, std::move(params)};
312+
}
313+
314+
template <std::ranges::range range_t>
315+
constexpr auto operator()(range_t && range, minimiser_hash_parameters params) const
316+
{
317+
// static_assert(std::same_as<std::ranges::range_value_t<range_t>, seqan3::dna4>, "Only dna4 supported.");
318+
static_assert(std::ranges::sized_range<range_t>, "Input range must be a std::ranges::sized_range.");
319+
320+
if (params.kmer_size == 0u)
321+
throw std::invalid_argument{"kmer_size must be > 0."};
322+
if (params.kmer_size > 32u)
323+
throw std::invalid_argument{"kmer_size must be <= 32."};
324+
if (params.window_size == 0u)
325+
throw std::invalid_argument{"window_size must be > 0."};
326+
if (params.window_size < params.kmer_size)
327+
throw std::invalid_argument{"window_size must be >= kmer_size."};
328+
329+
return minimiser_hash{std::forward<range_t>(range), std::move(params)};
330+
}
331+
};
332+
333+
} // namespace contrib::detail
334+
335+
namespace contrib::views
336+
{
337+
338+
inline constexpr auto minimiser_hash = contrib::detail::minimiser_hash_fn{};
339+
340+
}

src/build/ibf.cpp

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,13 @@
33
// SPDX-License-Identifier: BSD-3-Clause
44

55
#include <seqan3/io/sequence_file/input.hpp>
6-
#include <seqan3/search/views/minimiser_hash.hpp>
76

87
#include <hibf/config.hpp>
98
#include <hibf/interleaved_bloom_filter.hpp>
109

1110
#include <fpgalign/build/build.hpp>
1211
#include <fpgalign/colored_strings.hpp>
12+
#include <fpgalign/contrib/minimiser_hash.hpp>
1313

1414
namespace build
1515
{
@@ -19,11 +19,6 @@ struct dna4_traits : seqan3::sequence_file_input_default_traits_dna
1919
using sequence_alphabet = seqan3::dna4;
2020
};
2121

22-
static inline constexpr uint64_t adjust_seed(uint8_t const kmer_size) noexcept
23-
{
24-
return 0x8F3F73B5CF1C9ADEULL >> (64u - 2u * kmer_size);
25-
}
26-
2722
void ibf(config const & config, meta & meta)
2823
{
2924
meta.kmer_size = config.kmer_size;
@@ -33,9 +28,8 @@ void ibf(config const & config, meta & meta)
3328
{
3429
using sequence_file_t = seqan3::sequence_file_input<dna4_traits, seqan3::fields<seqan3::field::seq>>;
3530

36-
auto minimiser_view = seqan3::views::minimiser_hash(seqan3::ungapped{config.kmer_size},
37-
seqan3::window_size{config.window_size},
38-
seqan3::seed{adjust_seed(config.kmer_size)});
31+
auto minimiser_view =
32+
contrib::views::minimiser_hash({.kmer_size = config.kmer_size, .window_size = config.window_size});
3933

4034
for (auto && bin_path : meta.bin_paths[user_bin_id])
4135
{

0 commit comments

Comments
 (0)