From 477daa28cb020f3e8e862889b8098f45c62a8dfe Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Tue, 12 Aug 2025 18:10:03 +0200 Subject: [PATCH] Use alternative minimiser hash implementation --- include/fpgalign/contrib/minimiser_hash.hpp | 340 ++++++++++++++++++++ src/build/ibf.cpp | 12 +- src/search/fmindex.cpp | 39 --- src/search/ibf.cpp | 13 +- src/search/search.cpp | 16 - 5 files changed, 347 insertions(+), 73 deletions(-) create mode 100644 include/fpgalign/contrib/minimiser_hash.hpp diff --git a/include/fpgalign/contrib/minimiser_hash.hpp b/include/fpgalign/contrib/minimiser_hash.hpp new file mode 100644 index 0000000..3409d11 --- /dev/null +++ b/include/fpgalign/contrib/minimiser_hash.hpp @@ -0,0 +1,340 @@ +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include + +#include + +#include + +// Same as seqan3::views::minimiser_hash, but optimized for dna4, with integrated adjust_seed + +namespace contrib +{ + +struct minimiser_hash_parameters +{ + uint8_t kmer_size{}; + uint32_t window_size{}; +}; + +} // namespace contrib + +namespace contrib::detail +{ + +template + requires std::ranges::input_range && std::ranges::sized_range +class minimiser_hash : public std::ranges::view_interface> +{ +private: + range_t range{}; + minimiser_hash_parameters params{}; + + template + class basic_iterator; + +public: + minimiser_hash() + requires std::default_initializable + = default; + minimiser_hash(minimiser_hash const & rhs) = default; + minimiser_hash(minimiser_hash && rhs) = default; + minimiser_hash & operator=(minimiser_hash const & rhs) = default; + minimiser_hash & operator=(minimiser_hash && rhs) = default; + ~minimiser_hash() = default; + + explicit minimiser_hash(range_t range, minimiser_hash_parameters params) : + range{std::move(range)}, + params{std::move(params)} + {} + + basic_iterator begin() + { + return {std::ranges::begin(range), std::ranges::size(range), params}; + } + + basic_iterator begin() const + requires std::ranges::view && std::ranges::input_range + { + return {std::ranges::begin(range), std::ranges::size(range), params}; + } + + auto end() noexcept + { + return std::default_sentinel; + } + + auto end() const noexcept + requires std::ranges::view && std::ranges::input_range + { + return std::default_sentinel; + } +}; + +template + requires std::ranges::input_range && std::ranges::sized_range +template +class minimiser_hash::basic_iterator +{ +private: + template + friend class basic_iterator; + + using maybe_const_range_t = std::conditional_t; + using range_iterator_t = std::ranges::iterator_t; + +public: + using difference_type = std::ranges::range_difference_t; + using value_type = uint64_t; + using pointer = void; + using reference = value_type; + using iterator_category = std::conditional_t, + std::forward_iterator_tag, + std::input_iterator_tag>; + using iterator_concept = iterator_category; + +private: + range_iterator_t range_it{}; + + uint64_t kmer_mask{}; + uint64_t seed{}; + + uint64_t kmer_value{}; + uint64_t kmer_value_rev{}; + size_t minimiser_position{}; + + size_t range_size{}; + size_t range_position{}; + + value_type minimiser_value{}; + + int kmer_rev_shift{}; + + std::deque kmer_values_in_window{}; + + static inline constexpr uint64_t compute_mask(uint8_t const kmer_size) + { + assert(kmer_size > 0u); + assert(kmer_size <= 32u); + + if (kmer_size == 32u) + return std::numeric_limits::max(); + else + return (uint64_t{1u} << (2u * kmer_size)) - 1u; + } + + static inline constexpr uint64_t compute_seed(uint8_t const kmer_size) + { + assert(kmer_size > 0u); + assert(kmer_size <= 32u); + + return uint64_t{0x8F3F73B5CF1C9ADEULL} >> (64u - 2u * kmer_size); + } + +public: + basic_iterator() = default; + basic_iterator(basic_iterator const &) = default; + basic_iterator(basic_iterator &&) = default; + basic_iterator & operator=(basic_iterator const &) = default; + basic_iterator & operator=(basic_iterator &&) = default; + ~basic_iterator() = default; + + basic_iterator(basic_iterator const & it) + requires range_is_const + : + range_it{it.range_it}, + kmer_mask{it.kmer_mask}, + seed{it.seed}, + kmer_value{it.kmer_value}, + minimiser_position{it.minimiser_position}, + range_size{it.range_size}, + range_position{it.range_position}, + minimiser_value{it.minimiser_value}, + kmer_rev_shift{it.kmer_rev_shift}, + kmer_values_in_window{it.kmer_values_in_window} + {} + + basic_iterator(range_iterator_t range_iterator, size_t const range_size, minimiser_hash_parameters const & params) : + range_it{std::move(range_iterator)}, + kmer_mask{compute_mask(params.kmer_size)}, + seed{compute_seed(params.kmer_size)}, + range_size{range_size}, + kmer_rev_shift{2 * static_cast(params.kmer_size - 1)} + { + if (range_size < params.window_size) + range_position = range_size; + else + init(params); + } + + friend bool operator==(basic_iterator const & lhs, basic_iterator const & rhs) + { + return lhs.range_it == rhs.range_it; + } + + friend bool operator==(basic_iterator const & lhs, std::default_sentinel_t const &) + { + return lhs.range_position == lhs.range_size; + } + + basic_iterator & operator++() noexcept + { + while (!next_minimiser_is_new()) + {} + return *this; + } + + basic_iterator operator++(int) noexcept + { + basic_iterator tmp{*this}; + while (!next_minimiser_is_new()) + {} + return tmp; + } + + value_type operator*() const noexcept + { + return minimiser_value; + } + +private: + enum class pop_first : bool + { + no, + yes + }; + + void rolling_hash() + requires std::same_as, uint8_t> + { + uint64_t const new_rank = *range_it; + + kmer_value <<= 2; + kmer_value |= new_rank; + kmer_value &= kmer_mask; + + kmer_value_rev >>= 2; + kmer_value_rev |= (new_rank ^ 0b11) << kmer_rev_shift; + } + + void rolling_hash() + requires std::same_as, seqan3::dna4> + { + uint64_t const new_rank = range_it->to_rank(); + + kmer_value <<= 2; + kmer_value |= new_rank; + kmer_value &= kmer_mask; + + kmer_value_rev >>= 2; + kmer_value_rev |= (new_rank ^ 0b11) << kmer_rev_shift; + } + + template + void next_window() + { + ++range_position; + ++range_it; + + rolling_hash(); + + if constexpr (pop == pop_first::yes) + kmer_values_in_window.pop_front(); + + kmer_values_in_window.push_back(std::min(kmer_value ^ seed, kmer_value_rev ^ seed)); + } + + void find_minimiser_in_window() + { + auto minimiser_it = std::ranges::min_element(kmer_values_in_window, std::less_equal{}); + minimiser_value = *minimiser_it; + minimiser_position = std::distance(std::begin(kmer_values_in_window), minimiser_it); + } + + void init(minimiser_hash_parameters const & params) + { + // range_it is already at the beginning of the range + rolling_hash(); + kmer_values_in_window.push_back(std::min(kmer_value ^ seed, kmer_value_rev ^ seed)); + + // After this loop, `kmer_values_in_window` contains the first kmer value of the window. + for (size_t i = 1u; i < params.kmer_size; ++i) + next_window(); + + // After this loop, `kmer_values_in_window` contains all kmer values of the window. + for (size_t i = params.kmer_size; i < params.window_size; ++i) + next_window(); + + find_minimiser_in_window(); + } + + bool next_minimiser_is_new() + { + // If we reached the end of the range, we are done. + if (range_position + 1 == range_size) + return ++range_position; // Return true, but also increment range_position + + next_window(); + + // The minimiser left the window. + if (minimiser_position == 0) + { + find_minimiser_in_window(); + return true; + } + + // Update minimiser if the new kmer value is smaller than the current minimiser. + if (uint64_t new_kmer_value = kmer_values_in_window.back(); new_kmer_value < minimiser_value) + { + minimiser_value = new_kmer_value; + minimiser_position = kmer_values_in_window.size() - 1u; + return true; + } + + --minimiser_position; + return false; + } +}; + +template +minimiser_hash(rng_t &&, minimiser_hash_parameters &&) -> minimiser_hash>; + +struct minimiser_hash_fn +{ + constexpr auto operator()(minimiser_hash_parameters params) const + { + return seqan::stl::detail::adaptor_from_functor{*this, std::move(params)}; + } + + template + constexpr auto operator()(range_t && range, minimiser_hash_parameters params) const + { + // static_assert(std::same_as, seqan3::dna4>, "Only dna4 supported."); + static_assert(std::ranges::sized_range, "Input range must be a std::ranges::sized_range."); + + if (params.kmer_size == 0u) + throw std::invalid_argument{"kmer_size must be > 0."}; + if (params.kmer_size > 32u) + throw std::invalid_argument{"kmer_size must be <= 32."}; + if (params.window_size == 0u) + throw std::invalid_argument{"window_size must be > 0."}; + if (params.window_size < params.kmer_size) + throw std::invalid_argument{"window_size must be >= kmer_size."}; + + return minimiser_hash{std::forward(range), std::move(params)}; + } +}; + +} // namespace contrib::detail + +namespace contrib::views +{ + +inline constexpr auto minimiser_hash = contrib::detail::minimiser_hash_fn{}; + +} diff --git a/src/build/ibf.cpp b/src/build/ibf.cpp index 569ad8c..5ce357b 100644 --- a/src/build/ibf.cpp +++ b/src/build/ibf.cpp @@ -3,13 +3,13 @@ // SPDX-License-Identifier: BSD-3-Clause #include -#include #include #include #include #include +#include namespace build { @@ -19,11 +19,6 @@ struct dna4_traits : seqan3::sequence_file_input_default_traits_dna using sequence_alphabet = seqan3::dna4; }; -static inline constexpr uint64_t adjust_seed(uint8_t const kmer_size) noexcept -{ - return 0x8F3F73B5CF1C9ADEULL >> (64u - 2u * kmer_size); -} - void ibf(config const & config, meta & meta) { meta.kmer_size = config.kmer_size; @@ -33,9 +28,8 @@ void ibf(config const & config, meta & meta) { using sequence_file_t = seqan3::sequence_file_input>; - auto minimiser_view = seqan3::views::minimiser_hash(seqan3::ungapped{config.kmer_size}, - seqan3::window_size{config.window_size}, - seqan3::seed{adjust_seed(config.kmer_size)}); + auto minimiser_view = + contrib::views::minimiser_hash({.kmer_size = config.kmer_size, .window_size = config.window_size}); for (auto && bin_path : meta.bin_paths[user_bin_id]) { diff --git a/src/search/fmindex.cpp b/src/search/fmindex.cpp index 60000b7..f375fd1 100644 --- a/src/search/fmindex.cpp +++ b/src/search/fmindex.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: BSD-3-Clause #include -#include #include @@ -18,22 +17,6 @@ namespace search { -// template -// auto myReconstruct(Index const & index, size_t seqNbr) -> std::vector -// { -// //TODO: möglicher weise ist das identisch zu einfach index.C[1] -// auto totalNumberOfSeq = index.bwt.rank(index.size(), 0) + index.C[0]; -// for (size_t i{0}; i < totalNumberOfSeq; ++i) -// { -// auto idx = std::get<0>(std::get<0>(index.locate(i))); -// if (idx == seqNbr) -// { -// return reconstructText(index, i); -// } -// } -// throw std::runtime_error{"unknown sequence number"}; -// } - fmc::BiFMIndex<5> load_index(config const & config, size_t const id) { fmc::BiFMIndex<5> index{}; @@ -44,33 +27,11 @@ fmc::BiFMIndex<5> load_index(config const & config, size_t const id) iarchive(index); } - // { - // fmt::println(" === Reconstruct all ==="); - // auto text = fmc::reconstructText(index); - // for (size_t i = 0; i < text.size(); ++i) - // fmt::print("### {} ###\n{}\n", i, fmt::join(text[i], "")); - // } - // { - // fmt::println(" === Reconstruct single ==="); - // auto text = fmc::reconstructText(index, 0); - // fmt::print("### 0 ###\n{}\n", fmt::join(text, "")); - // text = fmc::reconstructText(index, 1); - // fmt::print("### 1 ###\n{}\n", fmt::join(text, "")); - // } - // { - // fmt::println(" === Reconstruct alternative ==="); - // auto text = myReconstruct(index, 0); - // fmt::print("### 0 ###\n{}\n", fmt::join(text, "")); - // text = myReconstruct(index, 1); - // fmt::print("### 1 ###\n{}\n", fmt::join(text, "")); - // } - return index; } std::vector fmindex(config const & config, meta & meta, std::vector hits) { - // todo bin count // todo capacity // each slot = 1 bin // a cart is full if it has 5 elements (hits) diff --git a/src/search/ibf.cpp b/src/search/ibf.cpp index a99d7be..4536d82 100644 --- a/src/search/ibf.cpp +++ b/src/search/ibf.cpp @@ -5,11 +5,11 @@ #include #include -#include #include #include +#include #include #include @@ -21,11 +21,6 @@ struct dna4_traits : seqan3::sequence_file_input_default_traits_dna using sequence_alphabet = seqan3::dna4; }; -static inline constexpr uint64_t adjust_seed(uint8_t const kmer_size) noexcept -{ - return 0x8F3F73B5CF1C9ADEULL >> (64u - 2u * kmer_size); -} - threshold::threshold get_thresholder(config const & config, meta const & meta) { size_t const first_sequence_size = [&]() @@ -71,9 +66,9 @@ std::vector ibf(config const & config, meta & meta) { auto agent = ibf.membership_agent(); threshold::threshold const thresholder = get_thresholder(config, meta); - auto minimiser_view = seqan3::views::minimiser_hash(seqan3::ungapped{meta.kmer_size}, - seqan3::window_size{meta.window_size}, - seqan3::seed{adjust_seed(meta.kmer_size)}); + auto minimiser_view = + contrib::views::minimiser_hash({.kmer_size = meta.kmer_size, .window_size = meta.window_size}); + std::vector hashes; #pragma omp for diff --git a/src/search/search.cpp b/src/search/search.cpp index bdde0e3..a7252f1 100644 --- a/src/search/search.cpp +++ b/src/search/search.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: BSD-3-Clause #include -#include #include #include @@ -14,17 +13,6 @@ namespace search { - -// auto format_as(wip_alignment wip) -// { -// return fmt::format("(bin = {}, seqNo = {}, pos = {}, seq = {:d}, ref = {:d})", -// wip.bin, -// wip.sequence_number, -// wip.position, -// fmt::join(wip.seq, ""), -// fmt::join(wip.ref, "")); -// } - void do_alignment(config const & config, meta & meta, std::vector const & wips) { seqan3::sam_file_output sam_out{config.output_path, @@ -100,10 +88,6 @@ void search(config const & config) std::vector hits = ibf(config, meta); auto const res = fmindex(config, meta, std::move(hits)); do_alignment(config, meta, res); - // for (auto const & elem : res) - // { - // fmt::print("{}\n", elem); - // } } } // namespace search