|
| 1 | +// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin |
| 2 | +// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik |
| 3 | +// SPDX-License-Identifier: BSD-3-Clause |
| 4 | + |
| 5 | +#pragma once |
| 6 | + |
| 7 | +#include <algorithm> |
| 8 | +#include <deque> |
| 9 | + |
| 10 | +#include <seqan3/alphabet/nucleotide/dna4.hpp> |
| 11 | + |
| 12 | +#include <hibf/contrib/std/detail/adaptor_from_functor.hpp> |
| 13 | + |
| 14 | +// Same as seqan3::views::minimiser_hash, but optimized for dna4, with integrated adjust_seed |
| 15 | + |
| 16 | +namespace contrib |
| 17 | +{ |
| 18 | + |
| 19 | +struct minimiser_hash_parameters |
| 20 | +{ |
| 21 | + uint8_t kmer_size{}; |
| 22 | + uint32_t window_size{}; |
| 23 | +}; |
| 24 | + |
| 25 | +} // namespace contrib |
| 26 | + |
| 27 | +namespace contrib::detail |
| 28 | +{ |
| 29 | + |
| 30 | +template <std::ranges::view range_t> |
| 31 | + requires std::ranges::input_range<range_t> && std::ranges::sized_range<range_t> |
| 32 | +class minimiser_hash : public std::ranges::view_interface<minimiser_hash<range_t>> |
| 33 | +{ |
| 34 | +private: |
| 35 | + range_t range{}; |
| 36 | + minimiser_hash_parameters params{}; |
| 37 | + |
| 38 | + template <bool range_is_const> |
| 39 | + class basic_iterator; |
| 40 | + |
| 41 | +public: |
| 42 | + minimiser_hash() |
| 43 | + requires std::default_initializable<range_t> |
| 44 | + = default; |
| 45 | + minimiser_hash(minimiser_hash const & rhs) = default; |
| 46 | + minimiser_hash(minimiser_hash && rhs) = default; |
| 47 | + minimiser_hash & operator=(minimiser_hash const & rhs) = default; |
| 48 | + minimiser_hash & operator=(minimiser_hash && rhs) = default; |
| 49 | + ~minimiser_hash() = default; |
| 50 | + |
| 51 | + explicit minimiser_hash(range_t range, minimiser_hash_parameters params) : |
| 52 | + range{std::move(range)}, |
| 53 | + params{std::move(params)} |
| 54 | + {} |
| 55 | + |
| 56 | + basic_iterator<false> begin() |
| 57 | + { |
| 58 | + return {std::ranges::begin(range), std::ranges::size(range), params}; |
| 59 | + } |
| 60 | + |
| 61 | + basic_iterator<true> begin() const |
| 62 | + requires std::ranges::view<range_t const> && std::ranges::input_range<range_t const> |
| 63 | + { |
| 64 | + return {std::ranges::begin(range), std::ranges::size(range), params}; |
| 65 | + } |
| 66 | + |
| 67 | + auto end() noexcept |
| 68 | + { |
| 69 | + return std::default_sentinel; |
| 70 | + } |
| 71 | + |
| 72 | + auto end() const noexcept |
| 73 | + requires std::ranges::view<range_t const> && std::ranges::input_range<range_t const> |
| 74 | + { |
| 75 | + return std::default_sentinel; |
| 76 | + } |
| 77 | +}; |
| 78 | + |
| 79 | +template <std::ranges::view range_t> |
| 80 | + requires std::ranges::input_range<range_t> && std::ranges::sized_range<range_t> |
| 81 | +template <bool range_is_const> |
| 82 | +class minimiser_hash<range_t>::basic_iterator |
| 83 | +{ |
| 84 | +private: |
| 85 | + template <bool> |
| 86 | + friend class basic_iterator; |
| 87 | + |
| 88 | + using maybe_const_range_t = std::conditional_t<range_is_const, range_t const, range_t>; |
| 89 | + using range_iterator_t = std::ranges::iterator_t<maybe_const_range_t>; |
| 90 | + |
| 91 | +public: |
| 92 | + using difference_type = std::ranges::range_difference_t<maybe_const_range_t>; |
| 93 | + using value_type = uint64_t; |
| 94 | + using pointer = void; |
| 95 | + using reference = value_type; |
| 96 | + using iterator_category = std::conditional_t<std::ranges::forward_range<maybe_const_range_t>, |
| 97 | + std::forward_iterator_tag, |
| 98 | + std::input_iterator_tag>; |
| 99 | + using iterator_concept = iterator_category; |
| 100 | + |
| 101 | +private: |
| 102 | + range_iterator_t range_it{}; |
| 103 | + |
| 104 | + uint64_t kmer_mask{}; |
| 105 | + uint64_t seed{}; |
| 106 | + |
| 107 | + uint64_t kmer_value{}; |
| 108 | + uint64_t kmer_value_rev{}; |
| 109 | + size_t minimiser_position{}; |
| 110 | + |
| 111 | + size_t range_size{}; |
| 112 | + size_t range_position{}; |
| 113 | + |
| 114 | + value_type minimiser_value{}; |
| 115 | + |
| 116 | + int kmer_rev_shift{}; |
| 117 | + |
| 118 | + std::deque<uint64_t> kmer_values_in_window{}; |
| 119 | + |
| 120 | + static inline constexpr uint64_t compute_mask(uint8_t const kmer_size) |
| 121 | + { |
| 122 | + assert(kmer_size > 0u); |
| 123 | + assert(kmer_size <= 32u); |
| 124 | + |
| 125 | + if (kmer_size == 32u) |
| 126 | + return std::numeric_limits<uint64_t>::max(); |
| 127 | + else |
| 128 | + return (uint64_t{1u} << (2u * kmer_size)) - 1u; |
| 129 | + } |
| 130 | + |
| 131 | + static inline constexpr uint64_t compute_seed(uint8_t const kmer_size) |
| 132 | + { |
| 133 | + assert(kmer_size > 0u); |
| 134 | + assert(kmer_size <= 32u); |
| 135 | + |
| 136 | + return uint64_t{0x8F3F73B5CF1C9ADEULL} >> (64u - 2u * kmer_size); |
| 137 | + } |
| 138 | + |
| 139 | +public: |
| 140 | + basic_iterator() = default; |
| 141 | + basic_iterator(basic_iterator const &) = default; |
| 142 | + basic_iterator(basic_iterator &&) = default; |
| 143 | + basic_iterator & operator=(basic_iterator const &) = default; |
| 144 | + basic_iterator & operator=(basic_iterator &&) = default; |
| 145 | + ~basic_iterator() = default; |
| 146 | + |
| 147 | + basic_iterator(basic_iterator<!range_is_const> const & it) |
| 148 | + requires range_is_const |
| 149 | + : |
| 150 | + range_it{it.range_it}, |
| 151 | + kmer_mask{it.kmer_mask}, |
| 152 | + seed{it.seed}, |
| 153 | + kmer_value{it.kmer_value}, |
| 154 | + minimiser_position{it.minimiser_position}, |
| 155 | + range_size{it.range_size}, |
| 156 | + range_position{it.range_position}, |
| 157 | + minimiser_value{it.minimiser_value}, |
| 158 | + kmer_rev_shift{it.kmer_rev_shift}, |
| 159 | + kmer_values_in_window{it.kmer_values_in_window} |
| 160 | + {} |
| 161 | + |
| 162 | + basic_iterator(range_iterator_t range_iterator, size_t const range_size, minimiser_hash_parameters const & params) : |
| 163 | + range_it{std::move(range_iterator)}, |
| 164 | + kmer_mask{compute_mask(params.kmer_size)}, |
| 165 | + seed{compute_seed(params.kmer_size)}, |
| 166 | + range_size{range_size}, |
| 167 | + kmer_rev_shift{2 * static_cast<int>(params.kmer_size - 1)} |
| 168 | + { |
| 169 | + if (range_size < params.window_size) |
| 170 | + range_position = range_size; |
| 171 | + else |
| 172 | + init(params); |
| 173 | + } |
| 174 | + |
| 175 | + friend bool operator==(basic_iterator const & lhs, basic_iterator const & rhs) |
| 176 | + { |
| 177 | + return lhs.range_it == rhs.range_it; |
| 178 | + } |
| 179 | + |
| 180 | + friend bool operator==(basic_iterator const & lhs, std::default_sentinel_t const &) |
| 181 | + { |
| 182 | + return lhs.range_position == lhs.range_size; |
| 183 | + } |
| 184 | + |
| 185 | + basic_iterator & operator++() noexcept |
| 186 | + { |
| 187 | + while (!next_minimiser_is_new()) |
| 188 | + {} |
| 189 | + return *this; |
| 190 | + } |
| 191 | + |
| 192 | + basic_iterator operator++(int) noexcept |
| 193 | + { |
| 194 | + basic_iterator tmp{*this}; |
| 195 | + while (!next_minimiser_is_new()) |
| 196 | + {} |
| 197 | + return tmp; |
| 198 | + } |
| 199 | + |
| 200 | + value_type operator*() const noexcept |
| 201 | + { |
| 202 | + return minimiser_value; |
| 203 | + } |
| 204 | + |
| 205 | +private: |
| 206 | + enum class pop_first : bool |
| 207 | + { |
| 208 | + no, |
| 209 | + yes |
| 210 | + }; |
| 211 | + |
| 212 | + void rolling_hash() |
| 213 | + requires std::same_as<std::ranges::range_value_t<range_t>, uint8_t> |
| 214 | + { |
| 215 | + uint64_t const new_rank = *range_it; |
| 216 | + |
| 217 | + kmer_value <<= 2; |
| 218 | + kmer_value |= new_rank; |
| 219 | + kmer_value &= kmer_mask; |
| 220 | + |
| 221 | + kmer_value_rev >>= 2; |
| 222 | + kmer_value_rev |= (new_rank ^ 0b11) << kmer_rev_shift; |
| 223 | + } |
| 224 | + |
| 225 | + void rolling_hash() |
| 226 | + requires std::same_as<std::ranges::range_value_t<range_t>, seqan3::dna4> |
| 227 | + { |
| 228 | + uint64_t const new_rank = range_it->to_rank(); |
| 229 | + |
| 230 | + kmer_value <<= 2; |
| 231 | + kmer_value |= new_rank; |
| 232 | + kmer_value &= kmer_mask; |
| 233 | + |
| 234 | + kmer_value_rev >>= 2; |
| 235 | + kmer_value_rev |= (new_rank ^ 0b11) << kmer_rev_shift; |
| 236 | + } |
| 237 | + |
| 238 | + template <pop_first pop> |
| 239 | + void next_window() |
| 240 | + { |
| 241 | + ++range_position; |
| 242 | + ++range_it; |
| 243 | + |
| 244 | + rolling_hash(); |
| 245 | + |
| 246 | + if constexpr (pop == pop_first::yes) |
| 247 | + kmer_values_in_window.pop_front(); |
| 248 | + |
| 249 | + kmer_values_in_window.push_back(std::min<uint64_t>(kmer_value ^ seed, kmer_value_rev ^ seed)); |
| 250 | + } |
| 251 | + |
| 252 | + void find_minimiser_in_window() |
| 253 | + { |
| 254 | + auto minimiser_it = std::ranges::min_element(kmer_values_in_window, std::less_equal<uint64_t>{}); |
| 255 | + minimiser_value = *minimiser_it; |
| 256 | + minimiser_position = std::distance(std::begin(kmer_values_in_window), minimiser_it); |
| 257 | + } |
| 258 | + |
| 259 | + void init(minimiser_hash_parameters const & params) |
| 260 | + { |
| 261 | + // range_it is already at the beginning of the range |
| 262 | + rolling_hash(); |
| 263 | + kmer_values_in_window.push_back(std::min<uint64_t>(kmer_value ^ seed, kmer_value_rev ^ seed)); |
| 264 | + |
| 265 | + // After this loop, `kmer_values_in_window` contains the first kmer value of the window. |
| 266 | + for (size_t i = 1u; i < params.kmer_size; ++i) |
| 267 | + next_window<pop_first::yes>(); |
| 268 | + |
| 269 | + // After this loop, `kmer_values_in_window` contains all kmer values of the window. |
| 270 | + for (size_t i = params.kmer_size; i < params.window_size; ++i) |
| 271 | + next_window<pop_first::no>(); |
| 272 | + |
| 273 | + find_minimiser_in_window(); |
| 274 | + } |
| 275 | + |
| 276 | + bool next_minimiser_is_new() |
| 277 | + { |
| 278 | + // If we reached the end of the range, we are done. |
| 279 | + if (range_position + 1 == range_size) |
| 280 | + return ++range_position; // Return true, but also increment range_position |
| 281 | + |
| 282 | + next_window<pop_first::yes>(); |
| 283 | + |
| 284 | + // The minimiser left the window. |
| 285 | + if (minimiser_position == 0) |
| 286 | + { |
| 287 | + find_minimiser_in_window(); |
| 288 | + return true; |
| 289 | + } |
| 290 | + |
| 291 | + // Update minimiser if the new kmer value is smaller than the current minimiser. |
| 292 | + if (uint64_t new_kmer_value = kmer_values_in_window.back(); new_kmer_value < minimiser_value) |
| 293 | + { |
| 294 | + minimiser_value = new_kmer_value; |
| 295 | + minimiser_position = kmer_values_in_window.size() - 1u; |
| 296 | + return true; |
| 297 | + } |
| 298 | + |
| 299 | + --minimiser_position; |
| 300 | + return false; |
| 301 | + } |
| 302 | +}; |
| 303 | + |
| 304 | +template <std::ranges::viewable_range rng_t> |
| 305 | +minimiser_hash(rng_t &&, minimiser_hash_parameters &&) -> minimiser_hash<std::views::all_t<rng_t>>; |
| 306 | + |
| 307 | +struct minimiser_hash_fn |
| 308 | +{ |
| 309 | + constexpr auto operator()(minimiser_hash_parameters params) const |
| 310 | + { |
| 311 | + return seqan::stl::detail::adaptor_from_functor{*this, std::move(params)}; |
| 312 | + } |
| 313 | + |
| 314 | + template <std::ranges::range range_t> |
| 315 | + constexpr auto operator()(range_t && range, minimiser_hash_parameters params) const |
| 316 | + { |
| 317 | + // static_assert(std::same_as<std::ranges::range_value_t<range_t>, seqan3::dna4>, "Only dna4 supported."); |
| 318 | + static_assert(std::ranges::sized_range<range_t>, "Input range must be a std::ranges::sized_range."); |
| 319 | + |
| 320 | + if (params.kmer_size == 0u) |
| 321 | + throw std::invalid_argument{"kmer_size must be > 0."}; |
| 322 | + if (params.kmer_size > 32u) |
| 323 | + throw std::invalid_argument{"kmer_size must be <= 32."}; |
| 324 | + if (params.window_size == 0u) |
| 325 | + throw std::invalid_argument{"window_size must be > 0."}; |
| 326 | + if (params.window_size < params.kmer_size) |
| 327 | + throw std::invalid_argument{"window_size must be >= kmer_size."}; |
| 328 | + |
| 329 | + return minimiser_hash{std::forward<range_t>(range), std::move(params)}; |
| 330 | + } |
| 331 | +}; |
| 332 | + |
| 333 | +} // namespace contrib::detail |
| 334 | + |
| 335 | +namespace contrib::views |
| 336 | +{ |
| 337 | + |
| 338 | +inline constexpr auto minimiser_hash = contrib::detail::minimiser_hash_fn{}; |
| 339 | + |
| 340 | +} |
0 commit comments