-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathminhashes.cpp
More file actions
68 lines (55 loc) · 2.32 KB
/
minhashes.cpp
File metadata and controls
68 lines (55 loc) · 2.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause
#include <algorithm> // for __fn, is_sorted, all_of, find, pop_heap, push_heap
#include <cassert> // for assert
#include <cstdint> // for uint64_t
#include <span> // for span
#include <vector> // for vector
#include <hibf/sketch/minhashes.hpp> // for minhashes
namespace seqan::hibf::sketch
{
minhashes::minhashes(std::vector<uint64_t> const & smallest_values)
{
assert(std::ranges::is_sorted(smallest_values));
table.resize(num_sketches);
for (auto & elem : table)
elem.reserve(sketch_size);
for (uint64_t const hash : smallest_values)
{
auto & hash_table = table[hash & register_id_mask];
if (hash_table.size() < sketch_size)
hash_table.push_back(hash >> 4);
}
}
bool minhashes::is_valid() const
{
return table.size() == num_sketches
&& std::ranges::all_of(table,
[](auto const & minHash_sketch)
{
return minHash_sketch.size() == sketch_size;
});
}
void minhashes::fill_incomplete_sketches(std::span<uint64_t> const & more_smallest_values)
{
assert(std::ranges::is_sorted(more_smallest_values));
for (uint64_t const hash : more_smallest_values)
{
auto & hash_table = table[hash & register_id_mask];
assert(std::ranges::find(hash_table, hash) == hash_table.end()); // hashes should be unique
if (hash_table.size() < sketch_size)
hash_table.push_back(hash >> 4);
}
}
void minhashes::push_to_heap_if_smaller(uint64_t const value, std::vector<uint64_t> & heap)
{
// Do nothing if value is bigger than the current biggest element in the (max) heap.
if (value >= heap[0])
return;
// we do not need a guard (hash table) to check for duplications because `kmers` is already a set
std::ranges::pop_heap(heap); // max elements move to end of vector
heap.back() = value; // replace last elements instead of physically popping and pushing
std::ranges::push_heap(heap); // last elements is rearranged in the heap to be pushed
}
} // namespace seqan::hibf::sketch