Skip to content

Commit 62110d5

Browse files
authored
Merge pull request #295 from eseiler/misc/collissions
[UTIL] hash collisions
2 parents 36c2235 + 93dfa42 commit 62110d5

File tree

3 files changed

+305
-7
lines changed

3 files changed

+305
-7
lines changed

src/interleaved_bloom_filter.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_
142142
void interleaved_bloom_filter::clear(bin_index const bin) noexcept
143143
{
144144
assert(bin.value < bins);
145+
occupancy[bin.value] = 0u;
145146
for (size_t idx = bin.value, i = 0; i < bin_size_; idx += technical_bins, ++i)
146147
(*this)[idx] = 0;
147148
}

util/CMakeLists.txt

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,12 @@ include (${HIBF_ROOT_DIR}/cmake/cxx_config.cmake)
1111
add_subdirectory ("${HIBF_ROOT_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/hibf_lib")
1212

1313
# Dependency: Sharg
14-
include (FetchContent)
15-
option (SHARG_NO_TDL "" ON)
16-
FetchContent_Declare (
17-
sharg
18-
GIT_REPOSITORY https://github.com/seqan/sharg-parser
19-
GIT_TAG main)
20-
FetchContent_MakeAvailable (sharg)
14+
CPMAddPackage (URI "gh:seqan/sharg-parser#main"
15+
OPTIONS "INSTALL_SHARG OFF" "INSTALL_TDL OFF" "CMAKE_MESSAGE_LOG_LEVEL WARNING" "BUILD_TESTING OFF"
16+
"SHARG_NO_TDL ON")
2117

2218
add_executable (fpr_correction_check fpr_correction_check.cpp)
2319
target_link_libraries (fpr_correction_check seqan::hibf sharg::sharg)
20+
21+
add_executable (hash_collisions hash_collisions.cpp)
22+
target_link_libraries (hash_collisions seqan::hibf sharg::sharg)

util/hash_collisions.cpp

Lines changed: 298 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,298 @@
1+
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
2+
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
3+
// SPDX-License-Identifier: BSD-3-Clause
4+
5+
#include <iostream>
6+
#include <random>
7+
8+
#include <sharg/parser.hpp>
9+
10+
#include <hibf/build/bin_size_in_bits.hpp>
11+
#include <hibf/contrib/robin_hood.hpp>
12+
#include <hibf/interleaved_bloom_filter.hpp>
13+
#include <hibf/layout/compute_fpr_correction.hpp>
14+
#include <hibf/misc/divide_and_ceil.hpp>
15+
16+
struct config
17+
{
18+
// Set by user
19+
size_t kmer_size{12};
20+
size_t elements{130};
21+
size_t hash{2};
22+
size_t repetitions{1};
23+
double fpr{0.05};
24+
25+
// Internal
26+
size_t kmer_max{};
27+
};
28+
29+
class positive_integer_validator
30+
{
31+
public:
32+
using option_value_type = size_t;
33+
34+
positive_integer_validator() = default;
35+
positive_integer_validator(positive_integer_validator const &) = default;
36+
positive_integer_validator & operator=(positive_integer_validator const &) = default;
37+
positive_integer_validator(positive_integer_validator &&) = default;
38+
positive_integer_validator & operator=(positive_integer_validator &&) = default;
39+
~positive_integer_validator() = default;
40+
41+
explicit positive_integer_validator(bool const is_zero_positive_) : is_zero_positive{is_zero_positive_}
42+
{}
43+
44+
void operator()(option_value_type const & val) const
45+
{
46+
if (!is_zero_positive && !val)
47+
throw sharg::validation_error{"The value must be a positive integer."};
48+
}
49+
50+
std::string get_help_page_message() const
51+
{
52+
if (is_zero_positive)
53+
return "Value must be a positive integer or 0.";
54+
else
55+
return "Value must be a positive integer.";
56+
}
57+
58+
private:
59+
bool is_zero_positive{false};
60+
};
61+
62+
void init_parser(sharg::parser & parser, config & cfg)
63+
{
64+
parser.add_option(cfg.kmer_size,
65+
sharg::config{.short_id = '\0',
66+
.long_id = "kmer",
67+
.description = "The k-mer size.",
68+
.validator = sharg::arithmetic_range_validator{1, 32}});
69+
parser.add_option(cfg.elements,
70+
sharg::config{.short_id = '\0',
71+
.long_id = "elements",
72+
.description = "Number of elements to insert.",
73+
.validator = positive_integer_validator{}});
74+
parser.add_option(cfg.hash,
75+
sharg::config{.short_id = '\0',
76+
.long_id = "hash",
77+
.description = "The number of hash functions to use.",
78+
.validator = sharg::arithmetic_range_validator{1, 5}});
79+
parser.add_option(cfg.repetitions,
80+
sharg::config{.short_id = '\0',
81+
.long_id = "repeats",
82+
.description = "Number of repetitions.",
83+
.validator = positive_integer_validator{}});
84+
parser.add_option(cfg.fpr,
85+
sharg::config{.short_id = '\0',
86+
.long_id = "fpr",
87+
.description = "The desired false positive rate.",
88+
.validator = sharg::arithmetic_range_validator{0.0, 1.0}});
89+
}
90+
91+
double false_positive_rate(size_t const elements, size_t const hash_count, size_t const bin_size)
92+
{
93+
double const exp_arg = (hash_count * elements) / static_cast<double>(bin_size);
94+
double const log_arg = 1.0 - std::exp(-exp_arg);
95+
return std::exp(hash_count * std::log(log_arg));
96+
}
97+
98+
struct stats
99+
{
100+
size_t min{};
101+
size_t max{};
102+
double mean{};
103+
double variance{};
104+
};
105+
106+
stats get_stats(std::vector<size_t> const & values)
107+
{
108+
size_t min = std::ranges::min(values);
109+
size_t max = std::ranges::max(values);
110+
111+
size_t number_of_values = values.size();
112+
double mean = std::reduce(values.begin(), values.end()) / static_cast<double>(number_of_values);
113+
double variance = [&values, mean, number_of_values]()
114+
{
115+
if (number_of_values == 1u)
116+
return std::numeric_limits<double>::quiet_NaN();
117+
118+
auto helper = [mean, number_of_values](double current, double const value)
119+
{
120+
return current + ((value - mean) * (value - mean) / (number_of_values - 1));
121+
};
122+
123+
return std::reduce(values.begin(), values.end(), 0.0, helper);
124+
}();
125+
126+
return stats{.min = min, .max = max, .mean = mean, .variance = variance};
127+
}
128+
129+
stats get_stats(std::vector<size_t> const & values_, size_t const elements)
130+
{
131+
std::vector<size_t> values;
132+
values.reserve(values_.size());
133+
std::ranges::transform(values_,
134+
std::back_inserter(values),
135+
[elements](size_t const val)
136+
{
137+
return elements - val;
138+
});
139+
140+
return get_stats(values);
141+
}
142+
143+
void run(config const & cfg)
144+
{
145+
size_t const bin_size = seqan::hibf::build::bin_size_in_bits({.fpr = cfg.fpr, //
146+
.hash_count = cfg.hash,
147+
.elements = cfg.elements});
148+
149+
seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{1u},
150+
seqan::hibf::bin_size{bin_size},
151+
seqan::hibf::hash_function_count{cfg.hash},
152+
true};
153+
154+
std::vector<size_t> occupancies{};
155+
occupancies.reserve(cfg.repetitions);
156+
157+
// Simulation
158+
{
159+
auto agent = ibf.containment_agent();
160+
161+
robin_hood::unordered_set<uint64_t> inserted_values{};
162+
std::random_device rd;
163+
std::mt19937_64 gen(rd());
164+
std::uniform_int_distribution<uint64_t> distrib(0ULL, cfg.kmer_max);
165+
166+
for (size_t i{}; i < cfg.repetitions; ++i)
167+
{
168+
for (; inserted_values.size() < cfg.elements;)
169+
inserted_values.emplace(distrib(gen));
170+
171+
for (uint64_t const value : inserted_values)
172+
ibf.emplace(value, seqan::hibf::bin_index{0u});
173+
174+
occupancies.push_back(ibf.occupancy[0]);
175+
inserted_values.clear();
176+
ibf.clear(seqan::hibf::bin_index{0u});
177+
}
178+
}
179+
180+
stats const occupancy_stats = get_stats(occupancies);
181+
stats const collision_stats = get_stats(occupancies, cfg.elements);
182+
183+
// I don't know why this works reasonably well, but it does.
184+
// Removing `/ std::sqrt(cfg.hash)` and using `1` hash is a known computation for the expected number of
185+
// hash collisions.
186+
// Hence, it is exact for 1 hash, but only a rough estimate for more than 1 hash.
187+
double const rough_estimate = [&]()
188+
{
189+
double const bin_size_1_hash = seqan::hibf::build::bin_size_in_bits({.fpr = cfg.fpr, //
190+
.hash_count = 1u,
191+
.elements = cfg.elements});
192+
193+
double estimate_1 = std::exp(cfg.elements * std::log((bin_size_1_hash - 1.0) / bin_size_1_hash));
194+
double estimate_2 = bin_size_1_hash * (1.0 - estimate_1);
195+
return (cfg.elements - estimate_2) / std::sqrt(cfg.hash);
196+
}();
197+
198+
// This one does make sense, but is rather expensive to compute.
199+
double const exact_estimate = [&]()
200+
{
201+
double estimate = 0.0;
202+
for (size_t i = 1; i <= cfg.elements; ++i)
203+
estimate += false_positive_rate(i - 1, cfg.hash, bin_size);
204+
return estimate;
205+
}();
206+
207+
std::stringstream result{};
208+
result.setf(std::ios::left, std::ios::adjustfield);
209+
210+
result.width(20);
211+
result << "";
212+
result.width(11);
213+
result << "min";
214+
result.width(11);
215+
result << "max";
216+
result.width(11);
217+
result << "mean";
218+
result.width(11);
219+
result << "variance";
220+
result << '\n';
221+
222+
result.width(20);
223+
result << "occupancy";
224+
result.width(11);
225+
result << occupancy_stats.min;
226+
result.width(11);
227+
result << occupancy_stats.max;
228+
result.width(11);
229+
result << occupancy_stats.mean;
230+
result.width(11);
231+
result << occupancy_stats.variance;
232+
result << '\n';
233+
234+
result.width(20);
235+
result << "collisions";
236+
result.width(11);
237+
result << collision_stats.min;
238+
result.width(11);
239+
result << collision_stats.max;
240+
result.width(11);
241+
result << collision_stats.mean;
242+
result.width(11);
243+
result << collision_stats.variance;
244+
result << '\n';
245+
246+
result.width(20);
247+
result << "exact estimate";
248+
result.width(11);
249+
result << "";
250+
result.width(11);
251+
result << "";
252+
result.width(11);
253+
result << exact_estimate;
254+
result.width(11);
255+
result << "";
256+
result << '\n';
257+
258+
result.width(20);
259+
result << "rough estimate";
260+
result.width(11);
261+
result << "";
262+
result.width(11);
263+
result << "";
264+
result.width(11);
265+
result << rough_estimate;
266+
result.width(11);
267+
result << "";
268+
result << '\n';
269+
270+
std::cout << result.str();
271+
}
272+
273+
int main(int argc, char ** argv)
274+
{
275+
sharg::parser parser{"hash_collisions", argc, argv, sharg::update_notifications::off};
276+
parser.info.author = "Enrico Seiler";
277+
parser.info.short_copyright = "BSD 3-Clause License";
278+
parser.info.short_description = "Determines the number of hash collisions in the IBF.";
279+
config cfg{};
280+
init_parser(parser, cfg);
281+
parser.parse();
282+
283+
cfg.kmer_max = cfg.kmer_size == 32 ? std::numeric_limits<uint64_t>::max() : (1ULL << (2 * cfg.kmer_size)) - 1u;
284+
285+
if (cfg.elements > cfg.kmer_max && cfg.elements != cfg.kmer_max + 1u)
286+
{
287+
std::cout << "[WARNING] Inserting more elements than there are possible k-mers. "
288+
<< "Setting number of elements to number of possible k-mers.\n\n";
289+
cfg.elements = cfg.kmer_max + 1u;
290+
}
291+
292+
std::cout << "kmer: " << cfg.kmer_size << '\n';
293+
std::cout << "elements: " << cfg.elements << '\n';
294+
std::cout << "hash: " << cfg.hash << '\n';
295+
std::cout << "fpr: " << cfg.fpr << '\n';
296+
std::cout << "repetitions: " << cfg.repetitions << "\n\n";
297+
run(cfg);
298+
}

0 commit comments

Comments
 (0)