Skip to content

Commit b1fc725

Browse files
koleoptileseiler
authored andcommitted
integrate hibf (doesn't work yet)
1 parent 44c7989 commit b1fc725

File tree

2 files changed

+51
-61
lines changed

2 files changed

+51
-61
lines changed

src/estimate.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
#include <cereal/archives/binary.hpp>
1010

11+
#include <hibf/hierarchical_interleaved_bloom_filter.hpp>
12+
1113
#include "misc/debug.hpp"
1214
#include "misc/filenames.hpp"
1315
#include "misc/read_levels.hpp"
@@ -44,9 +46,17 @@ void check_ibf(estimate_ibf_arguments const & args,
4446
log("Warning: 16-bit counter might overflow.");
4547
#endif
4648

47-
std::vector<float> counter(ibf.bin_count());
48-
auto agent = ibf.counting_agent();
49-
std::ranges::copy(agent.bulk_count(minimiser), counter.begin());
49+
std::vector<float> counter(num_levels * num_experiments, 0.0f);
50+
auto agent = ibf.membership_agent();
51+
for (auto const & mini : minimiser)
52+
{
53+
auto const & result = agent.membership_for(std::vector<uint64_t>{mini}, 1u);
54+
55+
for (size_t user_bin_id : result)
56+
{
57+
counter[user_bin_id] += 1.0f;
58+
}
59+
}
5060

5161
// Defines where the median should be
5262
float const minimiser_pos = minimiser_count / 2.0;
@@ -208,7 +218,7 @@ void estimate(estimate_ibf_arguments & args,
208218

209219
// Calculate dimensions based on IBF structure
210220
size_t const num_levels = args.number_expression_thresholds;
211-
size_t const total_bins = ibf.bin_count();
221+
size_t const total_bins = ibf.number_of_user_bins;
212222

213223
assert(total_bins % num_levels == 0); // Ensure that total_bins is divisible by num_levels
214224
size_t const num_experiments = total_bins / num_levels; // This should match the number of files/experiments
@@ -326,5 +336,5 @@ void call_estimate(estimate_ibf_arguments & args, estimate_arguments & estimate_
326336
}
327337
};
328338

329-
call.template operator()<seqan::hibf::interleaved_bloom_filter>({});
339+
call.template operator()<seqan::hibf::hierarchical_interleaved_bloom_filter>({});
330340
}

src/ibf.cpp

Lines changed: 36 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
#include <cereal/archives/binary.hpp>
1010

1111
#include <hibf/build/bin_size_in_bits.hpp>
12+
#include <hibf/config.hpp>
13+
#include <hibf/hierarchical_interleaved_bloom_filter.hpp>
1214

1315
#include "misc/calculate_cutoff.hpp"
1416
#include "misc/check_cutoffs_samples.hpp"
@@ -141,7 +143,6 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
141143
}();
142144

143145
std::vector<std::vector<uint64_t>> sizes(num_files);
144-
std::vector<uint64_t> sizes_ibf{};
145146
std::vector<std::vector<uint64_t>> counts_per_level(num_files,
146147
std::vector<uint64_t>(ibf_args.number_expression_thresholds));
147148

@@ -170,7 +171,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
170171
omp_set_num_threads(ibf_args.threads);
171172
}
172173

173-
// size_t const chunk_size = std::clamp<size_t>(std::bit_ceil(num_files / ibf_args.threads), 8u, 64u);
174+
size_t const chunk_size = std::clamp<size_t>(std::bit_ceil(num_files / ibf_args.threads), 8u, 64u);
174175

175176
// If expression_thresholds should only be depending on minimsers in a certain genome file, genome is created.
176177
robin_hood::unordered_set<uint64_t> genome{};
@@ -249,52 +250,10 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
249250
}
250251
}
251252

252-
seqan::hibf::interleaved_bloom_filter single_ibf = [&]()
253-
{
254-
size_t const total_bins = num_files * ibf_args.number_expression_thresholds;
255-
256-
uint64_t max_elements = 0;
257-
uint64_t total_elements = 0;
258-
259-
for (size_t i = 0; i < num_files; ++i)
260-
{
261-
for (size_t j = 0; j < ibf_args.number_expression_thresholds; ++j)
262-
{
263-
max_elements = std::max(max_elements, sizes[i][j]);
264-
total_elements += sizes[i][j];
265-
}
266-
}
267-
268-
// Check if we have valid data
269-
if (max_elements == num_files)
270-
{
271-
throw std::invalid_argument{"No minimizers found for any threshold level."};
272-
}
273-
274-
// Use average elements per bin for sizing
275-
uint64_t avg_elements = std::max<uint64_t>(1UL, total_elements / total_bins);
276-
// uint64_t elements_for_sizing = avg_elements;
277-
// Alternatively: Use maximum elements for sizing:
278-
uint64_t elements_for_sizing = std::max(max_elements, avg_elements);
279-
280-
uint64_t const ibf_bin_size = seqan::hibf::build::bin_size_in_bits({.fpr = fprs[0], //
281-
.hash_count = num_hash,
282-
.elements = elements_for_sizing});
283-
284-
// Initialize sizes_ibf for compatibility with FPR calculation
285-
sizes_ibf.assign(ibf_args.number_expression_thresholds, ibf_bin_size);
286-
287-
// Create single IBF
288-
return seqan::hibf::interleaved_bloom_filter{seqan::hibf::bin_count{total_bins},
289-
seqan::hibf::bin_size{ibf_bin_size},
290-
seqan::hibf::hash_function_count{num_hash}};
291-
}();
292-
293253
// Collect minimizer insertions from all threads
294254
std::vector<std::vector<std::pair<uint64_t, std::vector<size_t>>>> file_insertions(num_files);
295255

296-
// Parallelize this loop
297-
#pragma omp parallel for schedule(dynamic)
256+
#pragma omp parallel for schedule(dynamic, chunk_size)
298257
for (size_t i = 0; i < num_files; i++)
299258
{
300259
robin_hood::unordered_node_map<uint64_t, uint16_t> hash_table{}; // Storage for minimisers
@@ -382,21 +341,43 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
382341
file_insertions[i] = std::move(local_insertions);
383342
}
384343

385-
// Insert all collected minimizers into the IBF (single-threaded)
386-
for (auto const & file_data : file_insertions)
344+
// HIBF construction
345+
std::vector<std::vector<uint64_t>> user_bin_minimisers(num_files * ibf_args.number_expression_thresholds);
346+
347+
// Populate user bins from collected insertions
348+
for (size_t i = 0; i < num_files; ++i)
387349
{
388-
for (auto const & [hash, bins] : file_data)
350+
for (auto const & [hash, bins] : file_insertions[i])
389351
{
390352
for (size_t bin_idx : bins)
391353
{
392-
single_ibf.emplace(hash, seqan::hibf::bin_index{bin_idx});
354+
user_bin_minimisers[bin_idx].push_back(hash);
393355
}
394356
}
395357
}
396358

397-
// Store the single IBF
359+
// HIBF input function
360+
auto hibf_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it)
361+
{
362+
for (auto hash : user_bin_minimisers[user_bin_id])
363+
it = hash;
364+
};
365+
366+
// HIBF config
367+
seqan::hibf::config hibf_config{
368+
.input_fn = hibf_input,
369+
.number_of_user_bins = user_bin_minimisers.size(),
370+
.number_of_hash_functions = num_hash,
371+
.maximum_fpr = fprs[0],
372+
.threads = ibf_args.threads,
373+
};
374+
375+
// Construct the HIBF
376+
seqan::hibf::hierarchical_interleaved_bloom_filter hibf{hibf_config};
377+
378+
// Store the HIBF
398379
std::filesystem::path const filename = filenames::ibf(ibf_args.path_out, samplewise, 0, ibf_args);
399-
store_ibf(single_ibf, filename);
380+
store_ibf(hibf, filename);
400381

401382
// Store all expression thresholds per level.
402383
if constexpr (samplewise)
@@ -412,16 +393,15 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
412393
}
413394

414395
// Store FPR information
396+
//TODO: implement fpr calculation for HIBF
415397
std::ofstream outfile{filenames::fprs(ibf_args.path_out)};
416398
for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++)
417399
{
418400
for (size_t i = 0; i < num_files; i++)
419401
{
420-
// Calculate actual FPR based on IBF parameters
421-
double const exp_arg = (num_hash * counts_per_level[i][j]) / static_cast<double>(sizes_ibf[j]);
422-
double const log_arg = 1.0 - std::exp(-exp_arg);
423-
double const fpr = std::exp(num_hash * std::log(log_arg));
424-
outfile << fpr << " ";
402+
// For HIBF, use the configured maximum FPR as approximation
403+
// since internal bin structure is not directly accessible
404+
outfile << fprs[0] << " ";
425405
}
426406
outfile << "\n";
427407
}

0 commit comments

Comments
 (0)