99#include < cereal/archives/binary.hpp>
1010
1111#include < hibf/build/bin_size_in_bits.hpp>
12+ #include < hibf/config.hpp>
13+ #include < hibf/hierarchical_interleaved_bloom_filter.hpp>
1214
1315#include " misc/calculate_cutoff.hpp"
1416#include " misc/check_cutoffs_samples.hpp"
@@ -141,7 +143,6 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
141143 }();
142144
143145 std::vector<std::vector<uint64_t >> sizes (num_files);
144- std::vector<uint64_t > sizes_ibf{};
145146 std::vector<std::vector<uint64_t >> counts_per_level (num_files,
146147 std::vector<uint64_t >(ibf_args.number_expression_thresholds ));
147148
@@ -170,7 +171,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
170171 omp_set_num_threads (ibf_args.threads );
171172 }
172173
173- // size_t const chunk_size = std::clamp<size_t>(std::bit_ceil(num_files / ibf_args.threads), 8u, 64u);
174+ size_t const chunk_size = std::clamp<size_t >(std::bit_ceil (num_files / ibf_args.threads ), 8u , 64u );
174175
175176 // If expression_thresholds should only be depending on minimsers in a certain genome file, genome is created.
176177 robin_hood::unordered_set<uint64_t > genome{};
@@ -249,52 +250,10 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
249250 }
250251 }
251252
252- seqan::hibf::interleaved_bloom_filter single_ibf = [&]()
253- {
254- size_t const total_bins = num_files * ibf_args.number_expression_thresholds ;
255-
256- uint64_t max_elements = 0 ;
257- uint64_t total_elements = 0 ;
258-
259- for (size_t i = 0 ; i < num_files; ++i)
260- {
261- for (size_t j = 0 ; j < ibf_args.number_expression_thresholds ; ++j)
262- {
263- max_elements = std::max (max_elements, sizes[i][j]);
264- total_elements += sizes[i][j];
265- }
266- }
267-
268- // Check if we have valid data
269- if (max_elements == num_files)
270- {
271- throw std::invalid_argument{" No minimizers found for any threshold level." };
272- }
273-
274- // Use average elements per bin for sizing
275- uint64_t avg_elements = std::max<uint64_t >(1UL , total_elements / total_bins);
276- // uint64_t elements_for_sizing = avg_elements;
277- // Alternatively: Use maximum elements for sizing:
278- uint64_t elements_for_sizing = std::max (max_elements, avg_elements);
279-
280- uint64_t const ibf_bin_size = seqan::hibf::build::bin_size_in_bits ({.fpr = fprs[0 ], //
281- .hash_count = num_hash,
282- .elements = elements_for_sizing});
283-
284- // Initialize sizes_ibf for compatibility with FPR calculation
285- sizes_ibf.assign (ibf_args.number_expression_thresholds , ibf_bin_size);
286-
287- // Create single IBF
288- return seqan::hibf::interleaved_bloom_filter{seqan::hibf::bin_count{total_bins},
289- seqan::hibf::bin_size{ibf_bin_size},
290- seqan::hibf::hash_function_count{num_hash}};
291- }();
292-
293253 // Collect minimizer insertions from all threads
294254 std::vector<std::vector<std::pair<uint64_t , std::vector<size_t >>>> file_insertions (num_files);
295255
296- // Parallelize this loop
297- #pragma omp parallel for schedule(dynamic)
256+ #pragma omp parallel for schedule(dynamic, chunk_size)
298257 for (size_t i = 0 ; i < num_files; i++)
299258 {
300259 robin_hood::unordered_node_map<uint64_t , uint16_t > hash_table{}; // Storage for minimisers
@@ -382,21 +341,43 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
382341 file_insertions[i] = std::move (local_insertions);
383342 }
384343
385- // Insert all collected minimizers into the IBF (single-threaded)
386- for (auto const & file_data : file_insertions)
344+ // HIBF construction
345+ std::vector<std::vector<uint64_t >> user_bin_minimisers (num_files * ibf_args.number_expression_thresholds );
346+
347+ // Populate user bins from collected insertions
348+ for (size_t i = 0 ; i < num_files; ++i)
387349 {
388- for (auto const & [hash, bins] : file_data )
350+ for (auto const & [hash, bins] : file_insertions[i] )
389351 {
390352 for (size_t bin_idx : bins)
391353 {
392- single_ibf. emplace (hash, seqan::hibf::bin_index{bin_idx} );
354+ user_bin_minimisers[bin_idx]. push_back (hash);
393355 }
394356 }
395357 }
396358
397- // Store the single IBF
359+ // HIBF input function
360+ auto hibf_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it)
361+ {
362+ for (auto hash : user_bin_minimisers[user_bin_id])
363+ it = hash;
364+ };
365+
366+ // HIBF config
367+ seqan::hibf::config hibf_config{
368+ .input_fn = hibf_input,
369+ .number_of_user_bins = user_bin_minimisers.size (),
370+ .number_of_hash_functions = num_hash,
371+ .maximum_fpr = fprs[0 ],
372+ .threads = ibf_args.threads ,
373+ };
374+
375+ // Construct the HIBF
376+ seqan::hibf::hierarchical_interleaved_bloom_filter hibf{hibf_config};
377+
378+ // Store the HIBF
398379 std::filesystem::path const filename = filenames::ibf (ibf_args.path_out , samplewise, 0 , ibf_args);
399- store_ibf (single_ibf , filename);
380+ store_ibf (hibf , filename);
400381
401382 // Store all expression thresholds per level.
402383 if constexpr (samplewise)
@@ -412,16 +393,15 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
412393 }
413394
414395 // Store FPR information
396+ // TODO: implement fpr calculation for HIBF
415397 std::ofstream outfile{filenames::fprs (ibf_args.path_out )};
416398 for (unsigned j = 0 ; j < ibf_args.number_expression_thresholds ; j++)
417399 {
418400 for (size_t i = 0 ; i < num_files; i++)
419401 {
420- // Calculate actual FPR based on IBF parameters
421- double const exp_arg = (num_hash * counts_per_level[i][j]) / static_cast <double >(sizes_ibf[j]);
422- double const log_arg = 1.0 - std::exp (-exp_arg);
423- double const fpr = std::exp (num_hash * std::log (log_arg));
424- outfile << fpr << " " ;
402+ // For HIBF, use the configured maximum FPR as approximation
403+ // since internal bin structure is not directly accessible
404+ outfile << fprs[0 ] << " " ;
425405 }
426406 outfile << " \n " ;
427407 }
0 commit comments