@@ -116,6 +116,83 @@ void get_filsize_per_expression_level(std::filesystem::path const & filename,
116116 }
117117}
118118
119+ void store_fpr_information (seqan::hibf::hierarchical_interleaved_bloom_filter const & hibf,
120+ size_t const num_files,
121+ estimate_ibf_arguments const & ibf_args)
122+ {
123+ // TODO: implement fpr calculation for HIBF
124+ struct fpr_parameters
125+ {
126+ size_t bin_size{};
127+ size_t elements{};
128+ size_t hash_count{};
129+ };
130+
131+ auto compute_fpr = [](fpr_parameters const & params) -> double
132+ {
133+ double const exp_arg = (params.hash_count * params.elements ) / static_cast <double >(params.bin_size );
134+ double const log_arg = 1.0 - std::exp (-exp_arg);
135+ return std::exp (params.hash_count * std::log (log_arg));
136+ };
137+
138+ // Combined FPR is the sum: std::log1p(-fpr1) + std::log1p(-fpr2) + ...
139+ auto compute_combined_fpr = [](double const sum_of_split_fpr) -> double
140+ {
141+ return (1.0 - std::exp (sum_of_split_fpr));
142+ };
143+
144+ size_t const num_levels = ibf_args.number_expression_thresholds ;
145+ std::vector<double > actual_fprs (num_files * num_levels);
146+
147+ auto traverse_hibf = [&](this auto self, size_t const ibf_idx) -> void
148+ {
149+ auto const & ibf = hibf.ibf_vector [ibf_idx];
150+ auto const & ub_ids = hibf.ibf_bin_to_user_bin_id [ibf_idx];
151+ auto const & next_ibfs = hibf.next_ibf_id [ibf_idx];
152+
153+ double sum{};
154+ fpr_parameters params{.bin_size = ibf.bin_size (), .hash_count = ibf.hash_function_count ()};
155+
156+ for (size_t tbin = 0 ; tbin < ibf.bin_count (); ++tbin)
157+ {
158+ params.elements = ibf.occupancy [tbin];
159+ sum += std::log1p (-compute_fpr (params));
160+
161+ size_t const user_bin_id = ub_ids[tbin];
162+
163+ switch (user_bin_id)
164+ {
165+ case seqan::hibf::bin_kind::merged:
166+ self (next_ibfs[tbin]);
167+ [[fallthrough]];
168+ case seqan::hibf::bin_kind::deleted:
169+ sum = 0.0 ;
170+ break ;
171+ default :
172+ if (tbin + 1u == ibf.bin_count () || user_bin_id != ub_ids[tbin + 1 ]) // last bin || end of split bin
173+ {
174+ actual_fprs[user_bin_id] = compute_combined_fpr (sum);
175+ sum = 0u ;
176+ }
177+ }
178+ }
179+ };
180+
181+ traverse_hibf (0 );
182+
183+ std::ofstream outfile{filenames::fprs (ibf_args.path_out )};
184+ for (unsigned level = 0 ; level < ibf_args.number_expression_thresholds ; level++)
185+ {
186+ for (size_t file = 0 ; file < num_files; file++)
187+ {
188+ size_t const bin = level * num_files + file;
189+ outfile << actual_fprs[bin] << " " ;
190+ }
191+ outfile << " \n " ;
192+ }
193+ outfile << " /\n " ;
194+ }
195+
119196// Actual ibf construction
120197template <bool samplewise, bool minimiser_files_given = true >
121198void ibf_helper (std::vector<std::filesystem::path> const & minimiser_files,
@@ -378,13 +455,12 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
378455 };
379456
380457 // HIBF config
381- seqan::hibf::config hibf_config{
382- .input_fn = hibf_input,
383- .number_of_user_bins = user_bin_minimisers.size (),
384- .number_of_hash_functions = num_hash,
385- .maximum_fpr = fprs[0 ],
386- .threads = ibf_args.threads ,
387- };
458+ seqan::hibf::config hibf_config{.input_fn = hibf_input,
459+ .number_of_user_bins = user_bin_minimisers.size (),
460+ .number_of_hash_functions = num_hash,
461+ .maximum_fpr = fprs[0 ],
462+ .threads = ibf_args.threads ,
463+ .track_occupancy = true };
388464
389465 // Construct the HIBF
390466 seqan::hibf::hierarchical_interleaved_bloom_filter hibf{hibf_config};
@@ -406,20 +482,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
406482 outfile << " /\n " ;
407483 }
408484
409- // Store FPR information
410- // TODO: implement fpr calculation for HIBF
411- std::ofstream outfile{filenames::fprs (ibf_args.path_out )};
412- for (unsigned j = 0 ; j < ibf_args.number_expression_thresholds ; j++)
413- {
414- for (size_t i = 0 ; i < num_files; i++)
415- {
416- // For HIBF, use the configured maximum FPR as approximation
417- // since internal bin structure is not directly accessible
418- outfile << fprs[0 ] << " " ;
419- }
420- outfile << " \n " ;
421- }
422- outfile << " /\n " ;
485+ store_fpr_information (hibf, num_files, ibf_args);
423486}
424487
425488// Create ibfs
0 commit comments