Skip to content

Commit 8d9f529

Browse files
authored
Merge pull request #235 from eseiler/feat/fpr
feat: fpr computation
2 parents 4a96ef2 + b894dea commit 8d9f529

File tree

4 files changed

+87
-24
lines changed

4 files changed

+87
-24
lines changed

.github/workflows/ci_linux.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434
strategy:
3535
fail-fast: false
3636
matrix:
37-
compiler: ["clang-latest", "clang-second-latest", "clang-third-latest", "gcc-latest", "gcc-second-latest", "gcc-third-latest", "intel"]
37+
compiler: ["clang-latest", "clang-second-latest", "clang-third-latest", "gcc-latest", "gcc-second-latest", "intel"]
3838
container:
3939
image: ghcr.io/seqan/${{ matrix.compiler }}
4040
steps:

.github/workflows/ci_macos.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434
strategy:
3535
fail-fast: false
3636
matrix:
37-
compiler: ["clang-latest", "clang-second-latest", "clang-third-latest", "gcc-latest", "gcc-second-latest", "gcc-third-latest"]
37+
compiler: ["clang-latest", "clang-second-latest", "clang-third-latest", "gcc-latest", "gcc-second-latest"]
3838
steps:
3939
- name: Checkout
4040
uses: actions/checkout@v6

cmake/package-lock.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
# cmake-format: off
99

1010
# hibf
11-
set (NEEDLE_HIBF_VERSION fc4e40d8672f496a32df73c608915d45e715f6a8 CACHE STRING "")
11+
set (NEEDLE_HIBF_VERSION bbca76409f5445f1404c206021fcfa37f1fd7054 CACHE STRING "")
1212
CPMDeclarePackage (hibf
1313
NAME hibf
1414
GIT_TAG ${NEEDLE_HIBF_VERSION} # main

src/ibf.cpp

Lines changed: 84 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,83 @@ void get_filsize_per_expression_level(std::filesystem::path const & filename,
116116
}
117117
}
118118

119+
void store_fpr_information(seqan::hibf::hierarchical_interleaved_bloom_filter const & hibf,
120+
size_t const num_files,
121+
estimate_ibf_arguments const & ibf_args)
122+
{
123+
//TODO: implement fpr calculation for HIBF
124+
struct fpr_parameters
125+
{
126+
size_t bin_size{};
127+
size_t elements{};
128+
size_t hash_count{};
129+
};
130+
131+
auto compute_fpr = [](fpr_parameters const & params) -> double
132+
{
133+
double const exp_arg = (params.hash_count * params.elements) / static_cast<double>(params.bin_size);
134+
double const log_arg = 1.0 - std::exp(-exp_arg);
135+
return std::exp(params.hash_count * std::log(log_arg));
136+
};
137+
138+
// Combined FPR is the sum: std::log1p(-fpr1) + std::log1p(-fpr2) + ...
139+
auto compute_combined_fpr = [](double const sum_of_split_fpr) -> double
140+
{
141+
return (1.0 - std::exp(sum_of_split_fpr));
142+
};
143+
144+
size_t const num_levels = ibf_args.number_expression_thresholds;
145+
std::vector<double> actual_fprs(num_files * num_levels);
146+
147+
auto traverse_hibf = [&](this auto self, size_t const ibf_idx) -> void
148+
{
149+
auto const & ibf = hibf.ibf_vector[ibf_idx];
150+
auto const & ub_ids = hibf.ibf_bin_to_user_bin_id[ibf_idx];
151+
auto const & next_ibfs = hibf.next_ibf_id[ibf_idx];
152+
153+
double sum{};
154+
fpr_parameters params{.bin_size = ibf.bin_size(), .hash_count = ibf.hash_function_count()};
155+
156+
for (size_t tbin = 0; tbin < ibf.bin_count(); ++tbin)
157+
{
158+
params.elements = ibf.occupancy[tbin];
159+
sum += std::log1p(-compute_fpr(params));
160+
161+
size_t const user_bin_id = ub_ids[tbin];
162+
163+
switch (user_bin_id)
164+
{
165+
case seqan::hibf::bin_kind::merged:
166+
self(next_ibfs[tbin]);
167+
[[fallthrough]];
168+
case seqan::hibf::bin_kind::deleted:
169+
sum = 0.0;
170+
break;
171+
default:
172+
if (tbin + 1u == ibf.bin_count() || user_bin_id != ub_ids[tbin + 1]) // last bin || end of split bin
173+
{
174+
actual_fprs[user_bin_id] = compute_combined_fpr(sum);
175+
sum = 0u;
176+
}
177+
}
178+
}
179+
};
180+
181+
traverse_hibf(0);
182+
183+
std::ofstream outfile{filenames::fprs(ibf_args.path_out)};
184+
for (unsigned level = 0; level < ibf_args.number_expression_thresholds; level++)
185+
{
186+
for (size_t file = 0; file < num_files; file++)
187+
{
188+
size_t const bin = level * num_files + file;
189+
outfile << actual_fprs[bin] << " ";
190+
}
191+
outfile << "\n";
192+
}
193+
outfile << "/\n";
194+
}
195+
119196
// Actual ibf construction
120197
template <bool samplewise, bool minimiser_files_given = true>
121198
void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
@@ -378,13 +455,12 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
378455
};
379456

380457
// HIBF config
381-
seqan::hibf::config hibf_config{
382-
.input_fn = hibf_input,
383-
.number_of_user_bins = user_bin_minimisers.size(),
384-
.number_of_hash_functions = num_hash,
385-
.maximum_fpr = fprs[0],
386-
.threads = ibf_args.threads,
387-
};
458+
seqan::hibf::config hibf_config{.input_fn = hibf_input,
459+
.number_of_user_bins = user_bin_minimisers.size(),
460+
.number_of_hash_functions = num_hash,
461+
.maximum_fpr = fprs[0],
462+
.threads = ibf_args.threads,
463+
.track_occupancy = true};
388464

389465
// Construct the HIBF
390466
seqan::hibf::hierarchical_interleaved_bloom_filter hibf{hibf_config};
@@ -406,20 +482,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
406482
outfile << "/\n";
407483
}
408484

409-
// Store FPR information
410-
//TODO: implement fpr calculation for HIBF
411-
std::ofstream outfile{filenames::fprs(ibf_args.path_out)};
412-
for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++)
413-
{
414-
for (size_t i = 0; i < num_files; i++)
415-
{
416-
// For HIBF, use the configured maximum FPR as approximation
417-
// since internal bin structure is not directly accessible
418-
outfile << fprs[0] << " ";
419-
}
420-
outfile << "\n";
421-
}
422-
outfile << "/\n";
485+
store_fpr_information(hibf, num_files, ibf_args);
423486
}
424487

425488
// Create ibfs

0 commit comments

Comments
 (0)