Skip to content

Commit bbca764

Browse files
authored
Merge pull request seqan#294 from eseiler/misc/track_occupancy
feat: track occupancy without dynamic hibf
2 parents 5a950f8 + b76dc5f commit bbca764

File tree

6 files changed

+106
-7
lines changed

6 files changed

+106
-7
lines changed

include/hibf/config.hpp

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ namespace seqan::hibf
4141
* | Layout | seqan::hibf::config::sketch_bits | 12 | |
4242
* | Layout | seqan::hibf::config::tmax | 0 | 0 indicates unset |
4343
* | Layout | seqan::hibf::config::empty_bin_fraction | 0.0 | Dynamic Layout |
44+
* | General | seqan::hibf::config::track_occupancy | false | |
4445
* | Layout | seqan::hibf::config::max_rearrangement_ratio | 0.5 | |
4546
* | Layout | seqan::hibf::config::alpha | 1.2 | |
4647
* | Layout | seqan::hibf::config::disable_estimate_union | false | |
@@ -243,11 +244,32 @@ struct config
243244
* designated to contain any data. The resulting layout will be very similar to a layout with `tmax` set to `58`
244245
* and no empty bins.
245246
*
247+
* Choosing a value larger than `0.0` will also enable the `track_occupancy` option.
248+
*
246249
* Value must be in range [0.0,1.0).
247250
* Recommendation: default value (0.0). This option is not recommended for general use.
248251
*/
249252
double empty_bin_fraction{};
250253

254+
/*!\brief Track the number of elements emplaced into each technical bin.
255+
*
256+
* An IBF can track how many elements were emplaced into each technical bin.
257+
* This option can be useful for a dynamic index, or to compute the exact FPR for a technical bin.
258+
*
259+
* The occupancy of a technical bin `i` of IBF `ibf` can be accessed via `ibf.occupancy[i]`.
260+
*
261+
* For occupancy, emplacing an element means that a bit of the conceptual Bloom Filter representing the respective
262+
* technical bin changes.
263+
* For example, adding the same value multiple times to the same technical bin will not increase the occupancy.
264+
* Likewise, if the respective bits for a value have already been set by previous emplacing operations, the
265+
* occupancy will not increase.
266+
*
267+
* This option comes with a minor performance penalty for seqan::hibf::interleaved_bloom_filter::emplace.
268+
*
269+
* Recommendation: default value (false).
270+
*/
271+
bool track_occupancy{false};
272+
251273
/*!\brief A scaling factor to influence the number of merged bins produced by the layout algorithm.
252274
*
253275
* The layout algorithm optimizes the space consumption of the resulting HIBF, but currently has no means of
@@ -330,6 +352,8 @@ struct config
330352
* * Not setting seqan::hibf::config::tmax, or setting it to `0`, results in a default tmax
331353
* `std::ceil(std::sqrt(number_of_user_bins))` being used.
332354
* * seqan::hibf::config::tmax is increased to the next multiple of 64.
355+
* * Setting seqan::hibf::config::empty_bin_fraction to a value larger than `0.0` will also enable
356+
* seqan::hibf::config::track_occupancy.
333357
*/
334358
void validate_and_set_defaults();
335359

@@ -354,7 +378,7 @@ struct config
354378
private:
355379
friend class cereal::access;
356380

357-
static constexpr uint32_t version{2};
381+
static constexpr uint32_t version{3u};
358382

359383
template <typename archive_t>
360384
void serialize(archive_t & archive)
@@ -371,9 +395,16 @@ struct config
371395
archive(CEREAL_NVP(sketch_bits));
372396
archive(CEREAL_NVP(tmax));
373397

374-
if (parsed_version > 1u)
398+
if (parsed_version >= 2u)
399+
{
375400
archive(CEREAL_NVP(empty_bin_fraction));
376401

402+
if (parsed_version >= 3u)
403+
archive(CEREAL_NVP(track_occupancy));
404+
else
405+
track_occupancy = empty_bin_fraction != 0.0;
406+
}
407+
377408
archive(CEREAL_NVP(alpha));
378409
archive(CEREAL_NVP(max_rearrangement_ratio));
379410
archive(CEREAL_NVP(disable_estimate_union));

src/build/construct_ibf.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ seqan::hibf::interleaved_bloom_filter construct_ibf(robin_hood::unordered_flat_s
5151
seqan::hibf::interleaved_bloom_filter ibf{bin_count,
5252
bin_size,
5353
seqan::hibf::hash_function_count{data.config.number_of_hash_functions},
54-
data.config.empty_bin_fraction > 0.0};
54+
data.config.track_occupancy};
5555

5656
local_index_allocation_timer.stop();
5757
data.index_allocation_timer += local_index_allocation_timer;

src/config.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@ void config::validate_and_set_defaults()
116116
if (empty_bin_fraction < 0.0 || empty_bin_fraction >= 1.0)
117117
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."};
118118

119+
if (empty_bin_fraction != 0.0)
120+
track_occupancy = true;
121+
119122
if (alpha < 0.0)
120123
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::alpha must be positive."};
121124

src/interleaved_bloom_filter.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_
105105
interleaved_bloom_filter{seqan::hibf::bin_count{configuration.number_of_user_bins},
106106
seqan::hibf::bin_size{max_bin_size(configuration, max_bin_elements)},
107107
seqan::hibf::hash_function_count{configuration.number_of_hash_functions},
108-
configuration.empty_bin_fraction > 0.0}
108+
configuration.track_occupancy}
109109
{
110110
size_t const chunk_size = std::clamp<size_t>(std::bit_ceil(bin_count() / configuration.threads), 8u, 64u);
111111

test/snippet/hibf/hibf_construction.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ int main()
2828
.threads = 1, // recommended to adapt
2929
.sketch_bits = 12,
3030
.tmax = 0, // triggers default computation
31+
.empty_bin_fraction = 0.0,
32+
.track_occupancy = false,
3133
.alpha = 1.2,
3234
.max_rearrangement_ratio = 0.5,
3335
.disable_estimate_union = false,

test/unit/hibf/config_test.cpp

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ TEST(config_test, write_to)
3737
std::string const expected_file{"@HIBF_CONFIG\n"
3838
"@{\n"
3939
"@ \"hibf_config\": {\n"
40-
"@ \"version\": 2,\n"
40+
"@ \"version\": 3,\n"
4141
"@ \"number_of_user_bins\": 123456789,\n"
4242
"@ \"number_of_hash_functions\": 4,\n"
4343
"@ \"maximum_fpr\": 0.0001,\n"
@@ -46,6 +46,7 @@ TEST(config_test, write_to)
4646
"@ \"sketch_bits\": 8,\n"
4747
"@ \"tmax\": 128,\n"
4848
"@ \"empty_bin_fraction\": 0.0,\n"
49+
"@ \"track_occupancy\": false,\n"
4950
"@ \"alpha\": 1.0,\n"
5051
"@ \"max_rearrangement_ratio\": 0.333,\n"
5152
"@ \"disable_estimate_union\": true,\n"
@@ -62,7 +63,7 @@ TEST(config_test, read_from)
6263
std::stringstream ss{"@HIBF_CONFIG\n"
6364
"@{\n"
6465
"@ \"hibf_config\": {\n"
65-
"@ \"version\": 2,\n"
66+
"@ \"version\": 3,\n"
6667
"@ \"number_of_user_bins\": 123456789,\n"
6768
"@ \"number_of_hash_functions\": 4,\n"
6869
"@ \"maximum_fpr\": 0.0001,\n"
@@ -71,6 +72,7 @@ TEST(config_test, read_from)
7172
"@ \"sketch_bits\": 8,\n"
7273
"@ \"tmax\": 128,\n"
7374
"@ \"empty_bin_fraction\": 0.5,\n"
75+
"@ \"track_occupancy\": true,\n"
7476
"@ \"alpha\": 1.0,\n"
7577
"@ \"max_rearrangement_ratio\": 0.333,\n"
7678
"@ \"disable_estimate_union\": true,\n"
@@ -90,6 +92,7 @@ TEST(config_test, read_from)
9092
EXPECT_EQ(configuration.sketch_bits, 8);
9193
EXPECT_EQ(configuration.tmax, 128);
9294
EXPECT_EQ(configuration.empty_bin_fraction, 0.5);
95+
EXPECT_EQ(configuration.track_occupancy, true);
9396
EXPECT_EQ(configuration.alpha, 1.0);
9497
EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333);
9598
EXPECT_EQ(configuration.disable_estimate_union, true);
@@ -134,6 +137,46 @@ TEST(config_test, read_from_v1)
134137
EXPECT_EQ(configuration.disable_rearrangement, false);
135138
}
136139

140+
TEST(config_test, read_from_v2)
141+
{
142+
std::stringstream ss{"@HIBF_CONFIG\n"
143+
"@{\n"
144+
"@ \"hibf_config\": {\n"
145+
"@ \"version\": 2,\n"
146+
"@ \"number_of_user_bins\": 123456789,\n"
147+
"@ \"number_of_hash_functions\": 4,\n"
148+
"@ \"maximum_fpr\": 0.0001,\n"
149+
"@ \"relaxed_fpr\": 0.3,\n"
150+
"@ \"threads\": 31,\n"
151+
"@ \"sketch_bits\": 8,\n"
152+
"@ \"tmax\": 128,\n"
153+
"@ \"empty_bin_fraction\": 0.5,\n"
154+
"@ \"alpha\": 1.0,\n"
155+
"@ \"max_rearrangement_ratio\": 0.333,\n"
156+
"@ \"disable_estimate_union\": true,\n"
157+
"@ \"disable_rearrangement\": false\n"
158+
"@ }\n"
159+
"@}\n"
160+
"@HIBF_CONFIG_END\n"};
161+
162+
seqan::hibf::config configuration;
163+
configuration.read_from(ss);
164+
165+
EXPECT_EQ(configuration.number_of_user_bins, 123456789);
166+
EXPECT_EQ(configuration.number_of_hash_functions, 4);
167+
EXPECT_EQ(configuration.maximum_fpr, 0.0001);
168+
EXPECT_EQ(configuration.relaxed_fpr, 0.3);
169+
EXPECT_EQ(configuration.threads, 31);
170+
EXPECT_EQ(configuration.sketch_bits, 8);
171+
EXPECT_EQ(configuration.tmax, 128);
172+
EXPECT_EQ(configuration.empty_bin_fraction, 0.5);
173+
EXPECT_EQ(configuration.track_occupancy, true);
174+
EXPECT_EQ(configuration.alpha, 1.0);
175+
EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333);
176+
EXPECT_EQ(configuration.disable_estimate_union, true);
177+
EXPECT_EQ(configuration.disable_rearrangement, false);
178+
}
179+
137180
TEST(config_test, read_from_with_more_meta)
138181
{
139182
std::stringstream ss{"@blah some chopper stuff\n"
@@ -144,14 +187,16 @@ TEST(config_test, read_from_with_more_meta)
144187
"@HIBF_CONFIG\n"
145188
"@{\n"
146189
"@ \"hibf_config\": {\n"
147-
"@ \"version\": 1,\n"
190+
"@ \"version\": 3,\n"
148191
"@ \"number_of_user_bins\": 123456789,\n"
149192
"@ \"number_of_hash_functions\": 4,\n"
150193
"@ \"maximum_fpr\": 0.0001,\n"
151194
"@ \"relaxed_fpr\": 0.3,\n"
152195
"@ \"threads\": 31,\n"
153196
"@ \"sketch_bits\": 8,\n"
154197
"@ \"tmax\": 128,\n"
198+
"@ \"empty_bin_fraction\": 0.0,\n"
199+
"@ \"track_occupancy\": true,\n"
155200
"@ \"alpha\": 1.0,\n"
156201
"@ \"max_rearrangement_ratio\": 0.333,\n"
157202
"@ \"disable_estimate_union\": true,\n"
@@ -170,6 +215,8 @@ TEST(config_test, read_from_with_more_meta)
170215
EXPECT_EQ(configuration.threads, 31);
171216
EXPECT_EQ(configuration.sketch_bits, 8);
172217
EXPECT_EQ(configuration.tmax, 128);
218+
EXPECT_EQ(configuration.empty_bin_fraction, 0.0);
219+
EXPECT_EQ(configuration.track_occupancy, true);
173220
EXPECT_EQ(configuration.alpha, 1.0);
174221
EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333);
175222
EXPECT_EQ(configuration.disable_estimate_union, true);
@@ -349,6 +396,20 @@ TEST(config_test, validate_and_set_defaults)
349396
"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0).");
350397
}
351398

399+
// empty_bin_fraction != 0.0 also enables tracking occupancy
400+
{
401+
seqan::hibf::config configuration{.input_fn = dummy_input_fn,
402+
.number_of_user_bins = 1u,
403+
.empty_bin_fraction = 0.0,
404+
.track_occupancy = false};
405+
configuration.validate_and_set_defaults();
406+
EXPECT_EQ(configuration.track_occupancy, false);
407+
408+
configuration.empty_bin_fraction = 0.3;
409+
configuration.validate_and_set_defaults();
410+
EXPECT_EQ(configuration.track_occupancy, true);
411+
}
412+
352413
// alpha must be positive
353414
{
354415
seqan::hibf::config configuration{.input_fn = dummy_input_fn, .number_of_user_bins = 1u, .alpha = -0.1};
@@ -413,6 +474,8 @@ TEST(config_test, serialisation)
413474
.threads = 31,
414475
.sketch_bits = 8,
415476
.tmax = 128,
477+
.empty_bin_fraction = 0.13,
478+
.track_occupancy = true,
416479
.alpha = 1.0,
417480
.max_rearrangement_ratio = 0.333,
418481
.disable_estimate_union = true,

0 commit comments

Comments
 (0)