Skip to content

Commit d94628b

Browse files
authored
Bypass WAL in bottom-up build mode (couchbase#11)
1 parent 5158550 commit d94628b

File tree

8 files changed

+170
-65
lines changed

8 files changed

+170
-65
lines changed

src/btree.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1705,6 +1705,7 @@ btree_result btree_init_and_load(
17051705
struct btree_meta *meta,
17061706
uint64_t num_keys,
17071707
btree_load_get_next_kv* next_kv,
1708+
btree_load_write_done* write_done,
17081709
void* aux)
17091710
{
17101711
void *addr;
@@ -1830,6 +1831,7 @@ btree_result btree_init_and_load(
18301831
} else {
18311832
if (cur_node[jj]->nentry >= max_num_entries_non_root) {
18321833
// Otherwise: full, should allocate a new one.
1834+
write_done(btree->blk_handle, node_bids[jj], aux);
18331835
cur_node[jj] = NULL;
18341836
node_bids[jj] = 0;
18351837
}

src/btree.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ btree_result btree_init(
186186
bnode_flag_t flag, struct btree_meta *meta);
187187

188188
typedef int btree_load_get_next_kv(void** key_out, void** value_out, void* aux);
189+
/**
 * Callback type passed to btree_init_and_load().
 *
 * During the load it is invoked with the B+tree's block handle, the BID of
 * a non-root node that has become full (just before the loader drops its
 * reference to that node), and the caller-supplied `aux` pointer.
 */
typedef void btree_load_write_done(void* blk_handle, bid_t bid, void* aux);
189190
btree_result btree_init_and_load(
190191
struct btree *btree,
191192
void *blk_handle,
@@ -198,6 +199,7 @@ btree_result btree_init_and_load(
198199
struct btree_meta *meta,
199200
uint64_t num_keys,
200201
btree_load_get_next_kv* next_kv,
202+
btree_load_write_done* write_done,
201203
void* aux);
202204

203205
btree_result btree_iterator_init(struct btree *btree, struct btree_iterator *it, void *initial_key);

src/btreeblock.cc

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -945,6 +945,38 @@ fdb_status btreeblk_operation_end(void *voidhandle)
945945
return status;
946946
}
947947

948+
void btreeblk_write_done(void* voidhandle, bid_t bid) {
949+
// flush and write the block for given BID in the allocation list.
950+
struct btreeblk_handle *handle = (struct btreeblk_handle *)voidhandle;
951+
952+
// write and free items in allocation list
953+
struct list_elem *e = list_begin(&handle->alc_list);
954+
while (e) {
955+
struct btreeblk_block *block = _get_entry(e, struct btreeblk_block, le);
956+
if (block->bid != bid) {
957+
e = list_next(e);
958+
continue;
959+
}
960+
int writable = filemgr_is_writable(handle->file, block->bid);
961+
if (writable) {
962+
fdb_status status = _btreeblk_write_dirty_block(handle, block);
963+
if (status != FDB_RESULT_SUCCESS) {
964+
return;
965+
}
966+
} else {
967+
fdb_log(nullptr, FDB_LOG_FATAL,
968+
FDB_RESULT_WRITE_FAIL,
969+
"b+tree node write fail, BID %zu, file %s",
970+
block->bid,
971+
handle->file->filename);
972+
return;
973+
}
974+
975+
e = list_remove(&handle->alc_list, &block->le);
976+
_btreeblk_free_dirty_block(handle, block);
977+
}
978+
}
979+
948980
void btreeblk_discard_blocks(struct btreeblk_handle *handle)
949981
{
950982
// discard all writable blocks in the read list

src/btreeblock.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ void btreeblk_reset_subblock_info(struct btreeblk_handle *handle);
9696
void btreeblk_free(struct btreeblk_handle *handle);
9797
void btreeblk_discard_blocks(struct btreeblk_handle *handle);
9898
fdb_status btreeblk_end(struct btreeblk_handle *handle);
99+
void btreeblk_write_done(void* voidhandle, bid_t bid);
99100

100101
#ifdef __cplusplus
101102
}

src/forestdb.cc

Lines changed: 102 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -2284,6 +2284,14 @@ fdb_status _fdb_open(fdb_kvs_handle *handle,
22842284
}
22852285
}
22862286

2287+
if (config->bottom_up_index_build) {
2288+
handle->bottom_up_build_entries = (struct list*)malloc(sizeof(struct list));
2289+
list_init(handle->bottom_up_build_entries);
2290+
handle->num_bottom_up_build_entries = 0;
2291+
} else {
2292+
handle->bottom_up_build_entries = NULL;
2293+
}
2294+
22872295
return status;
22882296
}
22892297

@@ -4250,29 +4258,47 @@ fdb_status fdb_set(fdb_kvs_handle *handle, fdb_doc *doc)
42504258
if (!txn) {
42514259
txn = &file->global_txn;
42524260
}
4253-
if (handle->kvs) {
4254-
// multi KV instance mode
4255-
fdb_doc kv_ins_doc = *doc;
4256-
kv_ins_doc.key = _doc.key;
4257-
kv_ins_doc.keylen = _doc.length.keylen;
4258-
if (!immediate_remove) {
4259-
wal_insert(txn, file, &cmp_info, &kv_ins_doc, offset,
4260-
WAL_INS_WRITER);
4261-
} else {
4262-
wal_immediate_remove(txn, file, &cmp_info, &kv_ins_doc, offset,
4263-
WAL_INS_WRITER);
4264-
}
4261+
4262+
if (handle->config.bottom_up_index_build) {
4263+
// Bottom-up build mode, bypass WAL.
4264+
struct bottom_up_build_entry* bub_entry =
4265+
(struct bottom_up_build_entry*)
4266+
malloc(sizeof(struct bottom_up_build_entry));
4267+
bub_entry->keylen = _doc.length.keylen;
4268+
bub_entry->key = (void*)malloc(_doc.length.keylen);
4269+
memcpy(bub_entry->key, _doc.key, _doc.length.keylen);
4270+
bub_entry->seqnum = doc->seqnum;
4271+
bub_entry->offset = doc->offset;
4272+
4273+
fdb_kvs_handle* root_handle = handle->fhandle->root;
4274+
list_push_back(root_handle->bottom_up_build_entries, &bub_entry->le);
4275+
root_handle->num_bottom_up_build_entries++;
4276+
42654277
} else {
4266-
if (!immediate_remove) {
4267-
wal_insert(txn, file, &cmp_info, doc, offset, WAL_INS_WRITER);
4278+
if (handle->kvs) {
4279+
// multi KV instance mode
4280+
fdb_doc kv_ins_doc = *doc;
4281+
kv_ins_doc.key = _doc.key;
4282+
kv_ins_doc.keylen = _doc.length.keylen;
4283+
if (!immediate_remove) {
4284+
wal_insert(txn, file, &cmp_info, &kv_ins_doc, offset,
4285+
WAL_INS_WRITER);
4286+
} else {
4287+
wal_immediate_remove(txn, file, &cmp_info, &kv_ins_doc, offset,
4288+
WAL_INS_WRITER);
4289+
}
42684290
} else {
4269-
wal_immediate_remove(txn, file, &cmp_info, doc, offset,
4270-
WAL_INS_WRITER);
4291+
if (!immediate_remove) {
4292+
wal_insert(txn, file, &cmp_info, doc, offset, WAL_INS_WRITER);
4293+
} else {
4294+
wal_immediate_remove(txn, file, &cmp_info, doc, offset,
4295+
WAL_INS_WRITER);
4296+
}
42714297
}
4272-
}
42734298

4274-
if (wal_get_dirty_status(file)== FDB_WAL_CLEAN) {
4275-
wal_set_dirty_status(file, FDB_WAL_DIRTY);
4299+
if (wal_get_dirty_status(file)== FDB_WAL_CLEAN) {
4300+
wal_set_dirty_status(file, FDB_WAL_DIRTY);
4301+
}
42764302
}
42774303

42784304
if (handle->config.auto_commit &&
@@ -4648,11 +4674,11 @@ fdb_status fdb_commit_non_durable(fdb_file_handle *fhandle,
46484674

46494675
void* _fdb_bottom_up_index_build_next(void* cur_entry, void* aux) {
46504676
if (!cur_entry) {
4651-
struct avl_tree* tree = (struct avl_tree*)aux;
4652-
return avl_first(tree);
4677+
struct list* ll = (struct list*)aux;
4678+
return list_begin(ll);
46534679
}
4654-
struct avl_node* an = (struct avl_node*)cur_entry;
4655-
return avl_next(an);
4680+
struct list_elem* le = (struct list_elem*)cur_entry;
4681+
return list_next(le);
46564682
}
46574683

46584684
void _fdb_bottom_up_index_build_get(void* cur_entry,
@@ -4661,15 +4687,13 @@ void _fdb_bottom_up_index_build_get(void* cur_entry,
46614687
void** value_out,
46624688
void* aux)
46634689
{
4664-
struct wal_item_header* header =
4665-
_get_entry(cur_entry, struct wal_item_header, avl_key);
4666-
*key_out = header->key;
4667-
*keylen_out = header->keylen;
4690+
struct bottom_up_build_entry* entry =
4691+
_get_entry(cur_entry, struct bottom_up_build_entry, le);
4692+
*key_out = entry->key;
4693+
*keylen_out = entry->keylen;
46684694

46694695
thread_local uint64_t enc_bid;
4670-
struct list_elem* le = list_begin(&header->items);
4671-
struct wal_item* first_item = _get_entry(le, struct wal_item, list_elem);
4672-
enc_bid = _endian_encode(first_item->offset);
4696+
enc_bid = _endian_encode(entry->offset);
46734697
*value_out = &enc_bid;
46744698
}
46754699

@@ -4683,27 +4707,23 @@ void _fdb_bottom_up_index_build_get_seq(void* cur_entry,
46834707
void** value_out,
46844708
void* aux)
46854709
{
4686-
struct wal_item* elem =
4687-
_get_entry(cur_entry, struct wal_item, avl_seq);
4710+
struct bottom_up_build_entry* entry =
4711+
_get_entry(cur_entry, struct bottom_up_build_entry, le);
46884712
thread_local uint64_t enc_seq;
4689-
enc_seq = _endian_encode(elem->seqnum);
4713+
enc_seq = _endian_encode(entry->seqnum);
46904714
*key_out = &enc_seq;
46914715
*keylen_out = sizeof(enc_seq);
46924716

46934717
thread_local uint64_t enc_bid;
4694-
enc_bid = _endian_encode(elem->offset);
4718+
enc_bid = _endian_encode(entry->offset);
46954719
*value_out = &enc_bid;
46964720
}
46974721

46984722
fdb_status _fdb_bottom_up_index_build(fdb_kvs_handle *handle)
46994723
{
47004724
fdb_status fs = FDB_RESULT_SUCCESS;
47014725

4702-
uint64_t num_entries = handle->file->wal->num_flushable;
4703-
struct avl_tree* avl_key = &handle->file->wal->key_shards[0]._map;
4704-
struct avl_tree* avl_seq =
4705-
handle->file->wal->seq_shards
4706-
? &handle->file->wal->seq_shards[0]._map : NULL;
4726+
uint64_t num_entries = handle->num_bottom_up_build_entries;
47074727

47084728
// Build key-index first (tree is sorted by key).
47094729
struct hbtrie new_key_trie;
@@ -4717,7 +4737,7 @@ fdb_status _fdb_bottom_up_index_build(fdb_kvs_handle *handle)
47174737
_fdb_bottom_up_index_build_next,
47184738
_fdb_bottom_up_index_build_get,
47194739
_fdb_bottom_up_index_btreeblk_end,
4720-
avl_key);
4740+
handle->bottom_up_build_entries);
47214741
handle->trie->root_bid = new_key_trie.root_bid;
47224742

47234743
// Build seq-index next.
@@ -4733,7 +4753,7 @@ fdb_status _fdb_bottom_up_index_build(fdb_kvs_handle *handle)
47334753
_fdb_bottom_up_index_build_next,
47344754
_fdb_bottom_up_index_build_get_seq,
47354755
_fdb_bottom_up_index_btreeblk_end,
4736-
avl_seq);
4756+
handle->bottom_up_build_entries);
47374757
handle->seqtrie->root_bid = new_seq_trie.root_bid;
47384758
} else if (handle->seqtree) {
47394759
// Not supported yet.
@@ -4842,33 +4862,35 @@ fdb_status _fdb_commit(fdb_kvs_handle *handle,
48424862
// (in this case, flush the rest of entries)
48434863
// 3. user forces to manually flush wal
48444864

4845-
struct filemgr_dirty_update_node *prev_node = NULL, *new_node = NULL;
4865+
if (handle->config.bottom_up_index_build) {
4866+
_fdb_bottom_up_index_build(handle);
48464867

4847-
_fdb_dirty_update_ready(handle, &prev_node, &new_node,
4848-
&dirty_idtree_root, &dirty_seqtree_root, false);
4868+
} else {
4869+
struct filemgr_dirty_update_node *prev_node = NULL, *new_node = NULL;
48494870

4850-
wr = wal_flush(handle->file, (void *)handle,
4851-
_fdb_wal_flush_func, _fdb_wal_get_old_offset,
4852-
_fdb_wal_flush_seq_purge, _fdb_wal_flush_kvs_delta_stats,
4853-
&flush_items);
4871+
_fdb_dirty_update_ready(handle, &prev_node, &new_node,
4872+
&dirty_idtree_root, &dirty_seqtree_root, false);
48544873

4855-
if (handle->config.bottom_up_index_build) {
4856-
_fdb_bottom_up_index_build(handle);
4857-
}
4874+
wr = wal_flush(handle->file, (void *)handle,
4875+
_fdb_wal_flush_func, _fdb_wal_get_old_offset,
4876+
_fdb_wal_flush_seq_purge, _fdb_wal_flush_kvs_delta_stats,
4877+
&flush_items);
48584878

4859-
if (wr != FDB_RESULT_SUCCESS) {
4860-
btreeblk_clear_dirty_update(handle->bhandle);
4861-
filemgr_dirty_update_close_node(handle->file, prev_node);
4862-
filemgr_dirty_update_remove_node(handle->file, new_node);
4863-
filemgr_mutex_unlock(handle->file);
4864-
atomic_cas_uint8_t(&handle->handle_busy, 1, 0);
4865-
return wr;
4866-
}
4867-
wal_set_dirty_status(handle->file, FDB_WAL_CLEAN);
4868-
wal_flushed = true;
48694879

4870-
_fdb_dirty_update_finalize(handle, prev_node, new_node,
4871-
&dirty_idtree_root, &dirty_seqtree_root, true);
4880+
if (wr != FDB_RESULT_SUCCESS) {
4881+
btreeblk_clear_dirty_update(handle->bhandle);
4882+
filemgr_dirty_update_close_node(handle->file, prev_node);
4883+
filemgr_dirty_update_remove_node(handle->file, new_node);
4884+
filemgr_mutex_unlock(handle->file);
4885+
atomic_cas_uint8_t(&handle->handle_busy, 1, 0);
4886+
return wr;
4887+
}
4888+
wal_set_dirty_status(handle->file, FDB_WAL_CLEAN);
4889+
wal_flushed = true;
4890+
4891+
_fdb_dirty_update_finalize(handle, prev_node, new_node,
4892+
&dirty_idtree_root, &dirty_seqtree_root, true);
4893+
}
48724894
}
48734895

48744896
// Note: Appending KVS header must be done after flushing WAL
@@ -8182,6 +8204,21 @@ fdb_status _fdb_close_root(fdb_kvs_handle *handle)
81828204
return fs;
81838205
}
81848206

8207+
/**
 * Release every entry collected for the bottom-up index build, together
 * with the list object that holds them.
 *
 * Does nothing when the handle has no entry list. Otherwise each entry's
 * key buffer and the entry itself are freed, the list object is freed, and
 * the handle's list pointer and entry counter are reset so that the handle
 * no longer refers to freed memory and a repeated call is a no-op.
 *
 * @param handle KV store handle whose bottom-up build entries are destroyed.
 */
void _destroy_bottom_up_build_entries(fdb_kvs_handle* handle) {
    struct list* entries = handle->bottom_up_build_entries;
    if (!entries) {
        return;
    }

    struct list_elem* le = list_begin(entries);
    while (le) {
        struct bottom_up_build_entry* entry =
            _get_entry(le, struct bottom_up_build_entry, le);
        // Advance before freeing: `le` is embedded in `entry`.
        le = list_next(le);
        free(entry->key);
        free(entry);
    }
    free(entries);

    // Do not leave a dangling pointer or a stale count behind.
    handle->bottom_up_build_entries = NULL;
    handle->num_bottom_up_build_entries = 0;
}
8221+
81858222
fdb_status _fdb_close(fdb_kvs_handle *handle)
81868223
{
81878224
fdb_status fs;
@@ -8235,6 +8272,8 @@ fdb_status _fdb_close(fdb_kvs_handle *handle)
82358272
handle->filename = NULL;
82368273
}
82378274

8275+
_destroy_bottom_up_build_entries(handle);
8276+
82388277
return fs;
82398278
}
82408279

src/hbtrie.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <stdlib.h>
2020
#include <string.h>
2121

22+
#include "btreeblock.h"
2223
#include "option.h"
2324
#include "forestdb_endian.h"
2425
#include "hbtrie.h"
@@ -2495,6 +2496,10 @@ int _building_btree_next_kv(void** key_out, void** val_out, void* aux) {
24952496
return 0;
24962497
}
24972498

2499+
/**
 * Adapter with the `btree_load_write_done` callback signature that forwards
 * the block handle and BID to `btreeblk_write_done`.
 *
 * @param voidhandle Block handle, passed through unchanged.
 * @param bid ID of the block to hand over.
 * @param aux Not used by this adapter.
 */
void _building_btree_write_done(void* voidhandle, bid_t bid, void* aux)
{
    (void)aux;
    btreeblk_write_done(voidhandle, bid);
}
2502+
24982503
bid_t _hbtrie_load_recursive(struct hbtrie *trie,
24992504
int cur_chunk_idx,
25002505
int cp_start_chunk_idx,
@@ -2699,6 +2704,7 @@ bid_t _hbtrie_load_recursive(struct hbtrie *trie,
26992704
&meta,
27002705
num_entries,
27012706
_building_btree_next_kv,
2707+
_building_btree_write_done,
27022708
&params );
27032709
do_btreeblk_end(trie->btreeblk_handle);
27042710
ret_bid = btree.root_bid;

src/internal_types.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ struct _fdb_kvs_handle {
307307
dirty_updates = kv_handle.dirty_updates;
308308
node = kv_handle.node;
309309
num_iterators = kv_handle.num_iterators;
310+
bottom_up_build_entries = kv_handle.bottom_up_build_entries;
310311
return *this;
311312
}
312313

@@ -432,6 +433,24 @@ struct _fdb_kvs_handle {
432433
* Number of active iterator instances created from this handle
433434
*/
434435
uint32_t num_iterators;
436+
/**
437+
* Used when `bottom_up_index_build` is `true`.
438+
* Instead of inserting into the WAL, it keeps <key, seqnum, offset> tuples
439+
* to build the index.
440+
*/
441+
struct list* bottom_up_build_entries;
442+
/**
443+
* The number of entries in `bottom_up_build_entries`.
444+
*/
445+
uint64_t num_bottom_up_build_entries;
446+
};
447+
448+
struct bottom_up_build_entry {
449+
struct list_elem le;
450+
void* key;
451+
size_t keylen;
452+
uint64_t offset;
453+
uint64_t seqnum;
435454
};
436455

437456
struct hbtrie_iterator;

0 commit comments

Comments
 (0)