Skip to content

Commit bfbd825

Browse files
authored
Align chunk start offset to 4k and logstore inline flush (eBay#838)
Align the chunk start offset to 4k to fix a performance degradation of 4KB writes, whose latency drops from 150us to 20us on p5gx, because the physical block size of the NVMe devices on p5gx is 4KB. Earlier the chunk start offset was aligned to 512 bytes. This alignment only takes effect during first-time boot; existing nodes won't have issues during upgrade as they keep using the old format. Also add a logstore inline flush mode for the solo repl dev. Run the completion callback in a different thread only for the raft repl dev, as there is a 100us difference in latency; for nublocks the callback can be run inline.
1 parent 7f1a005 commit bfbd825

File tree

6 files changed

+41
-20
lines changed

6 files changed

+41
-20
lines changed

conanfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
class HomestoreConan(ConanFile):
1111
name = "homestore"
12-
version = "7.0.5"
12+
version = "7.0.6"
1313

1414
homepage = "https://github.com/eBay/Homestore"
1515
description = "HomeStore Storage Engine"

src/lib/checkpoint/cp_mgr.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,10 @@ CPManager::CPManager() :
3939
[this](meta_blk* mblk, sisl::byte_view buf, size_t size) { on_meta_blk_found(std::move(buf), (void*)mblk); },
4040
nullptr);
4141

42-
resource_mgr().register_dirty_buf_exceed_cb(
43-
[this]([[maybe_unused]] int64_t dirty_buf_count, bool critical) { this->trigger_cp_flush(false /* force */); });
42+
resource_mgr().register_dirty_buf_exceed_cb([this]([[maybe_unused]] int64_t dirty_buf_count, bool critical) {
43+
LOGINFO("Dirty buffer exceeded count {} critical {}", dirty_buf_count, critical);
44+
this->trigger_cp_flush(false /* force */);
45+
});
4446

4547
start_timer_thread();
4648
start_cp_thread();

src/lib/common/homestore_config.fbs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ table LogStore {
107107
try_flush_iteration: uint64 = 10240(hotswap);
108108

109109
// Logdev flushes in multiples of this size, setting to 0 will make it use default device optimal size
110-
flush_size_multiple_logdev: uint64 = 512;
110+
flush_size_multiple_logdev: uint64 = 0;
111111

112112
// Logdev will flush the logs only in a dedicated thread. Turn this on, if flush IO doesn't want to
113113
// intervene with data IO path.
@@ -165,7 +165,7 @@ table Generic {
165165

166166
table ResourceLimits {
167167
/* it is going to use 2 times of this space because of two concurrent cps */
168-
dirty_buf_percent: uint32 = 1 (hotswap);
168+
dirty_buf_percent: uint32 = 10 (hotswap);
169169

170170
/* it is going to use 2 times of this space because of two concurrent cps */
171171
free_blk_cnt: uint32 = 10000000 (hotswap);
@@ -190,7 +190,7 @@ table ResourceLimits {
190190
/* 0 means HomeStore doesn't reserve anything and lets nuraft control the truncation */
191191
/* default reserve 1 million logs */
192192
raft_logstore_reserve_threshold: uint32 = 1000000 (hotswap);
193-
193+
194194
/* resource audit timer in ms */
195195
resource_audit_timer_ms: uint32 = 120000;
196196

@@ -262,10 +262,10 @@ table Consensus {
262262
// Minimum log gap a replica has to be from leader before joining the replica set.
263263
// 0 indicates the new member will join in cluster immediately.
264264
min_log_gap_to_join: int32 = 0;
265-
265+
266266
// amount of time in millis to wait on data write before fetch data from remote;
267267
wait_data_write_timer_ms: uint64 = 1500 (hotswap);
268-
268+
269269
// Leadership expiry (=0 indicates 20 times heartbeat period), set -1 to never expire
270270
leadership_expiry_ms: int32 = 0;
271271

src/lib/device/device_manager.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ void DeviceManager::format_devices() {
115115
// Get common iomgr_attributes
116116
for (auto& dinfo : m_dev_infos) {
117117
format_single_device(dinfo);
118-
}
118+
}
119119

120120
// Verify the first blocks to see if the devs are unique
121121
HS_REL_ASSERT(verify_unique_devs(), "Found duplicate physical devices in the system");
@@ -163,7 +163,9 @@ bool DeviceManager::verify_unique_devs() const {
163163
for (auto& pdev : m_all_pdevs) {
164164
if (!pdev) { continue; }
165165
auto buf = hs_utils::iobuf_alloc(hs_super_blk::first_block_size(), sisl::buftag::superblk, 512);
166-
if (auto err = pdev->read_super_block(buf, hs_super_blk::first_block_size(), hs_super_blk::first_block_offset()); err) {
166+
if (auto err =
167+
pdev->read_super_block(buf, hs_super_blk::first_block_size(), hs_super_blk::first_block_offset());
168+
err) {
167169
LOGERROR("Failed to read first block from device={}, error={}", pdev->get_devname(), err.message());
168170
ret = false;
169171
continue;
@@ -704,7 +706,10 @@ uint32_t DeviceManager::populate_pdev_info(const dev_info& dinfo, const iomgr::d
704706
pinfo.max_pdev_chunks = hs_super_blk::max_chunks_in_pdev(dinfo);
705707

706708
auto sb_size = hs_super_blk::total_size(dinfo);
707-
pinfo.data_offset = hs_super_blk::first_block_offset() + sb_size;
709+
// Data offset is the data start offset of a pdev. The first chunk is created at the data start
710+
// offset and the remaining chunks are created contiguously one after the other.
711+
// Align the data start offset so that chunk start is also aligned to physical page size.
712+
pinfo.data_offset = sisl::round_up(hs_super_blk::first_block_offset() + sb_size, attr.phys_page_size);
708713
pinfo.size = dinfo.dev_size - pinfo.data_offset - (hdd ? sb_size : 0);
709714
pinfo.dev_attr = attr;
710715
pinfo.system_uuid = uuid;

src/lib/logstore/log_dev.cpp

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "common/homestore_config.hpp"
3333
#include "common/homestore_utils.hpp"
3434
#include "common/crash_simulator.hpp"
35+
#include "replication/service/generic_repl_svc.h"
3536

3637
namespace homestore {
3738

@@ -582,13 +583,26 @@ void LogDev::on_flush_completion(LogGroup* lg) {
582583
// since we support out-of-order lsn write, so no need to guarantee the order of logstore write completion
583584
for (auto const& [idx, req] : req_map) {
584585
m_pending_callback++;
585-
iomanager.run_on_forget(iomgr::reactor_regex::random_worker, /* iomgr::fiber_regex::syncio_only, */
586-
[this, dev_offset, idx, req]() {
587-
auto ld_key = logdev_key{idx, dev_offset};
588-
auto comp_cb = req->log_store->get_comp_cb();
589-
(req->cb) ? req->cb(req, ld_key) : comp_cb(req, ld_key);
590-
m_pending_callback--;
591-
});
586+
auto callback_lambda = [this, dev_offset, idx, req]() {
587+
auto ld_key = logdev_key{idx, dev_offset};
588+
auto comp_cb = req->log_store->get_comp_cb();
589+
(req->cb) ? req->cb(req, ld_key) : comp_cb(req, ld_key);
590+
m_pending_callback--;
591+
};
592+
593+
// Only server-side replication, which uses raft, runs the callback on a random worker.
594+
bool server_side_replication = true;
595+
if (hs()->has_repl_data_service()) {
596+
auto& repl_svc = dynamic_cast< GenericReplService& >(hs()->repl_service());
597+
server_side_replication = repl_svc.get_impl_type() == repl_impl_type::server_side;
598+
}
599+
600+
if (server_side_replication) {
601+
iomanager.run_on_forget(iomgr::reactor_regex::random_worker, /* iomgr::fiber_regex::syncio_only, */
602+
[this, callback_lambda]() { callback_lambda(); });
603+
} else {
604+
callback_lambda();
605+
}
592606
}
593607
}
594608

src/lib/replication/repl_dev/solo_repl_dev.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ SoloReplDev::SoloReplDev(superblk< solo_repl_dev_superblk >&& rd_sb, bool load_e
1818
auto const gid = m_rd_sb->group_id;
1919
if (load_existing) {
2020
m_logdev_id = m_rd_sb->logdev_id;
21-
logstore_service().open_logdev(m_rd_sb->logdev_id, flush_mode_t::TIMER, gid);
21+
logstore_service().open_logdev(m_rd_sb->logdev_id, flush_mode_t::TIMER | flush_mode_t::INLINE, gid);
2222
logstore_service()
2323
.open_log_store(m_rd_sb->logdev_id, m_rd_sb->logstore_id, true /* append_mode */)
2424
.thenValue([this](auto log_store) {
@@ -29,7 +29,7 @@ SoloReplDev::SoloReplDev(superblk< solo_repl_dev_superblk >&& rd_sb, bool load_e
2929
});
3030
m_commit_upto = m_rd_sb->durable_commit_lsn;
3131
} else {
32-
m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::TIMER, gid);
32+
m_logdev_id = logstore_service().create_new_logdev(flush_mode_t::TIMER | flush_mode_t::INLINE, gid);
3333
m_data_journal = logstore_service().create_new_log_store(m_logdev_id, true /* append_mode */);
3434
m_rd_sb->logstore_id = m_data_journal->get_store_id();
3535
m_rd_sb->logdev_id = m_logdev_id;

0 commit comments

Comments
 (0)