Skip to content

Commit 5d39521

Browse files
authored
feat(replica): Support FlushDB command for replication dragonflydb#580 (dragonflydb#591)
1 parent 8d86e9b commit 5d39521

File tree

10 files changed

+107
-25
lines changed

10 files changed

+107
-25
lines changed

src/server/journal/executor.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ namespace dfly {
1212
JournalExecutor::JournalExecutor(Service* service) : service_{service} {
1313
}
1414

15-
void JournalExecutor::Execute(journal::ParsedEntry&& entry) {
15+
void JournalExecutor::Execute(journal::ParsedEntry& entry) {
1616
if (entry.payload) {
1717
io::NullSink null_sink;
1818
ConnectionContext conn_context{&null_sink, nullptr};

src/server/journal/executor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ class Service;
1414
class JournalExecutor {
1515
public:
1616
JournalExecutor(Service* service);
17-
void Execute(journal::ParsedEntry&& entry);
17+
void Execute(journal::ParsedEntry& entry);
1818

1919
private:
2020
Service* service_;

src/server/journal/serializer.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ error_code JournalWriter::Write(const journal::Entry& entry) {
7070
return Write(entry.dbid);
7171
case journal::Op::COMMAND:
7272
RETURN_ON_ERR(Write(entry.txid));
73+
RETURN_ON_ERR(Write(entry.shard_cnt));
7374
return std::visit([this](const auto& payload) { return Write(payload); }, entry.payload);
7475
default:
7576
break;
@@ -100,6 +101,10 @@ io::Result<uint16_t> JournalReader::ReadU16(io::Source* source) {
100101
return ReadPackedUIntTyped<uint16_t>(source);
101102
}
102103

104+
io::Result<uint32_t> JournalReader::ReadU32(io::Source* source) {
105+
return ReadPackedUIntTyped<uint32_t>(source);
106+
}
107+
103108
io::Result<uint64_t> JournalReader::ReadU64(io::Source* source) {
104109
return ReadPackedUIntTyped<uint64_t>(source);
105110
}
@@ -153,6 +158,7 @@ io::Result<journal::ParsedEntry> JournalReader::ReadEntry(io::Source* source) {
153158
switch (entry.opcode) {
154159
case journal::Op::COMMAND:
155160
SET_OR_UNEXPECT(ReadU64(source), entry.txid);
161+
SET_OR_UNEXPECT(ReadU32(source), entry.shard_cnt);
156162
entry.payload = CmdArgVec{};
157163
if (auto ec = Read(source, &*entry.payload); ec)
158164
return make_unexpected(ec);

src/server/journal/serializer.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ struct JournalReader {
5555
// TODO: Templated endian encoding to not repeat...?
5656
io::Result<uint8_t> ReadU8(io::Source* source);
5757
io::Result<uint16_t> ReadU16(io::Source* source);
58+
io::Result<uint32_t> ReadU32(io::Source* source);
5859
io::Result<uint64_t> ReadU64(io::Source* source);
5960

6061
// Read string into internal buffer and return size.

src/server/journal/types.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ struct EntryBase {
2222
TxId txid;
2323
Op opcode;
2424
DbIndex dbid;
25+
uint32_t shard_cnt;
2526
};
2627

2728
// This struct represents a single journal entry.
@@ -34,11 +35,11 @@ struct Entry : public EntryBase {
3435
std::pair<std::string_view, ArgSlice> // Command and its shard parts.
3536
>;
3637

37-
Entry(TxId txid, DbIndex dbid, Payload pl)
38-
: EntryBase{txid, journal::Op::COMMAND, dbid}, payload{pl} {
38+
Entry(TxId txid, DbIndex dbid, Payload pl, uint32_t shard_cnt)
39+
: EntryBase{txid, journal::Op::COMMAND, dbid, shard_cnt}, payload{pl} {
3940
}
4041

41-
Entry(journal::Op opcode, DbIndex dbid) : EntryBase{0, opcode, dbid}, payload{} {
42+
Entry(journal::Op opcode, DbIndex dbid) : EntryBase{0, opcode, dbid, 0}, payload{} {
4243
}
4344

4445
Payload payload;
@@ -50,11 +51,11 @@ struct ParsedEntry : public EntryBase {
5051

5152
ParsedEntry() = default;
5253

53-
ParsedEntry(journal::Op opcode, DbIndex dbid) : EntryBase{0, opcode, dbid}, payload{} {
54+
ParsedEntry(journal::Op opcode, DbIndex dbid) : EntryBase{0, opcode, dbid, 0}, payload{} {
5455
}
5556

56-
ParsedEntry(TxId txid, DbIndex dbid, Payload pl)
57-
: EntryBase{txid, journal::Op::COMMAND, dbid}, payload{pl} {
57+
ParsedEntry(TxId txid, DbIndex dbid, Payload pl, uint32_t shard_cnt)
58+
: EntryBase{txid, journal::Op::COMMAND, dbid, shard_cnt}, payload{pl} {
5859
}
5960

6061
Payload payload;

src/server/journal_test.cc

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -95,13 +95,14 @@ TEST(Journal, WriteRead) {
9595
auto slice = [v = &slices](auto... ss) { return StoreSlice(v, ss...); };
9696
auto list = [v = &lists](auto... ss) { return StoreList(v, ss...); };
9797

98-
std::vector<journal::Entry> test_entries = {{0, 0, make_pair("MSET", slice("A", "1", "B", "2"))},
99-
{1, 0, make_pair("MSET", slice("C", "3"))},
100-
{2, 0, list("DEL", "A", "B")},
101-
{3, 1, list("LPUSH", "l", "v1", "v2")},
102-
{4, 0, make_pair("MSET", slice("D", "4"))},
103-
{5, 1, list("DEL", "l1")},
104-
{6, 2, list("SET", "E", "2")}};
98+
std::vector<journal::Entry> test_entries = {
99+
{0, 0, make_pair("MSET", slice("A", "1", "B", "2")), 2},
100+
{0, 0, make_pair("MSET", slice("C", "3")), 2},
101+
{1, 0, list("DEL", "A", "B"), 2},
102+
{2, 1, list("LPUSH", "l", "v1", "v2"), 1},
103+
{3, 0, make_pair("MSET", slice("D", "4")), 1},
104+
{4, 1, list("DEL", "l1"), 1},
105+
{5, 2, list("SET", "E", "2"), 1}};
105106

106107
// Write all entries to string file.
107108
io::StringSink ss;

src/server/rdb_load.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1970,7 +1970,7 @@ error_code RdbLoaderBase::HandleJournalBlob(Service* service, DbIndex dbid) {
19701970
while (done < num_entries) {
19711971
journal::ParsedEntry entry{};
19721972
SET_OR_RETURN(journal_reader_.ReadEntry(&bs), entry);
1973-
ex.Execute(std::move(entry));
1973+
ex.Execute(entry);
19741974
done++;
19751975
}
19761976

src/server/replica.cc

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,11 @@ Replica::Replica(string host, uint16_t port, Service* se) : service_(*se) {
9797
master_context_.port = port;
9898
}
9999

100-
Replica::Replica(const MasterContext& context, uint32_t dfly_flow_id, Service* service)
100+
Replica::Replica(const MasterContext& context, uint32_t dfly_flow_id, Service* service,
101+
std::shared_ptr<Replica::MultiShardExecution> shared_exe_data)
101102
: service_(*service), master_context_(context) {
102103
master_context_.dfly_flow_id = dfly_flow_id;
104+
multi_shard_exe_ = shared_exe_data;
103105
}
104106

105107
Replica::~Replica() {
@@ -427,13 +429,13 @@ error_code Replica::InitiatePSync() {
427429
// Initialize and start sub-replica for each flow.
428430
error_code Replica::InitiateDflySync() {
429431
DCHECK_GT(num_df_flows_, 0u);
430-
432+
multi_shard_exe_.reset(new MultiShardExecution());
431433
shard_flows_.resize(num_df_flows_);
432434
for (unsigned i = 0; i < num_df_flows_; ++i) {
433-
shard_flows_[i].reset(new Replica(master_context_, i, &service_));
435+
shard_flows_[i].reset(new Replica(master_context_, i, &service_, multi_shard_exe_));
434436
}
435437

436-
// Blocked on untill all flows got full sync cut.
438+
// Blocked on until all flows got full sync cut.
437439
fibers_ext::BlockingCounter sync_block{num_df_flows_};
438440

439441
auto err_handler = [this, sync_block](const auto& ge) mutable {
@@ -705,14 +707,63 @@ void Replica::StableSyncDflyFb(Context* cntx) {
705707
cntx->Error(res.error(), "Journal format error");
706708
return;
707709
}
708-
709-
executor.Execute(std::move(res.value()));
710-
710+
ExecuteEntry(&executor, res.value());
711711
last_io_time_ = sock_->proactor()->GetMonotonicTimeNs();
712712
}
713713
return;
714714
}
715715

716+
void Replica::ExecuteEntry(JournalExecutor* executor, journal::ParsedEntry& entry) {
717+
if (entry.shard_cnt <= 1) { // not multi shard cmd
718+
executor->Execute(entry);
719+
return;
720+
}
721+
722+
// Multi shard command flow:
723+
// step 1: Fiber waits until all the fibers that should execute this transaction got
724+
// to the journal entry of the transaction.
725+
// step 2: execute the command (All fibers)
726+
// step 3: Fiber waits until all fibers finished the execution
727+
// By step 1 we enforce that replica will execute multi shard commands that finished on master
728+
// By step 3 we ensure the correctness of flushall/flushdb commands
729+
730+
// TODO: this implemantaion does not support atomicity in replica
731+
// Although multi shard transaction happen in step 2 very close to each other,
732+
// user can query replica between executions.
733+
// To support atomicity we should have one fiber in step 2 which will excute all the entries of
734+
// the transaction together. In case of global comand such as flushdb the command can be executed
735+
// by only one fiber.
736+
737+
// TODO: support error handler in this flow
738+
739+
// Only the first fiber to reach the transaction will create data for transaction in map
740+
multi_shard_exe_->map_mu.lock();
741+
auto [it, was_insert] = multi_shard_exe_->tx_sync_execution.emplace(entry.txid, entry.shard_cnt);
742+
743+
// Note: we must release the mutex before calling wait on barrier
744+
multi_shard_exe_->map_mu.unlock();
745+
746+
VLOG(2) << "txid: " << entry.txid << " unique_shard_cnt_: " << entry.shard_cnt
747+
<< " was_insert: " << was_insert;
748+
749+
// step 1
750+
it->second.barrier.wait();
751+
// step 2
752+
executor->Execute(entry);
753+
// step 3
754+
it->second.barrier.wait();
755+
756+
// Note: erase from map can be done only after all fibers returned from wait.
757+
// The last fiber which will decrease the counter to 0 will be the one to erase the data from map
758+
auto val = it->second.counter.fetch_sub(1, std::memory_order_relaxed);
759+
VLOG(2) << "txid: " << entry.txid << " unique_shard_cnt_: " << entry.shard_cnt
760+
<< " counter: " << val;
761+
if (val == 1) {
762+
std::lock_guard lg{multi_shard_exe_->map_mu};
763+
multi_shard_exe_->tx_sync_execution.erase(entry.txid);
764+
}
765+
}
766+
716767
error_code Replica::ReadRespReply(base::IoBuf* io_buf, uint32_t* consumed) {
717768
DCHECK(parser_);
718769

src/server/replica.h

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,16 @@
33
//
44
#pragma once
55

6+
#include <boost/fiber/barrier.hpp>
67
#include <boost/fiber/fiber.hpp>
8+
#include <boost/fiber/mutex.hpp>
79
#include <variant>
810

911
#include "base/io_buf.h"
1012
#include "facade/facade_types.h"
1113
#include "facade/redis_parser.h"
1214
#include "server/common.h"
15+
#include "server/journal/types.h"
1316
#include "util/fiber_socket_base.h"
1417
#include "util/fibers/fibers_ext.h"
1518

@@ -21,6 +24,7 @@ namespace dfly {
2124

2225
class Service;
2326
class ConnectionContext;
27+
class JournalExecutor;
2428

2529
class Replica {
2630
private:
@@ -46,6 +50,19 @@ class Replica {
4650
R_SYNC_OK = 0x10,
4751
};
4852

53+
struct MultiShardExecution {
54+
boost::fibers::mutex map_mu;
55+
56+
struct TxExecutionSync {
57+
boost::fibers::barrier barrier;
58+
std::atomic_uint32_t counter;
59+
TxExecutionSync(uint32_t counter) : barrier(counter), counter(counter) {
60+
}
61+
};
62+
63+
std::unordered_map<TxId, TxExecutionSync> tx_sync_execution;
64+
};
65+
4966
public:
5067
Replica(std::string master_host, uint16_t port, Service* se);
5168
~Replica();
@@ -81,7 +98,8 @@ class Replica {
8198

8299
private: /* Main dfly flow mode functions */
83100
// Initialize as single dfly flow.
84-
Replica(const MasterContext& context, uint32_t dfly_flow_id, Service* service);
101+
Replica(const MasterContext& context, uint32_t dfly_flow_id, Service* service,
102+
std::shared_ptr<MultiShardExecution> shared_exe_data);
85103

86104
// Start replica initialized as dfly flow.
87105
std::error_code StartFullSyncFlow(util::fibers_ext::BlockingCounter block, Context* cntx);
@@ -122,6 +140,8 @@ class Replica {
122140
// Send command, update last_io_time, return error.
123141
std::error_code SendCommand(std::string_view command, facade::ReqSerializer* serializer);
124142

143+
void ExecuteEntry(JournalExecutor* executor, journal::ParsedEntry& entry);
144+
125145
public: /* Utility */
126146
struct Info {
127147
std::string host;
@@ -154,6 +174,8 @@ class Replica {
154174
MasterContext master_context_;
155175
std::unique_ptr<util::LinuxSocketBase> sock_;
156176

177+
std::shared_ptr<MultiShardExecution> multi_shard_exe_;
178+
157179
// MainReplicationFb in standalone mode, FullSyncDflyFb in flow mode.
158180
::boost::fibers::fiber sync_fb_;
159181
std::vector<std::unique_ptr<Replica>> shard_flows_;

src/server/transaction.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1221,7 +1221,7 @@ void Transaction::LogJournalOnShard(EngineShard* shard) {
12211221
entry_payload =
12221222
make_pair(facade::ToSV(cmd_with_full_args_.front()), ShardArgsInShard(shard->shard_id()));
12231223
}
1224-
journal->RecordEntry(journal::Entry{txid_, db_index_, entry_payload});
1224+
journal->RecordEntry(journal::Entry{txid_, db_index_, entry_payload, unique_shard_cnt_});
12251225
}
12261226

12271227
void Transaction::BreakOnShutdown() {

0 commit comments

Comments
 (0)