
Commit 620f35c

chore: lock free info command with replicaof v2

Signed-off-by: kostas <[email protected]>

1 parent 7abc6a9

File tree

8 files changed: +168 -46 lines changed


helio

src/server/cluster/outgoing_slot_migration.cc (4 additions, 0 deletions)

@@ -166,6 +166,7 @@ void OutgoingMigration::Finish(const GenericError& error) {
   switch (state_) {
     case MigrationState::C_FATAL:
     case MigrationState::C_FINISHED:
+      CloseSocket();
       return;  // Already finished, nothing else to do

     case MigrationState::C_CONNECTING:
@@ -192,6 +193,9 @@ void OutgoingMigration::Finish(const GenericError& error) {
     });
     exec_st_.JoinErrorHandler();
   }
+
+  // Close socket for clean disconnect.
+  CloseSocket();
 }

 MigrationState OutgoingMigration::GetState() const {
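
Both hunks route teardown through CloseSocket(): once on the early-return branch for migrations already in C_FINISHED or C_FATAL, and once after JoinErrorHandler() on the normal path. Below is a minimal standalone sketch, with stand-in types (DummySocket, Migration) rather than Dragonfly's actual classes, of why that double call stays safe when the close is guarded by an is-open check:

#include <iostream>

// Illustrative stand-ins, not util::FiberSocketBase or OutgoingMigration.
class DummySocket {
 public:
  bool IsOpen() const { return open_; }
  void Shutdown() {}               // Placeholder for Shutdown(SHUT_RDWR).
  void Close() { open_ = false; }

 private:
  bool open_ = true;
};

enum class MigrationState { C_CONNECTING, C_FINISHED, C_FATAL };

class Migration {
 public:
  void Finish() {
    switch (state_) {
      case MigrationState::C_FATAL:
      case MigrationState::C_FINISHED:
        CloseSocket();  // Early-return path: socket may already be closed.
        return;
      default:
        break;
    }
    state_ = MigrationState::C_FINISHED;
    CloseSocket();  // Normal path: close for a clean disconnect.
  }

 private:
  void CloseSocket() {
    if (sock_.IsOpen()) {  // Guard makes repeated calls harmless.
      sock_.Shutdown();
      sock_.Close();
    }
  }

  DummySocket sock_;
  MigrationState state_ = MigrationState::C_CONNECTING;
};

int main() {
  Migration m;
  m.Finish();
  m.Finish();  // Hits the early-return branch; CloseSocket() is now a no-op.
  std::cout << "double Finish() is safe\n";
}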

src/server/protocol_client.cc (4 additions, 0 deletions)

@@ -162,6 +162,7 @@ error_code ProtocolClient::ResolveHostDns() {
 error_code ProtocolClient::ConnectAndAuth(std::chrono::milliseconds connect_timeout_ms,
                                           ExecutionState* cntx) {
   ProactorBase* mythread = ProactorBase::me();
+  DCHECK(mythread == socket_thread_);
   CHECK(mythread);
   {
     unique_lock lk(sock_mu_);
@@ -235,6 +236,9 @@ void ProtocolClient::CloseSocket() {
        auto ec = sock_->Shutdown(SHUT_RDWR);
        LOG_IF(ERROR, ec) << "Could not shutdown socket " << ec;
      }
+      auto ec = sock_->Close();  // Quietly close.
+
+      LOG_IF(WARNING, ec) << "Error closing socket " << ec << "/" << ec.message();
    });
  }
 }

src/server/protocol_client.h (2 additions, 1 deletion)

@@ -107,7 +107,7 @@ class ProtocolClient {
   }

   auto* Proactor() const {
-    return sock_->proactor();
+    return socket_thread_;
   }

   util::FiberSocketBase* Sock() const {
@@ -142,6 +142,7 @@ class ProtocolClient {
 #else
   void* ssl_ctx_{nullptr};
 #endif
+  util::fb2::ProactorBase* socket_thread_;
 };

 }  // namespace dfly
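
This header change carries most of the "lock free" idea: Proactor() now returns the proactor pointer cached in the new socket_thread_ member instead of dereferencing sock_, which other code may reset. A rough sketch of the pattern, with hypothetical stand-in types (ProactorBase, FiberSocket, Client) rather than the real util::fb2 / ProtocolClient API:

#include <memory>

struct ProactorBase {};   // stand-in for util::fb2::ProactorBase

struct FiberSocket {      // stand-in for util::FiberSocketBase
  explicit FiberSocket(ProactorBase* p) : proactor_(p) {}
  ProactorBase* proactor() const { return proactor_; }
  ProactorBase* proactor_;
};

class Client {
 public:
  explicit Client(ProactorBase* thread)
      : sock_(std::make_unique<FiberSocket>(thread)), socket_thread_(thread) {}

  // Old version: return sock_->proactor();  -- unsafe if sock_ is being reset
  // concurrently while INFO/ROLE code asks which thread to hop to.
  ProactorBase* Proactor() const { return socket_thread_; }

  void CloseSocket() { sock_.reset(); }  // May run while others call Proactor().

 private:
  std::unique_ptr<FiberSocket> sock_;
  ProactorBase* const socket_thread_;   // Fixed for the client's lifetime.
};

int main() {
  ProactorBase thread;
  Client c(&thread);
  c.CloseSocket();
  // Proactor() is still valid to call: it never touches sock_.
  return c.Proactor() == &thread ? 0 : 1;
}

The real class presumably fills socket_thread_ when the socket is first set up; the DCHECK(mythread == socket_thread_) added to ConnectAndAuth() above is consistent with that assumption.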

src/server/replica.cc (38 additions, 32 deletions)

@@ -170,9 +170,13 @@ std::optional<Replica::LastMasterSyncData> Replica::Stop() {
   sync_fb_.JoinIfNeeded();
   DVLOG(1) << "MainReplicationFb stopped " << this;
   acks_fb_.JoinIfNeeded();
-  for (auto& flow : shard_flows_) {
-    flow.reset();
-  }
+
+  proactor_->Await([this]() {
+    for (auto& flow : shard_flows_) {
+      flow.reset();
+    }
+    shard_flows_.clear();
+  });

   if (last_journal_LSNs_.has_value()) {
     return LastMasterSyncData{master_context_.master_repl_id, last_journal_LSNs_.value()};
@@ -501,29 +505,41 @@ error_code Replica::InitiatePSync() {
   return error_code{};
 }

+void Replica::InitializeShardFlows() {
+  decltype(shard_flows_) shard_flows_copy;
+  shard_flows_copy.resize(master_context_.num_flows);
+  DCHECK(!shard_flows_copy.empty());
+  thread_flow_map_ = Partition(shard_flows_copy.size());
+  const size_t pool_sz = shard_set->pool()->size();
+
+  shard_set->pool()->AwaitFiberOnAll([pool_sz, this, &shard_flows_copy](auto index, auto* ctx) {
+    for (unsigned i = index; i < shard_flows_copy.size(); i += pool_sz) {
+      uint64_t partial_sync_lsn = 0;
+      if (shard_flows_[i]) {
+        partial_sync_lsn = shard_flows_[i]->JournalExecutedCount();
+      }
+      shard_flows_copy[i].reset(
+          new DflyShardReplica(server(), master_context_, i, &service_, multi_shard_exe_));
+      if (partial_sync_lsn > 0) {
+        shard_flows_[i]->SetRecordsExecuted(partial_sync_lsn);
+      }
+    }
+  });
+  // now update shard_flows on proactor thread
+  shard_flows_ = std::move(shard_flows_copy);
+}
+
 // Initialize and start sub-replica for each flow.
 error_code Replica::InitiateDflySync(std::optional<LastMasterSyncData> last_master_sync_data) {
   auto start_time = absl::Now();

   // Initialize MultiShardExecution.
   multi_shard_exe_.reset(new MultiShardExecution());

-  // Initialize shard flows.
-  shard_flows_.resize(master_context_.num_flows);
-  DCHECK(!shard_flows_.empty());
-  for (unsigned i = 0; i < shard_flows_.size(); ++i) {
-    // Transfer LSN state for partial sync
-    uint64_t partial_sync_lsn = 0;
-    if (shard_flows_[i]) {
-      partial_sync_lsn = shard_flows_[i]->JournalExecutedCount();
-    }
-    shard_flows_[i].reset(
-        new DflyShardReplica(server(), master_context_, i, &service_, multi_shard_exe_));
-    if (partial_sync_lsn > 0) {
-      shard_flows_[i]->SetRecordsExecuted(partial_sync_lsn);
-    }
-  }
-  thread_flow_map_ = Partition(shard_flows_.size());
+  // Initialize shard flows. The update to the shard_flows_ should be done by this thread.
+  // Otherwise, there is a race condition between GetSummary() and the shard_flows_[i].reset()
+  // below.
+  InitializeShardFlows();

   // Blocked on until all flows got full sync cut.
   BlockingCounter sync_block{unsigned(shard_flows_.size())};
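
The new InitializeShardFlows() builds the flows into a local copy, constructs each DflyShardReplica on the shard-set pool with AwaitFiberOnAll while striding the flow index by the pool size, and only then publishes the result with a single move on the replica's own thread. The sketch below illustrates just that striding pattern; run_on_all and ShardFlow are hypothetical stand-ins for shard_set->pool()->AwaitFiberOnAll and the flow objects:

#include <cstdio>
#include <functional>
#include <memory>
#include <thread>
#include <vector>

// Hypothetical stand-in for the pool call: run cb(index) on every "thread" of
// the pool and wait for all of them to finish.
void run_on_all(size_t pool_sz, const std::function<void(unsigned)>& cb) {
  std::vector<std::thread> workers;
  for (unsigned index = 0; index < pool_sz; ++index)
    workers.emplace_back(cb, index);
  for (auto& w : workers)
    w.join();
}

struct ShardFlow {
  explicit ShardFlow(unsigned id) : flow_id(id) {}
  unsigned flow_id;
};

int main() {
  const size_t pool_sz = 4;
  const size_t num_flows = 10;

  // Build into a local copy first; publish it with a single move at the end.
  std::vector<std::unique_ptr<ShardFlow>> copy(num_flows);

  run_on_all(pool_sz, [&](unsigned index) {
    // Thread `index` handles flows index, index + pool_sz, index + 2*pool_sz, ...
    // Each element is written by exactly one thread, so no synchronization is needed.
    for (unsigned i = index; i < copy.size(); i += pool_sz)
      copy[i] = std::make_unique<ShardFlow>(i);
  });

  std::vector<std::unique_ptr<ShardFlow>> shard_flows = std::move(copy);
  std::printf("initialized %zu flows across %zu threads\n", shard_flows.size(), pool_sz);
}

The single shard_flows_ = std::move(shard_flows_copy) at the end is the point of the diff comment above: readers such as GetSummary(), which run on the proactor thread, never observe a partially rebuilt vector.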
@@ -1215,6 +1231,7 @@ auto Replica::GetSummary() const -> Summary {
   // Note: we access LastIoTime from foreigh thread in unsafe manner. However, specifically here
   // it's unlikely to cause a real bug.
   for (const auto& flow : shard_flows_) {  // Get last io time from all sub flows.
+    DCHECK(Proactor() == ProactorBase::me());
     last_io_time = std::max(last_io_time, flow->LastIoTime());
   }

@@ -1246,25 +1263,14 @@ auto Replica::GetSummary() const -> Summary {
     return res;
   };

-  if (Sock())
-    return Proactor()->AwaitBrief(f);
-
-  /**
-   * when this branch happens: there is a very short grace period
-   * where Sock() is not initialized, yet the server can
-   * receive ROLE/INFO commands. That period happens when launching
-   * an instance with '--replicaof' and then immediately
-   * sending a command.
-   *
-   * In that instance, we have to run f() on the current fiber.
-   */
-  return f();
+  return Proactor()->AwaitBrief(f);
 }

 std::vector<uint64_t> Replica::GetReplicaOffset() const {
   std::vector<uint64_t> flow_rec_count;
   flow_rec_count.resize(shard_flows_.size());
   for (const auto& flow : shard_flows_) {
+    DCHECK(flow.get());
     uint32_t flow_id = flow->FlowId();
     uint64_t rec_count = flow->JournalExecutedCount();
     DCHECK_LT(flow_id, shard_flows_.size());
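
With the Sock() check removed, GetSummary() now always hops to the replica's proactor via AwaitBrief and reads shard_flows_ there; together with Stop() and InitializeShardFlows() mutating shard_flows_ only on that same thread, single-thread ownership replaces locking for the INFO path. Below is a rough model of that ownership pattern, using a hypothetical OwnerThread executor in place of helio's ProactorBase::AwaitBrief:

#include <algorithm>
#include <condition_variable>
#include <cstdio>
#include <deque>
#include <functional>
#include <future>
#include <mutex>
#include <thread>
#include <vector>

// Hypothetical single-thread executor: AwaitBrief(f) runs f on the owner thread
// and blocks the caller for the result, so owner-thread state needs no mutex.
class OwnerThread {
 public:
  OwnerThread() : worker_([this] { Loop(); }) {}
  ~OwnerThread() {
    Post([this] { stop_ = true; });
    worker_.join();
  }

  int AwaitBrief(std::function<int()> f) {
    std::promise<int> p;
    auto fut = p.get_future();
    Post([&p, f = std::move(f)] { p.set_value(f()); });
    return fut.get();
  }

  void Post(std::function<void()> task) {
    std::lock_guard<std::mutex> lk(mu_);
    tasks_.push_back(std::move(task));
    cv_.notify_one();
  }

 private:
  void Loop() {
    while (!stop_) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lk(mu_);
        cv_.wait(lk, [this] { return !tasks_.empty(); });
        task = std::move(tasks_.front());
        tasks_.pop_front();
      }
      task();  // Runs on the owner thread only.
    }
  }

  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<std::function<void()>> tasks_;
  bool stop_ = false;
  std::thread worker_;
};

int main() {
  OwnerThread proactor;
  std::vector<int> last_io_times = {7, 3, 9};  // Owned by the "proactor" thread.

  // GetSummary-style read: hop to the owner thread instead of taking a lock.
  int last_io_time = proactor.AwaitBrief([&] {
    return *std::max_element(last_io_times.begin(), last_io_times.end());
  });
  std::printf("last_io_time=%d\n", last_io_time);
}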

src/server/replica.h (2 additions, 0 deletions)

@@ -154,6 +154,8 @@ class Replica : ProtocolClient {
   size_t GetRecCountExecutedPerShard(const std::vector<unsigned>& indexes) const;

 private:
+  void InitializeShardFlows();
+
   util::fb2::ProactorBase* proactor_ = nullptr;
   Service& service_;
   MasterContext master_context_;
