Skip to content

Commit 16e69bc

Browse files
authored
fix(takeover): pick correct replica to reconcile slots (#6034)
Signed-off-by: Kostas Kyrimis <[email protected]>
1 parent ad7fd2c commit 16e69bc

File tree

4 files changed

+38
-36
lines changed

4 files changed

+38
-36
lines changed

src/server/cluster/cluster_family.cc

Lines changed: 32 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1098,27 +1098,50 @@ size_t ClusterFamily::MigrationsErrorsCount() const {
10981098
return error_num;
10991099
}
11001100

1101-
void ClusterFamily::ReconcileMasterFlow() {
1102-
auto config = cluster::ClusterConfig::Current();
1101+
void ClusterFamily::ReconcileMasterSlots(std::string_view repl_id) {
1102+
util::fb2::LockGuard gu(set_config_mu);
1103+
util::fb2::LockGuard lk(migration_mu_);
1104+
1105+
auto config = ClusterConfig::Current();
1106+
1107+
// Sanity -- we should not reach there
1108+
if (!config) {
1109+
LOG(ERROR) << "Cluster config after takeover is empty";
1110+
return;
1111+
}
11031112

11041113
for (auto& info : config->GetMutableConfig()) {
11051114
if (info.master.id == id_) {
11061115
if (!info.replicas.empty()) {
1107-
LOG_IF(ERROR, info.replicas.size() > 1)
1108-
<< "More than one replica found, slot redirection after takeover corrupted";
1116+
auto target = std::find_if(info.replicas.begin(), info.replicas.end(),
1117+
[repl_id](const auto& e) { return e.id == repl_id; });
11091118

1110-
// assumes there is one replica per master node
1111-
// TODO figure a smart way to get the replica id_ so
1112-
// we can find it here
1113-
info.master = info.replicas.front();
1119+
if (target == info.replicas.end()) {
1120+
LOG(ERROR) << "Could not find repl_id: " << repl_id
1121+
<< " in cluster topology. Slot redirection after takeover corrupted";
1122+
return;
1123+
}
1124+
1125+
info.master = *target;
11141126
info.replicas.clear();
11151127
}
11161128
break;
11171129
}
11181130
}
11191131
}
11201132

1121-
void ClusterFamily::ReconcileReplicaFlow() {
1133+
void ClusterFamily::ReconcileReplicaSlots() {
1134+
util::fb2::LockGuard gu(set_config_mu);
1135+
util::fb2::LockGuard lk(migration_mu_);
1136+
1137+
auto config = ClusterConfig::Current();
1138+
1139+
// Sanity -- we should not reach there
1140+
if (!config) {
1141+
LOG(ERROR) << "Cluster config after takeover is empty";
1142+
return;
1143+
}
1144+
11221145
auto new_config = ClusterConfig::Current()->CloneWithChanges({}, {});
11231146
// Replace master with replica in shard config.
11241147
bool found = false;
@@ -1142,26 +1165,6 @@ void ClusterFamily::ReconcileReplicaFlow() {
11421165
[&new_config](util::ProactorBase*) { ClusterConfig::SetCurrent(new_config); });
11431166
}
11441167

1145-
void ClusterFamily::ReconcileMasterReplicaTakeoverSlots(bool was_master) {
1146-
util::fb2::LockGuard gu(set_config_mu);
1147-
util::fb2::LockGuard lk(migration_mu_);
1148-
1149-
auto config = ClusterConfig::Current();
1150-
1151-
// Sanity -- we should not reach there
1152-
if (!config) {
1153-
LOG(ERROR) << "Cluster config after takeover is empty";
1154-
return;
1155-
}
1156-
1157-
if (was_master) {
1158-
ReconcileMasterFlow();
1159-
return;
1160-
}
1161-
1162-
ReconcileReplicaFlow();
1163-
}
1164-
11651168
using EngineFunc = void (ClusterFamily::*)(CmdArgList args, const CommandContext& cmd_cntx);
11661169

11671170
inline CommandId::Handler3 HandlerFunc(ClusterFamily* se, EngineFunc f) {

src/server/cluster/cluster_family.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,14 +44,16 @@ class ClusterFamily {
4444

4545
size_t MigrationsErrorsCount() const ABSL_LOCKS_EXCLUDED(migration_mu_);
4646

47-
// Helper function to be used during takeover from both nodes (master and replica).
47+
// Helper functions to be used during takeover from both nodes (master and replica).
4848
// It reconciles the cluster configuration for both nodes to reflect the node
4949
// role changes after the takeover.
5050
// For the taking over node it's called at the end of the ReplTakeOver flow
5151
// and for the taken over node it's called at the end of the dflycmd::TakeOver
52-
void ReconcileMasterReplicaTakeoverSlots(bool was_master)
52+
void ReconcileMasterSlots(std::string_view repl_id)
5353
ABSL_LOCKS_EXCLUDED(set_config_mu, migration_mu_);
5454

55+
void ReconcileReplicaSlots() ABSL_LOCKS_EXCLUDED(set_config_mu, migration_mu_);
56+
5557
private:
5658
using SinkReplyBuilder = facade::SinkReplyBuilder;
5759

@@ -125,9 +127,6 @@ class ClusterFamily {
125127

126128
ClusterShardInfo GetEmulatedShardInfo(ConnectionContext* cntx) const;
127129

128-
void ReconcileMasterFlow() ABSL_EXCLUSIVE_LOCKS_REQUIRED(set_config_mu, migration_mu_);
129-
void ReconcileReplicaFlow() ABSL_EXCLUSIVE_LOCKS_REQUIRED(set_config_mu, migration_mu_);
130-
131130
// Guards set configuration, so that we won't handle 2 in parallel.
132131
mutable util::fb2::Mutex set_config_mu;
133132

src/server/dflycmd.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -596,7 +596,7 @@ void DflyCmd::TakeOver(CmdArgList args, RedisReplyBuilder* rb, ConnectionContext
596596
return;
597597
}
598598

599-
sf_->service().cluster_family().ReconcileMasterReplicaTakeoverSlots(true);
599+
sf_->service().cluster_family().ReconcileMasterSlots(replica_ptr->id);
600600
}
601601

602602
void DflyCmd::Expire(CmdArgList args, Transaction* tx, RedisReplyBuilder* rb) {

src/server/server_family.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3766,7 +3766,7 @@ void ServerFamily::ReplTakeOver(CmdArgList args, const CommandContext& cmd_cntx)
37663766
LOG(INFO) << "Takeover successful, promoting this instance to master.";
37673767

37683768
if (IsClusterEnabled()) {
3769-
service().cluster_family().ReconcileMasterReplicaTakeoverSlots(false);
3769+
service().cluster_family().ReconcileReplicaSlots();
37703770
}
37713771

37723772
last_master_data_ = replica_->Stop();

0 commit comments

Comments
 (0)