[POC - do not review yet] chore: almost mutex free replication #5758
base: main
@@ -242,6 +242,11 @@ using detail::SaveStagesController;
using http::StringResponse;
using strings::HumanReadableNumBytes;

// Initialized by REPLICAOF
thread_local std::shared_ptr<Replica> tl_replica = nullptr;
// Initialized by ADDREPLICAOF
thread_local std::vector<std::shared_ptr<Replica>> tl_cluster_replicas;

namespace {

// TODO these should be configurable as command line flag and at runtime via config set
@@ -1228,6 +1233,11 @@ void ServerFamily::Shutdown() {
dfly_cmd_->Shutdown();
DebugCmd::Shutdown();
});

service_.proactor_pool().AwaitFiberOnAll([](auto index, auto* cntx) {
tl_replica = nullptr;
tl_cluster_replicas.clear();
});
}

bool ServerFamily::HasPrivilegedInterface() {
@@ -3130,12 +3140,15 @@ string ServerFamily::FormatInfoMetrics(const Metrics& m, std::string_view sectio
append("psync_attempts", rinfo.psync_attempts);
append("psync_successes", rinfo.psync_successes);
};
fb2::LockGuard lk(replicaof_mu_);
// Deep copy because tl_replica might be overwritten inbetween
auto replica = tl_replica;

replication_info_cb(replica_->GetSummary());
replication_info_cb(replica->GetSummary());

// Deep copy because tl_cluster_replicas might be overwritten inbetween
auto cluster_replicas = tl_cluster_replicas;
// Special case, when multiple masters replicate to a single replica.
for (const auto& replica : cluster_replicas_) {
for (const auto& replica : cluster_replicas) {
replication_info_cb(replica->GetSummary());
}
}
@@ -3417,7 +3430,7 @@ void ServerFamily::AddReplicaOf(CmdArgList args, const CommandContext& cmd_cntx)
}
LOG(INFO) << "Add Replica " << *replicaof_args;

auto add_replica = make_unique<Replica>(replicaof_args->host, replicaof_args->port, &service_,
auto add_replica = make_shared<Replica>(replicaof_args->host, replicaof_args->port, &service_,
master_replid(), replicaof_args->slot_range);
GenericError ec = add_replica->Start();
if (ec) {
@@ -3426,77 +3439,76 @@ void ServerFamily::AddReplicaOf(CmdArgList args, const CommandContext& cmd_cntx)
}
add_replica->StartMainReplicationFiber(nullopt);
cluster_replicas_.push_back(std::move(add_replica));
cmd_cntx.rb->SendOk();
}

void ServerFamily::ReplicaOfInternal(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder,
ActionOnConnectionFail on_err) {
std::shared_ptr<Replica> new_replica;
std::optional<Replica::LastMasterSyncData> last_master_data;
{
util::fb2::LockGuard lk(replicaof_mu_); // Only one REPLICAOF command can run at a time
service_.proactor_pool().AwaitFiberOnAll(
[this](auto index, auto* cntx)
ABSL_NO_THREAD_SAFETY_ANALYSIS { tl_cluster_replicas = cluster_replicas_; });

// We should not execute replica of command while loading from snapshot.
ServerState* ss = ServerState::tlocal();
if (ss->is_master && ss->gstate() == GlobalState::LOADING) {
builder->SendError(kLoadingErr);
return;
}

auto replicaof_args = ReplicaOfArgs::FromCmdArgs(args, builder);
if (!replicaof_args.has_value()) {
return;
}

LOG(INFO) << "Replicating " << *replicaof_args;

// If NO ONE was supplied, just stop the current replica (if it exists)
if (replicaof_args->IsReplicaOfNoOne()) {
if (!ss->is_master) {
CHECK(replica_);

SetMasterFlagOnAllThreads(true); // Flip flag before clearing replica
last_master_data_ = replica_->Stop();
replica_.reset();
cmd_cntx.rb->SendOk();
}

StopAllClusterReplicas();
}
void ServerFamily::ReplicaOfNoOne(SinkReplyBuilder* builder) {
util::fb2::LockGuard lk(replicaof_mu_);
ServerState* ss = ServerState::tlocal();

// May not switch to ACTIVE if the process is, for example, shutting down at the same time.
service_.SwitchState(GlobalState::LOADING, GlobalState::ACTIVE);
if (!ss->is_master) {
CHECK(replica_);
// flip flag before clearing replica_
SetMasterFlagOnAllThreads(true);
// TODO we should not allow partial sync after NO-ONE. Only after Takeover.
last_master_data_ = replica_->Stop();
// TODO set thread locals to nullptr

replica_.reset();
StopAllClusterReplicas();
service_.proactor_pool().AwaitFiberOnAll([](auto index, auto* cntx) { tl_replica = nullptr; });
}

return builder->SendOk();
}
// May not switch to ACTIVE if the process is, for example, shutting down at the same time.
service_.SwitchState(GlobalState::LOADING, GlobalState::ACTIVE);

// If any replication is in progress, stop it, cancellation should kick in immediately
return builder->SendOk();
}

if (replica_)
last_master_data = replica_->Stop();
StopAllClusterReplicas();
bool ServerFamily::IsDragonflyLoadingAtomic() {
util::fb2::LockGuard lk(replicaof_mu_);
ServerState* ss = ServerState::tlocal();

const GlobalState gstate = ServerState::tlocal()->gstate();
if (gstate == GlobalState::TAKEN_OVER) {
service_.SwitchState(GlobalState::TAKEN_OVER, GlobalState::LOADING);
} else if (auto new_state = service_.SwitchState(GlobalState::ACTIVE, GlobalState::LOADING);
new_state != GlobalState::LOADING) {
LOG(WARNING) << new_state << " in progress, ignored";
builder->SendError("Invalid state");
return;
}
return ss->is_master && ss->gstate() == GlobalState::LOADING;
}

// Create a new replica and assign it
new_replica = make_shared<Replica>(replicaof_args->host, replicaof_args->port, &service_,
master_replid(), replicaof_args->slot_range);
void ServerFamily::ReplicaOfInternal(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder,
ActionOnConnectionFail on_err) {
std::optional<Replica::LastMasterSyncData> last_master_data;

replica_ = new_replica;
auto replicaof_args = ReplicaOfArgs::FromCmdArgs(args, builder);
if (!replicaof_args.has_value()) {
return;
}

// TODO: disconnect pending blocked clients (pubsub, blocking commands)
SetMasterFlagOnAllThreads(false); // Flip flag after assiging replica
LOG(INFO) << "Initiate replication with: " << *replicaof_args;
// This is a "weak" check. For example, if the node is already a replica,
// it could be the case that one of the flows disconnects. The MainReplicationFiber
// will then loop and if it can't partial sync it will enter LOADING state because of
// full sync. Note that the fiber is not aware of the replicaof_mu_ so even
// if that mutex is locked below before any state check we can't really enforce
// that the old replication fiber won't try to full sync and update the state to LOADING.
// What is more here is that we always call `replica->Stop()`. So even if we end up in the
// scenario described, the semantics are well defined. First, cancel the old replica and
// move on with the new one. Cancelation will be slower and ReplicaOf() will
// induce higher latency -- but that's ok because it's an highly improbable flow with
// well defined semantics.
if (IsDragonflyLoadingAtomic()) {
builder->SendError(kLoadingErr);
return;
}

} // release the lock, lk.unlock()
// We proceed connecting below without the lock to allow interrupting the replica immediately.
// From this point and onward, it should be highly responsive.
// replicaof no one
if (replicaof_args->IsReplicaOfNoOne()) {
return ReplicaOfNoOne(builder);
}

auto new_replica = make_shared<Replica>(replicaof_args->host, replicaof_args->port, &service_,
master_replid(), replicaof_args->slot_range);
GenericError ec{};
switch (on_err) {
case ActionOnConnectionFail::kReturnOnError:
@@ -3507,30 +3519,31 @@ void ServerFamily::ReplicaOfInternal(CmdArgList args, Transaction* tx, SinkReply
break;
};

// If the replication attempt failed, clean up global state. The replica should have stopped
// internally.
util::fb2::LockGuard lk(replicaof_mu_); // Only one REPLICAOF command can run at a time

// If there was an error above during Start we must not start the main replication fiber.
// However, it could be the case that Start() above connected succefully and by the time
// we acquire the lock, the context got cancelled because another ReplicaOf command
// executed and acquired the replicaof_mu_ before us.
const bool cancelled = new_replica->IsContextCancelled();
if (ec || cancelled) {
if (replica_ == new_replica) {
service_.SwitchState(GlobalState::LOADING, GlobalState::ACTIVE);
SetMasterFlagOnAllThreads(true);
replica_.reset();
}
builder->SendError(ec ? ec.Format() : "replication cancelled");
return;
if (ec || new_replica->IsContextCancelled()) {
return builder->SendError(ec ? ec.Format() : "replication cancelled");
}
// Successfully connected now we flush
// If we are called by "Replicate", tx will be null but we do not need
// to flush anything.

util::fb2::LockGuard lk(replicaof_mu_);
Bread and butter no2. No more
if (replica_)
last_master_data = replica_->Stop();

StopAllClusterReplicas();

if (ServerState::tlocal()->gstate() == GlobalState::TAKEN_OVER)
service_.SwitchState(GlobalState::TAKEN_OVER, GlobalState::LOADING);

// Update thread locals. That way INFO never blocks
replica_ = new_replica;
service_.proactor_pool().AwaitFiberOnAll([new_replica](auto index, auto* context) {
tl_replica = new_replica;
tl_cluster_replicas.clear();
});
SetMasterFlagOnAllThreads(false);

if (on_err == ActionOnConnectionFail::kReturnOnError) {
new_replica->StartMainReplicationFiber(last_master_data);
replica_->StartMainReplicationFiber(last_master_data);
}

builder->SendOk();
}
@@ -3608,6 +3621,9 @@ void ServerFamily::ReplTakeOver(CmdArgList args, const CommandContext& cmd_cntx)
SetMasterFlagOnAllThreads(true);
last_master_data_ = replica_->Stop();
replica_.reset();

service_.proactor_pool().AwaitFiberOnAll([](auto index, auto* context) { tl_replica = nullptr; });

return builder->SendOk();
}

@@ -448,9 +448,6 @@ async def test_cancel_replication_immediately(df_factory, df_seeder_factory: Dfl
"""
Issue 100 replication commands. This checks that the replication state
machine can handle cancellation well.
We assert that at least one command was cancelled.
After we finish the 'fuzzing' part, replicate the first master and check that
all the data is correct.
"""
COMMANDS_TO_ISSUE = 100

@@ -484,7 +481,7 @@ async def replicate():
num_successes += await result

logging.info(f"succeses: {num_successes}")
assert COMMANDS_TO_ISSUE > num_successes, "At least one REPLICAOF must be cancelled"
assert COMMANDS_TO_ISSUE == num_successes
The new algorithm does not use
This cannot happen anymore because we lock only once and atomically update everything, including stopping the previous replica. So by the time (2) grabs the lock in the example above, the previous REPLICAOF command is not in some intermediate state. To observe that we did indeed cancel, we should read the logs and look for "Stopping replication". A bonus is that I suspect we might also be able to solve #4685, but I will follow up on that separately.
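For illustration, here is a minimal, self-contained sketch of the write path this comment describes, under simplified assumptions: Replica, replicaof_mu, tl_replica and SwitchReplica below are hypothetical stand-ins for the Dragonfly types, and a plain callback stands in for service_.proactor_pool().AwaitFiberOnAll. The point is that one critical section stops the old replica, installs the new one, and publishes it to every thread, so a concurrent REPLICAOF never observes an intermediate state.

```cpp
// Minimal sketch, not Dragonfly code: SwitchReplica and publish_on_all_threads are
// hypothetical stand-ins for ReplicaOfInternal and AwaitFiberOnAll.
#include <functional>
#include <memory>
#include <mutex>

struct Replica {
  void Stop() {}  // in the PR, Stop() also returns LastMasterSyncData for partial sync
};

std::mutex replicaof_mu;                           // analogue of replicaof_mu_
std::shared_ptr<Replica> replica_owner;            // analogue of ServerFamily::replica_
thread_local std::shared_ptr<Replica> tl_replica;  // analogue of the new thread_local

// One critical section: stop the old replica, install the new one, publish it to all
// threads. Nothing is left half-done when the lock is released.
void SwitchReplica(std::shared_ptr<Replica> new_replica,
                   const std::function<void(std::shared_ptr<Replica>)>& publish_on_all_threads) {
  std::lock_guard<std::mutex> lk(replicaof_mu);
  if (replica_owner)
    replica_owner->Stop();  // cancellation of the previous replication happens here
  replica_owner = new_replica;
  publish_on_all_threads(new_replica);  // in the PR: AwaitFiberOnAll sets tl_replica per thread
}

int main() {
  // Single-threaded demo publish; the real code fans this out to every proactor thread.
  auto publish = [](std::shared_ptr<Replica> r) { tl_replica = std::move(r); };
  SwitchReplica(std::make_shared<Replica>(), publish);
  SwitchReplica(std::make_shared<Replica>(), publish);  // first replica is stopped under the lock
}
```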

await wait_available_async(c_replica)
capture = await seeder.capture()

@@ -493,6 +490,9 @@ async def replicate():

ping_job.cancel()

replica.stop()
lines = replica.find_in_logs("Stopping replication")

"""
Test flushall command. Set data to master send flashall and set more data.
@@ -2967,8 +2967,7 @@ async def replicate_inside_multi():
num_successes += await result

logging.info(f"succeses: {num_successes}")
assert MULTI_COMMANDS_TO_ISSUE > num_successes, "At least one REPLICAOF must be cancelled"
assert num_successes > 0, "At least one REPLICAOF must success"
assert MULTI_COMMANDS_TO_ISSUE == num_successes

async def test_preempt_in_atomic_section_of_heartbeat(df_factory: DflyInstanceFactory):
Bread and butter no1 of this PR: bye-bye blocking INFO command because of the mutex.
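To make the point concrete, here is a minimal sketch of the read side, again with simplified stand-in types (ReplicaSummary, GetSummary, ReplicationInfo are assumptions, not the actual FormatInfoMetrics code): INFO copies the thread-local shared_ptr and works on the copy, so it never takes replicaof_mu_ and can no longer be blocked by a slow REPLICAOF.

```cpp
// Minimal sketch of the lock-free INFO read path; types are simplified stand-ins.
#include <memory>
#include <optional>
#include <string>

struct ReplicaSummary {  // stand-in for what Replica::GetSummary() returns
  std::string host;
  bool full_sync_done = false;
};

struct Replica {
  ReplicaSummary GetSummary() const { return {"master.example", true}; }
};

thread_local std::shared_ptr<Replica> tl_replica;  // published by the REPLICAOF path

// INFO REPLICATION: take our own reference to the thread-local pointer, then use it.
// If a concurrent REPLICAOF republishes tl_replica (via a fiber on this thread),
// our copy keeps the old Replica alive, so no mutex is needed.
std::optional<ReplicaSummary> ReplicationInfo() {
  auto replica = tl_replica;  // the "deep copy" the diff's comment refers to
  if (!replica)
    return std::nullopt;      // node is a master, nothing to report
  return replica->GetSummary();
}

int main() {
  tl_replica = std::make_shared<Replica>();
  auto summary = ReplicationInfo();
  return summary && summary->full_sync_done ? 0 : 1;
}
```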
TODO: worth adding a test.