Skip to content

Commit f5d3828

Browse files
author
yuwmao
committed
Improve ReplaceMemberRollback UT
- Don't check rdev being destroyed on removed member, because there is a known issue that if the removed member can not catch up within 5*HB since respond leave_cluster_request, it has no chance to detect itself removed from the group. - Disable complete_replace_member in ReplaceMemberRollback UT.
1 parent 4f4a98f commit f5d3828

File tree

4 files changed

+35
-7
lines changed

4 files changed

+35
-7
lines changed

conanfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
class HomestoreConan(ConanFile):
1111
name = "homestore"
12-
version = "7.1.2"
12+
version = "7.1.3"
1313

1414
homepage = "https://github.com/eBay/Homestore"
1515
description = "HomeStore Storage Engine"

src/lib/replication/service/raft_repl_service.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
*********************************************************************************/
1515
#include <sisl/logging/logging.h>
1616
#include <iomgr/io_environment.hpp>
17+
#include <iomgr/iomgr_flip.hpp>
1718
#include <chrono>
1819

1920
#include <boost/uuid/string_generator.hpp>
@@ -750,6 +751,12 @@ void RaftReplService::flush_durable_commit_lsn() {
750751

751752
void RaftReplService::monitor_replace_member_replication_status() {
752753
std::unique_lock lg(m_rd_map_mtx);
754+
#ifdef _PRERELEASE
755+
if (iomgr_flip::instance()->test_flip("disable_monitor_replace_member_replication_status")) {
756+
LOGINFOMOD(replication, "flip disable_monitor_replace_member_replication_status triggered");
757+
return;
758+
}
759+
#endif
753760
for (auto& rdev_parent : m_rd_map) {
754761
auto rdev = std::dynamic_pointer_cast< RaftReplDev >(rdev_parent.second);
755762
rdev->monitor_replace_member_replication_status();

src/tests/test_common/homestore_test_common.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ class HSTestHelper {
250250
freq.set_count(count);
251251
freq.set_percent(percent);
252252
m_fc.inject_noreturn_flip(flip_name, {null_cond}, freq);
253-
LOGDEBUG("Flip {} set", flip_name);
253+
LOGINFO("Flip {} set", flip_name);
254254
}
255255

256256
void set_delay_flip(const std::string flip_name, uint64_t delay_usec, uint32_t count = 1, uint32_t percent = 100) {

src/tests/test_raft_repl_dev_dynamic.cpp

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@ class ReplDevDynamicTest : public RaftReplDevTestBase {
2525
}
2626
};
2727

28+
#ifdef _PRERELEASE
2829
TEST_F(ReplDevDynamicTest, ReplaceMember) {
2930
LOGINFO("ReplaceMember test started replica={}", g_helper->replica_num());
31+
g_helper->set_basic_flip("disable_monitor_replace_member_replication_status", 1);
3032
// Write some IO's, replace a member, validate all members data except which is out.
3133
auto db = dbs_.back();
3234
auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >();
@@ -65,6 +67,10 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) {
6567
std::string new_task_id = "mismatched_task_id";
6668
replace_member(db, new_task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0,
6769
ReplServiceError::REPLACE_MEMBER_TASK_MISMATCH);
70+
auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service());
71+
// trigger complete_replace_member, default laggy threshold=2000, so complete_replace_member should be
72+
// triggered.
73+
raft_repl_svc.monitor_replace_member_replication_status();
6874
});
6975
if (is_replica_num_in({0, 1, member_in})) {
7076
// Skip the member which is going to be replaced. Validate data on all other replica's.
@@ -93,13 +99,14 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) {
9399
check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)),
94100
ReplaceMemberStatus::COMPLETED);
95101
});
102+
g_helper->remove_flip("disable_monitor_replace_member_replication_status");
96103
LOGINFO("ReplaceMember test done replica={}", g_helper->replica_num());
97104
}
98105

99-
// After replace member is in progress, rollback replace member operation before complete_replace_member is
100-
// called(triggered after 60s).
106+
// After replace member is in progress, rollback replace member operation(complete_replace_member will be disabled)
101107
TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
102108
LOGINFO("ReplaceMember test started replica={}", g_helper->replica_num());
109+
g_helper->set_basic_flip("disable_monitor_replace_member_replication_status", 1);
103110
// Write some IO's, replace a member, validate all members data except which is out.
104111
auto db = dbs_.back();
105112
auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >();
@@ -125,6 +132,12 @@ TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
125132
LOGDEBUG("Not added to group yet")
126133
std::this_thread::sleep_for(std::chrono::microseconds(300));
127134
}
135+
// Need to wait for log being caught up. There is a known issue that if the removed member can not catch up
136+
// within 5*HB since respond leave_cluster_request, it has no chance to detect itself removed from the group.
137+
// Unlike other tests, what we remove is a normal member who has all logs, here we remove new member which is
138+
// very likely to be behind.
139+
wait_for_commits(num_io_entries);
140+
LOGINFO("Member in got all commits");
128141
}
129142

130143
g_helper->sync_for_verify_start(num_members);
@@ -141,9 +154,8 @@ TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
141154

142155
g_helper->sync_for_verify_start(num_members);
143156
LOGINFO("rollback triggered, sync_for_verify_start replica={} ", g_helper->replica_num());
144-
// verify member_in is removed from group.
157+
145158
if (g_helper->replica_num() == member_in) {
146-
// The member_in will have the repl dev destroyed.
147159
auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev());
148160
while (repl_dev && !repl_dev->is_destroyed()) {
149161
std::this_thread::sleep_for(std::chrono::seconds(1));
@@ -164,12 +176,13 @@ TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
164176
}
165177

166178
g_helper->sync_for_cleanup_start(num_members);
179+
g_helper->remove_flip("disable_monitor_replace_member_replication_status");
167180
LOGINFO("ReplaceMember test done replica={}", g_helper->replica_num());
168181
}
169182

170183
TEST_F(ReplDevDynamicTest, TwoMemberDown) {
171184
LOGINFO("TwoMemberDown test started replica={}", g_helper->replica_num());
172-
185+
g_helper->set_basic_flip("disable_monitor_replace_member_replication_status", 1);
173186
// Make two members down in a group and leader cant reach a quorum.
174187
// We set the custom quorum size to 1 and call replace member.
175188
// Leader should do some writes to validate it has reach quorum size.
@@ -238,6 +251,7 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) {
238251
}
239252

240253
g_helper->sync_for_cleanup_start(num_members);
254+
g_helper->remove_flip("disable_monitor_replace_member_replication_status");
241255
LOGINFO("TwoMemberDown test done replica={}", g_helper->replica_num());
242256
}
243257

@@ -246,6 +260,7 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) {
246260
// replica0 should be able to baseline resync to replica4(new member).
247261
// Write some IO's, replace a member, validate all members data except which is out.
248262
LOGINFO("OutMemberDown test started replica={}", g_helper->replica_num());
263+
g_helper->set_basic_flip("disable_monitor_replace_member_replication_status", 1);
249264
auto db = dbs_.back();
250265
auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >();
251266
auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >();
@@ -333,10 +348,12 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) {
333348
});
334349
}
335350
g_helper->sync_for_cleanup_start(num_members);
351+
g_helper->remove_flip("disable_monitor_replace_member_replication_status");
336352
LOGINFO("OutMemberDown test done replica={}", g_helper->replica_num());
337353
}
338354

339355
TEST_F(ReplDevDynamicTest, LeaderReplace) {
356+
g_helper->set_basic_flip("disable_monitor_replace_member_replication_status", 1);
340357
// replica0(leader) and replica1 and replica2 is up. Replace replica0(leader) with replica3.
341358
// replica0 will yield leadership and any other replica will be come leader and leader
342359
// will do baseline resync to replica4(new member).
@@ -389,6 +406,8 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) {
389406
ASSERT_EQ(
390407
check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)),
391408
ReplaceMemberStatus::IN_PROGRESS);
409+
auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service());
410+
raft_repl_svc.monitor_replace_member_replication_status();
392411
});
393412
if (g_helper->replica_num() == member_out) {
394413
// The out member will have the repl dev destroyed.
@@ -409,6 +428,7 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) {
409428
check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)),
410429
ReplaceMemberStatus::COMPLETED);
411430
});
431+
g_helper->remove_flip("disable_monitor_replace_member_replication_status");
412432
LOGINFO("LeaderReplace test done replica={}", g_helper->replica_num());
413433
}
414434

@@ -479,6 +499,7 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) {
479499
});
480500
LOGINFO("OneMemberRestart test done replica={}", g_helper->replica_num());
481501
}
502+
#endif
482503

483504
TEST_F(ReplDevDynamicTest, ValidateRequest) {
484505
LOGINFO("ValidateRequest test started replica={}", g_helper->replica_num());

0 commit comments

Comments
 (0)