Skip to content

Commit e999956

Browse files
author
yuwmao
committed
Improve ReplaceMemberRollback UT
- Add flip to control complete_replace_member. - In ReplaceMemberRollback UT, wait for in member being caught up before rollback.
1 parent 4f4a98f commit e999956

File tree

4 files changed

+46
-8
lines changed

4 files changed

+46
-8
lines changed

conanfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
class HomestoreConan(ConanFile):
1111
name = "homestore"
12-
version = "7.1.2"
12+
version = "7.1.3"
1313

1414
homepage = "https://github.com/eBay/Homestore"
1515
description = "HomeStore Storage Engine"

src/lib/replication/service/raft_repl_service.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
*********************************************************************************/
1515
#include <sisl/logging/logging.h>
1616
#include <iomgr/io_environment.hpp>
17+
#include <iomgr/iomgr_flip.hpp>
1718
#include <chrono>
1819

1920
#include <boost/uuid/string_generator.hpp>
@@ -655,7 +656,16 @@ void RaftReplService::start_repl_service_timers() {
655656

656657
m_replace_member_sync_check_timer_hdl = iomanager.schedule_global_timer(
657658
HS_DYNAMIC_CONFIG(consensus.replace_member_sync_check_interval_ms) * 1000 * 1000, true /* recurring */, nullptr,
658-
iomgr::reactor_regex::all_worker, [this](void*) { monitor_replace_member_replication_status(); },
659+
iomgr::reactor_regex::all_worker,
660+
[this](void*) {
661+
#ifdef _PRERELEASE
662+
if (iomgr_flip::instance()->test_flip("skip_monitor_replace_member_replication_status")) {
663+
LOGINFOMOD(replication, "flip skip_monitor_replace_member_replication_status triggered");
664+
return;
665+
}
666+
#endif
667+
monitor_replace_member_replication_status();
668+
},
659669
true /* wait_to_schedule */);
660670
}
661671

src/tests/test_common/homestore_test_common.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ class HSTestHelper {
250250
freq.set_count(count);
251251
freq.set_percent(percent);
252252
m_fc.inject_noreturn_flip(flip_name, {null_cond}, freq);
253-
LOGDEBUG("Flip {} set", flip_name);
253+
LOGINFO("Flip {} set", flip_name);
254254
}
255255

256256
void set_delay_flip(const std::string flip_name, uint64_t delay_usec, uint32_t count = 1, uint32_t percent = 100) {

src/tests/test_raft_repl_dev_dynamic.cpp

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@ class ReplDevDynamicTest : public RaftReplDevTestBase {
2525
}
2626
};
2727

28+
#ifdef _PRERELEASE
2829
TEST_F(ReplDevDynamicTest, ReplaceMember) {
2930
LOGINFO("ReplaceMember test started replica={}", g_helper->replica_num());
31+
g_helper->set_basic_flip("skip_monitor_replace_member_replication_status", 1000);
3032
// Write some IO's, replace a member, validate all members data except which is out.
3133
auto db = dbs_.back();
3234
auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >();
@@ -65,7 +67,13 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) {
6567
std::string new_task_id = "mismatched_task_id";
6668
replace_member(db, new_task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in), 0,
6769
ReplServiceError::REPLACE_MEMBER_TASK_MISMATCH);
70+
auto& raft_repl_svc = dynamic_cast< RaftReplService& >(hs()->repl_service());
6871
});
72+
// If the manual monitor_replace_member_replication_status fails, restore the periodical check.
73+
// restore the periodical check.
74+
LOGINFO("restore monitor_replace_member_replication_status")
75+
g_helper->remove_flip("skip_monitor_replace_member_replication_status");
76+
6977
if (is_replica_num_in({0, 1, member_in})) {
7078
// Skip the member which is going to be replaced. Validate data on all other replica's.
7179
LOGINFO("Validate all data written so far by reading them replica={}", g_helper->replica_num());
@@ -96,10 +104,11 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) {
96104
LOGINFO("ReplaceMember test done replica={}", g_helper->replica_num());
97105
}
98106

99-
// After replace member is in progress, rollback replace member operation before complete_replace_member is
100-
// called(triggered after 60s).
107+
// After replace member is in progress, rollback replace member operation(complete_replace_member will be disabled)
101108
TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
102109
LOGINFO("ReplaceMember test started replica={}", g_helper->replica_num());
110+
// don't execute complete_replace_member in the background reaper thread within this test.
111+
g_helper->set_basic_flip("skip_monitor_replace_member_replication_status", 1000);
103112
// Write some IO's, replace a member, validate all members data except which is out.
104113
auto db = dbs_.back();
105114
auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >();
@@ -125,6 +134,12 @@ TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
125134
LOGDEBUG("Not added to group yet")
126135
std::this_thread::sleep_for(std::chrono::microseconds(300));
127136
}
137+
// Need to wait for log being caught up. There is a known issue that if the removed member can not catch up
138+
// within 5*HB since respond leave_cluster_request, it has no chance to detect itself removed from the group.
139+
// Unlike other tests, what we remove is a normal member who has all logs, here we remove new member which is
140+
// very likely to be behind.
141+
wait_for_commits(num_io_entries);
142+
LOGINFO("Member in got all commits");
128143
}
129144

130145
g_helper->sync_for_verify_start(num_members);
@@ -141,9 +156,8 @@ TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
141156

142157
g_helper->sync_for_verify_start(num_members);
143158
LOGINFO("rollback triggered, sync_for_verify_start replica={} ", g_helper->replica_num());
144-
// verify member_in is removed from group.
159+
145160
if (g_helper->replica_num() == member_in) {
146-
// The member_in will have the repl dev destroyed.
147161
auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev());
148162
while (repl_dev && !repl_dev->is_destroyed()) {
149163
std::this_thread::sleep_for(std::chrono::seconds(1));
@@ -164,12 +178,13 @@ TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
164178
}
165179

166180
g_helper->sync_for_cleanup_start(num_members);
181+
g_helper->remove_flip("skip_monitor_replace_member_replication_status");
167182
LOGINFO("ReplaceMember test done replica={}", g_helper->replica_num());
168183
}
169184

170185
TEST_F(ReplDevDynamicTest, TwoMemberDown) {
171186
LOGINFO("TwoMemberDown test started replica={}", g_helper->replica_num());
172-
187+
g_helper->set_basic_flip("skip_monitor_replace_member_replication_status", 1000);
173188
// Make two members down in a group and leader cant reach a quorum.
174189
// We set the custom quorum size to 1 and call replace member.
175190
// Leader should do some writes to validate it has reach quorum size.
@@ -238,6 +253,7 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) {
238253
}
239254

240255
g_helper->sync_for_cleanup_start(num_members);
256+
g_helper->remove_flip("skip_monitor_replace_member_replication_status");
241257
LOGINFO("TwoMemberDown test done replica={}", g_helper->replica_num());
242258
}
243259

@@ -246,6 +262,7 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) {
246262
// replica0 should be able to baseline resync to replica4(new member).
247263
// Write some IO's, replace a member, validate all members data except which is out.
248264
LOGINFO("OutMemberDown test started replica={}", g_helper->replica_num());
265+
g_helper->set_basic_flip("skip_monitor_replace_member_replication_status", 1000);
249266
auto db = dbs_.back();
250267
auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >();
251268
auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >();
@@ -298,6 +315,8 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) {
298315
check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)),
299316
ReplaceMemberStatus::IN_PROGRESS);
300317
});
318+
LOGINFO("restore monitor_replace_member_replication_status")
319+
g_helper->remove_flip("skip_monitor_replace_member_replication_status");
301320
// Since the out_member stopped, it cannot response to remove_srv req, as a result the first time will get CANCELLED
302321
// error, so waiting time is longer than other tests.
303322
if (g_helper->replica_num() == 2) {
@@ -337,6 +356,7 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) {
337356
}
338357

339358
TEST_F(ReplDevDynamicTest, LeaderReplace) {
359+
g_helper->set_basic_flip("skip_monitor_replace_member_replication_status", 1000);
340360
// replica0(leader) and replica1 and replica2 is up. Replace replica0(leader) with replica3.
341361
// replica0 will yield leadership and any other replica will be come leader and leader
342362
// will do baseline resync to replica4(new member).
@@ -390,6 +410,9 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) {
390410
check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)),
391411
ReplaceMemberStatus::IN_PROGRESS);
392412
});
413+
// restore the periodical check.
414+
LOGINFO("restore monitor_replace_member_replication_status")
415+
g_helper->remove_flip("skip_monitor_replace_member_replication_status");
393416
if (g_helper->replica_num() == member_out) {
394417
// The out member will have the repl dev destroyed.
395418
auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev());
@@ -417,6 +440,7 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) {
417440
// replica0 should be able to baseline resync to replica4(new member).
418441
// Write some IO's, replace a member, validate all members data except which is out.
419442
LOGINFO("OneMemberRestart test started replica={}", g_helper->replica_num());
443+
g_helper->set_basic_flip("skip_monitor_replace_member_replication_status", 1000);
420444
auto db = dbs_.back();
421445
auto num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >();
422446
auto num_members = SISL_OPTIONS["replicas"].as< uint32_t >() + SISL_OPTIONS["spare_replicas"].as< uint32_t >();
@@ -459,6 +483,9 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) {
459483
check_replace_member_status(db, task_id, g_helper->replica_id(member_out), g_helper->replica_id(member_in)),
460484
ReplaceMemberStatus::IN_PROGRESS);
461485
});
486+
LOGINFO("restore monitor_replace_member_replication_status")
487+
g_helper->remove_flip("skip_monitor_replace_member_replication_status");
488+
462489
if (g_helper->replica_num() == member_out) {
463490
// The out member will have the repl dev destroyed.
464491
auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev());
@@ -479,6 +506,7 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) {
479506
});
480507
LOGINFO("OneMemberRestart test done replica={}", g_helper->replica_num());
481508
}
509+
#endif
482510

483511
TEST_F(ReplDevDynamicTest, ValidateRequest) {
484512
LOGINFO("ValidateRequest test started replica={}", g_helper->replica_num());

0 commit comments

Comments
 (0)