@@ -25,8 +25,10 @@ class ReplDevDynamicTest : public RaftReplDevTestBase {
2525 }
2626};
2727
28+ #ifdef _PRERELEASE
2829TEST_F (ReplDevDynamicTest, ReplaceMember) {
2930 LOGINFO (" ReplaceMember test started replica={}" , g_helper->replica_num ());
31+ g_helper->set_basic_flip (" skip_monitor_replace_member_replication_status" , 1000 );
3032 // Write some IO's, replace a member, validate all members data except which is out.
3133 auto db = dbs_.back ();
3234 auto num_replicas = SISL_OPTIONS[" replicas" ].as < uint32_t >();
@@ -65,7 +67,13 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) {
6567 std::string new_task_id = " mismatched_task_id" ;
6668 replace_member (db, new_task_id, g_helper->replica_id (member_out), g_helper->replica_id (member_in), 0 ,
6769 ReplServiceError::REPLACE_MEMBER_TASK_MISMATCH);
70+ auto & raft_repl_svc = dynamic_cast < RaftReplService& >(hs ()->repl_service ());
6871 });
72+ // If the manual monitor_replace_member_replication_status fails, restore the periodical check.
73+ // restore the periodical check.
74+ LOGINFO (" restore monitor_replace_member_replication_status" )
75+ g_helper->remove_flip (" skip_monitor_replace_member_replication_status" );
76+
6977 if (is_replica_num_in ({0 , 1 , member_in})) {
7078 // Skip the member which is going to be replaced. Validate data on all other replica's.
7179 LOGINFO (" Validate all data written so far by reading them replica={}" , g_helper->replica_num ());
@@ -96,10 +104,11 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) {
96104 LOGINFO (" ReplaceMember test done replica={}" , g_helper->replica_num ());
97105}
98106
99- // After replace member is in progress, rollback replace member operation before complete_replace_member is
100- // called(triggered after 60s).
107+ // After replace member is in progress, rollback replace member operation(complete_replace_member will be disabled)
101108TEST_F (ReplDevDynamicTest, ReplaceMemberRollback) {
102109 LOGINFO (" ReplaceMember test started replica={}" , g_helper->replica_num ());
110+ // don't execute complete_replace_member in the background reaper thread within this test.
111+ g_helper->set_basic_flip (" skip_monitor_replace_member_replication_status" , 1000 );
103112 // Write some IO's, replace a member, validate all members data except which is out.
104113 auto db = dbs_.back ();
105114 auto num_replicas = SISL_OPTIONS[" replicas" ].as < uint32_t >();
@@ -125,6 +134,12 @@ TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
125134 LOGDEBUG (" Not added to group yet" )
126135 std::this_thread::sleep_for (std::chrono::microseconds (300 ));
127136 }
137+ // Need to wait for log being caught up. There is a known issue that if the removed member can not catch up
138+ // within 5*HB since respond leave_cluster_request, it has no chance to detect itself removed from the group.
139+ // Unlike other tests, what we remove is a normal member who has all logs, here we remove new member which is
140+ // very likely to be behind.
141+ wait_for_commits (num_io_entries);
142+ LOGINFO (" Member in got all commits" );
128143 }
129144
130145 g_helper->sync_for_verify_start (num_members);
@@ -141,9 +156,8 @@ TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
141156
142157 g_helper->sync_for_verify_start (num_members);
143158 LOGINFO (" rollback triggered, sync_for_verify_start replica={} " , g_helper->replica_num ());
144- // verify member_in is removed from group.
159+
145160 if (g_helper->replica_num () == member_in) {
146- // The member_in will have the repl dev destroyed.
147161 auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev ());
148162 while (repl_dev && !repl_dev->is_destroyed ()) {
149163 std::this_thread::sleep_for (std::chrono::seconds (1 ));
@@ -164,12 +178,13 @@ TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
164178 }
165179
166180 g_helper->sync_for_cleanup_start (num_members);
181+ g_helper->remove_flip (" skip_monitor_replace_member_replication_status" );
167182 LOGINFO (" ReplaceMember test done replica={}" , g_helper->replica_num ());
168183}
169184
170185TEST_F (ReplDevDynamicTest, TwoMemberDown) {
171186 LOGINFO (" TwoMemberDown test started replica={}" , g_helper->replica_num ());
172-
187+ g_helper-> set_basic_flip ( " skip_monitor_replace_member_replication_status " , 1000 );
173188 // Make two members down in a group and leader cant reach a quorum.
174189 // We set the custom quorum size to 1 and call replace member.
175190 // Leader should do some writes to validate it has reach quorum size.
@@ -238,6 +253,7 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) {
238253 }
239254
240255 g_helper->sync_for_cleanup_start (num_members);
256+ g_helper->remove_flip (" skip_monitor_replace_member_replication_status" );
241257 LOGINFO (" TwoMemberDown test done replica={}" , g_helper->replica_num ());
242258}
243259
@@ -246,6 +262,7 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) {
246262 // replica0 should be able to baseline resync to replica4(new member).
247263 // Write some IO's, replace a member, validate all members data except which is out.
248264 LOGINFO (" OutMemberDown test started replica={}" , g_helper->replica_num ());
265+ g_helper->set_basic_flip (" skip_monitor_replace_member_replication_status" , 1000 );
249266 auto db = dbs_.back ();
250267 auto num_replicas = SISL_OPTIONS[" replicas" ].as < uint32_t >();
251268 auto num_members = SISL_OPTIONS[" replicas" ].as < uint32_t >() + SISL_OPTIONS[" spare_replicas" ].as < uint32_t >();
@@ -298,6 +315,8 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) {
298315 check_replace_member_status (db, task_id, g_helper->replica_id (member_out), g_helper->replica_id (member_in)),
299316 ReplaceMemberStatus::IN_PROGRESS);
300317 });
318+ LOGINFO (" restore monitor_replace_member_replication_status" )
319+ g_helper->remove_flip (" skip_monitor_replace_member_replication_status" );
301320 // Since the out_member stopped, it cannot response to remove_srv req, as a result the first time will get CANCELLED
302321 // error, so waiting time is longer than other tests.
303322 if (g_helper->replica_num () == 2 ) {
@@ -337,6 +356,7 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) {
337356}
338357
339358TEST_F (ReplDevDynamicTest, LeaderReplace) {
359+ g_helper->set_basic_flip (" skip_monitor_replace_member_replication_status" , 1000 );
340360 // replica0(leader) and replica1 and replica2 is up. Replace replica0(leader) with replica3.
341361 // replica0 will yield leadership and any other replica will be come leader and leader
342362 // will do baseline resync to replica4(new member).
@@ -390,6 +410,9 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) {
390410 check_replace_member_status (db, task_id, g_helper->replica_id (member_out), g_helper->replica_id (member_in)),
391411 ReplaceMemberStatus::IN_PROGRESS);
392412 });
413+ // restore the periodical check.
414+ LOGINFO (" restore monitor_replace_member_replication_status" )
415+ g_helper->remove_flip (" skip_monitor_replace_member_replication_status" );
393416 if (g_helper->replica_num () == member_out) {
394417 // The out member will have the repl dev destroyed.
395418 auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev ());
@@ -417,6 +440,7 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) {
417440 // replica0 should be able to baseline resync to replica4(new member).
418441 // Write some IO's, replace a member, validate all members data except which is out.
419442 LOGINFO (" OneMemberRestart test started replica={}" , g_helper->replica_num ());
443+ g_helper->set_basic_flip (" skip_monitor_replace_member_replication_status" , 1000 );
420444 auto db = dbs_.back ();
421445 auto num_replicas = SISL_OPTIONS[" replicas" ].as < uint32_t >();
422446 auto num_members = SISL_OPTIONS[" replicas" ].as < uint32_t >() + SISL_OPTIONS[" spare_replicas" ].as < uint32_t >();
@@ -459,6 +483,9 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) {
459483 check_replace_member_status (db, task_id, g_helper->replica_id (member_out), g_helper->replica_id (member_in)),
460484 ReplaceMemberStatus::IN_PROGRESS);
461485 });
486+ LOGINFO (" restore monitor_replace_member_replication_status" )
487+ g_helper->remove_flip (" skip_monitor_replace_member_replication_status" );
488+
462489 if (g_helper->replica_num () == member_out) {
463490 // The out member will have the repl dev destroyed.
464491 auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev ());
@@ -479,6 +506,7 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) {
479506 });
480507 LOGINFO (" OneMemberRestart test done replica={}" , g_helper->replica_num ());
481508}
509+ #endif
482510
483511TEST_F (ReplDevDynamicTest, ValidateRequest) {
484512 LOGINFO (" ValidateRequest test started replica={}" , g_helper->replica_num ());
0 commit comments