@@ -25,8 +25,10 @@ class ReplDevDynamicTest : public RaftReplDevTestBase {
2525 }
2626};
2727
28+ #ifdef _PRERELEASE
2829TEST_F (ReplDevDynamicTest, ReplaceMember) {
2930 LOGINFO (" ReplaceMember test started replica={}" , g_helper->replica_num ());
31+ g_helper->set_basic_flip (" disable_monitor_replace_member_replication_status" , 1 );
3032 // Write some IO's, replace a member, validate all members data except which is out.
3133 auto db = dbs_.back ();
3234 auto num_replicas = SISL_OPTIONS[" replicas" ].as < uint32_t >();
@@ -65,6 +67,10 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) {
6567 std::string new_task_id = " mismatched_task_id" ;
6668 replace_member (db, new_task_id, g_helper->replica_id (member_out), g_helper->replica_id (member_in), 0 ,
6769 ReplServiceError::REPLACE_MEMBER_TASK_MISMATCH);
70+ auto & raft_repl_svc = dynamic_cast < RaftReplService& >(hs ()->repl_service ());
71+ // trigger complete_replace_member, default laggy threshold=2000, so complete_replace_member should be
72+ // triggered.
73+ raft_repl_svc.monitor_replace_member_replication_status ();
6874 });
6975 if (is_replica_num_in ({0 , 1 , member_in})) {
7076 // Skip the member which is going to be replaced. Validate data on all other replica's.
@@ -93,13 +99,14 @@ TEST_F(ReplDevDynamicTest, ReplaceMember) {
9399 check_replace_member_status (db, task_id, g_helper->replica_id (member_out), g_helper->replica_id (member_in)),
94100 ReplaceMemberStatus::COMPLETED);
95101 });
102+ g_helper->remove_flip (" disable_monitor_replace_member_replication_status" );
96103 LOGINFO (" ReplaceMember test done replica={}" , g_helper->replica_num ());
97104}
98105
99- // After replace member is in progress, rollback replace member operation before complete_replace_member is
100- // called(triggered after 60s).
106+ // After replace member is in progress, rollback replace member operation(complete_replace_member will be disabled)
101107TEST_F (ReplDevDynamicTest, ReplaceMemberRollback) {
102108 LOGINFO (" ReplaceMember test started replica={}" , g_helper->replica_num ());
109+ g_helper->set_basic_flip (" disable_monitor_replace_member_replication_status" , 1 );
103110 // Write some IO's, replace a member, validate all members data except which is out.
104111 auto db = dbs_.back ();
105112 auto num_replicas = SISL_OPTIONS[" replicas" ].as < uint32_t >();
@@ -125,6 +132,12 @@ TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
125132 LOGDEBUG (" Not added to group yet" )
126133 std::this_thread::sleep_for (std::chrono::microseconds (300 ));
127134 }
135+ // Need to wait for log being caught up. There is a known issue that if the removed member can not catch up
136+ // within 5*HB since respond leave_cluster_request, it has no chance to detect itself removed from the group.
137+ // Unlike other tests, what we remove is a normal member who has all logs, here we remove new member which is
138+ // very likely to be behind.
139+ wait_for_commits (num_io_entries);
140+ LOGINFO (" Member in got all commits" );
128141 }
129142
130143 g_helper->sync_for_verify_start (num_members);
@@ -141,9 +154,8 @@ TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
141154
142155 g_helper->sync_for_verify_start (num_members);
143156 LOGINFO (" rollback triggered, sync_for_verify_start replica={} " , g_helper->replica_num ());
144- // verify member_in is removed from group.
157+
145158 if (g_helper->replica_num () == member_in) {
146- // The member_in will have the repl dev destroyed.
147159 auto repl_dev = std::dynamic_pointer_cast< RaftReplDev >(db->repl_dev ());
148160 while (repl_dev && !repl_dev->is_destroyed ()) {
149161 std::this_thread::sleep_for (std::chrono::seconds (1 ));
@@ -164,12 +176,13 @@ TEST_F(ReplDevDynamicTest, ReplaceMemberRollback) {
164176 }
165177
166178 g_helper->sync_for_cleanup_start (num_members);
179+ g_helper->remove_flip (" disable_monitor_replace_member_replication_status" );
167180 LOGINFO (" ReplaceMember test done replica={}" , g_helper->replica_num ());
168181}
169182
170183TEST_F (ReplDevDynamicTest, TwoMemberDown) {
171184 LOGINFO (" TwoMemberDown test started replica={}" , g_helper->replica_num ());
172-
185+ g_helper-> set_basic_flip ( " disable_monitor_replace_member_replication_status " , 1 );
173186 // Make two members down in a group and leader cant reach a quorum.
174187 // We set the custom quorum size to 1 and call replace member.
175188 // Leader should do some writes to validate it has reach quorum size.
@@ -238,6 +251,7 @@ TEST_F(ReplDevDynamicTest, TwoMemberDown) {
238251 }
239252
240253 g_helper->sync_for_cleanup_start (num_members);
254+ g_helper->remove_flip (" disable_monitor_replace_member_replication_status" );
241255 LOGINFO (" TwoMemberDown test done replica={}" , g_helper->replica_num ());
242256}
243257
@@ -246,6 +260,7 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) {
246260 // replica0 should be able to baseline resync to replica4(new member).
247261 // Write some IO's, replace a member, validate all members data except which is out.
248262 LOGINFO (" OutMemberDown test started replica={}" , g_helper->replica_num ());
263+ g_helper->set_basic_flip (" disable_monitor_replace_member_replication_status" , 1 );
249264 auto db = dbs_.back ();
250265 auto num_replicas = SISL_OPTIONS[" replicas" ].as < uint32_t >();
251266 auto num_members = SISL_OPTIONS[" replicas" ].as < uint32_t >() + SISL_OPTIONS[" spare_replicas" ].as < uint32_t >();
@@ -333,10 +348,12 @@ TEST_F(ReplDevDynamicTest, OutMemberDown) {
333348 });
334349 }
335350 g_helper->sync_for_cleanup_start (num_members);
351+ g_helper->remove_flip (" disable_monitor_replace_member_replication_status" );
336352 LOGINFO (" OutMemberDown test done replica={}" , g_helper->replica_num ());
337353}
338354
339355TEST_F (ReplDevDynamicTest, LeaderReplace) {
356+ g_helper->set_basic_flip (" disable_monitor_replace_member_replication_status" , 1 );
340357 // replica0(leader) and replica1 and replica2 is up. Replace replica0(leader) with replica3.
341358 // replica0 will yield leadership and any other replica will be come leader and leader
342359 // will do baseline resync to replica4(new member).
@@ -389,6 +406,8 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) {
389406 ASSERT_EQ (
390407 check_replace_member_status (db, task_id, g_helper->replica_id (member_out), g_helper->replica_id (member_in)),
391408 ReplaceMemberStatus::IN_PROGRESS);
409+ auto & raft_repl_svc = dynamic_cast < RaftReplService& >(hs ()->repl_service ());
410+ raft_repl_svc.monitor_replace_member_replication_status ();
392411 });
393412 if (g_helper->replica_num () == member_out) {
394413 // The out member will have the repl dev destroyed.
@@ -409,6 +428,7 @@ TEST_F(ReplDevDynamicTest, LeaderReplace) {
409428 check_replace_member_status (db, task_id, g_helper->replica_id (member_out), g_helper->replica_id (member_in)),
410429 ReplaceMemberStatus::COMPLETED);
411430 });
431+ g_helper->remove_flip (" disable_monitor_replace_member_replication_status" );
412432 LOGINFO (" LeaderReplace test done replica={}" , g_helper->replica_num ());
413433}
414434
@@ -479,6 +499,7 @@ TEST_F(ReplDevDynamicTest, OneMemberRestart) {
479499 });
480500 LOGINFO (" OneMemberRestart test done replica={}" , g_helper->replica_num ());
481501}
502+ #endif
482503
483504TEST_F (ReplDevDynamicTest, ValidateRequest) {
484505 LOGINFO (" ValidateRequest test started replica={}" , g_helper->replica_num ());
0 commit comments