@@ -3258,10 +3258,17 @@ chk_leader_query(int pool_nr, uuid_t pools[], chk_query_head_cb_t head_cb,
32583258
32593259 while (ver == ins -> ci_ns_ver && ins -> ci_skip_oog == 0 && ins -> ci_pause == 0 ) {
32603260 dss_sleep (500 );
3261- if (++ wait_cnt % 40 == 0 )
3261+ if (++ wait_cnt % 40 == 0 ) {
32623262 D_WARN ("Leader (" DF_X64 ") query is blocked because of %d for "
32633263 "about %d seconds.\n" ,
32643264 gen , rc , wait_cnt / 2 );
3265+ /*
3266+ * Let's retry the query in case the related dead rank has recovered
3267+ * before being handled by chk_dead_rank_ult, although it is rare.
3268+ */
3269+ break ;
3270+ }
3271+
32653272 if (rc != - DER_OOG )
32663273 break ;
32673274 }
@@ -3771,7 +3778,7 @@ chk_rank_event_cb(d_rank_t rank, uint64_t incarnation, enum crt_event_source src
37713778
37723779 /* Ignore the event that is not applicable to current rank. */
37733780
3774- if (src != CRT_EVS_SWIM )
3781+ if (src != CRT_EVS_SWIM && src != CRT_EVS_GRPMOD )
37753782 D_GOTO (out , rc = - DER_NOTAPPLICABLE );
37763783
37773784 if (type != CRT_EVT_DEAD && type != CRT_EVT_ALIVE )
@@ -3783,14 +3790,37 @@ chk_rank_event_cb(d_rank_t rank, uint64_t incarnation, enum crt_event_source src
37833790 D_GOTO (out , rc = - DER_NOMEM );
37843791
37853792 cdr -> cdr_rank = rank ;
3793+ } else if (d_list_empty (& ins -> ci_dead_ranks )) {
3794+ D_GOTO (out , rc = - DER_NOTAPPLICABLE );
37863795 }
37873796
37883797 ABT_mutex_lock (ins -> ci_abt_mutex );
37893798 if (cdr != NULL ) {
3799+ struct chk_dead_rank * tmp ;
3800+
37903801 /*
3791- * The event may be triggered on non-system SX. Let's notify the leader scheduler
3802+ * The event may be triggered on non-system XS (SWIM). Let's ask chk_dead_rank_ult
37923803 * to handle that on system XS.
3804+ *
3805+ * The callback for one rank dead event may be triggered twice from multiple sources:
3806+ * SWIM and PG membership changes. Let's only add it once into ins->ci_dead_ranks.
3807+ *
3808+ * Generally, ins->ci_dead_ranks is very short. Then it is very fast to go through
3809+ * the whole list.
37933810 */
3811+ d_list_for_each_entry (tmp , & ins -> ci_dead_ranks , cdr_link ) {
3812+ if (tmp -> cdr_rank == rank ) {
3813+ /* Repeated one, ignore it. */
3814+ D_FREE (cdr );
3815+ D_GOTO (unlock , rc = - DER_NOTAPPLICABLE );
3816+ }
3817+
3818+ if (tmp -> cdr_rank > rank ) {
3819+ d_list_add (& cdr -> cdr_link , & tmp -> cdr_link );
3820+ D_GOTO (unlock , rc = 0 );
3821+ }
3822+ }
3823+
37943824 d_list_add_tail (& cdr -> cdr_link , & ins -> ci_dead_ranks );
37953825 } else {
37963826 /* Remove former non-handled dead rank from the list. */
@@ -3800,8 +3830,13 @@ chk_rank_event_cb(d_rank_t rank, uint64_t incarnation, enum crt_event_source src
38003830 D_FREE (cdr );
38013831 break ;
38023832 }
3833+
3834+ if (cdr -> cdr_rank > rank )
3835+ D_GOTO (unlock , rc = - DER_NOTAPPLICABLE );
38033836 }
38043837 }
3838+
3839+ unlock :
38053840 ABT_mutex_unlock (ins -> ci_abt_mutex );
38063841
38073842out :
0 commit comments