11/**
22 * (C) Copyright 2020-2024 Intel Corporation.
33 * (C) Copyright 2025 Google LLC
4- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP
4+ * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
55 *
66 * SPDX-License-Identifier: BSD-2-Clause-Patent
77 */
@@ -1278,6 +1278,40 @@ agg_process_partial_stripe(struct ec_agg_entry *entry)
12781278 return rc ;
12791279}
12801280
1281+ static bool
1282+ agg_peer_failed (struct ec_agg_param * agg_param , struct daos_shard_loc * peer_loc )
1283+ {
1284+ struct pool_target * targets = NULL ;
1285+ uint32_t failed_tgts_cnt = 0 ;
1286+ int i ;
1287+ int rc ;
1288+
1289+ rc = pool_map_find_failed_tgts (agg_param -> ap_pool_info .api_pool -> sp_map , & targets ,
1290+ & failed_tgts_cnt );
1291+ if (rc ) {
1292+ DL_ERROR (rc , DF_CONT " pool_map_find_failed_tgts failed." ,
1293+ DP_CONT (agg_param -> ap_pool_info .api_pool_uuid ,
1294+ agg_param -> ap_pool_info .api_cont_uuid ));
1295+ return false;
1296+ }
1297+
1298+ if (targets == NULL || failed_tgts_cnt == 0 )
1299+ return false;
1300+
1301+ for (i = 0 ; i < failed_tgts_cnt ; i ++ ) {
1302+ if (targets [i ].ta_comp .co_rank == peer_loc -> sd_rank &&
1303+ targets [i ].ta_comp .co_index == peer_loc -> sd_tgt_idx ) {
1304+ D_DEBUG (DB_EPC , DF_CONT " peer parity tgt failed rank %d, tgt_idx %d.\n" ,
1305+ DP_CONT (agg_param -> ap_pool_info .api_pool_uuid ,
1306+ agg_param -> ap_pool_info .api_cont_uuid ),
1307+ peer_loc -> sd_rank , peer_loc -> sd_tgt_idx );
1308+ return true;
1309+ }
1310+ }
1311+
1312+ return false;
1313+ }
1314+
12811315int
12821316agg_peer_check_avail (struct ec_agg_param * agg_param , struct ec_agg_entry * entry )
12831317{
@@ -1334,6 +1368,12 @@ agg_peer_check_avail(struct ec_agg_param *agg_param, struct ec_agg_entry *entry)
13341368 return rc ;
13351369}
13361370
1371+ static bool
1372+ agg_peer_retryable_err (int err )
1373+ {
1374+ return err == - DER_STALE || err == - DER_TIMEDOUT || daos_crt_network_error (err );
1375+ }
1376+
13371377/* Sends the generated parity and the stripe number to the peer
13381378 * parity target. Handler writes the parity and deletes the replicas
13391379 * for the stripe.
@@ -1382,15 +1422,15 @@ agg_peer_update_ult(void *arg)
13821422 obj = obj_hdl2ptr (entry -> ae_obj_hdl );
13831423 for (peer = 0 ; peer < p ; peer ++ ) {
13841424 uint64_t enqueue_id = 0 ;
1385- bool overloaded ;
1425+ bool peer_retry ;
13861426
13871427 if (peer == pidx )
13881428 continue ;
13891429 D_ASSERT (entry -> ae_peer_pshards [peer ].sd_rank != DAOS_TGT_IGNORE );
13901430 tgt_ep .ep_rank = entry -> ae_peer_pshards [peer ].sd_rank ;
13911431 tgt_ep .ep_tag = entry -> ae_peer_pshards [peer ].sd_tgt_idx ;
13921432retry :
1393- overloaded = false;
1433+ peer_retry = false;
13941434 rc = ds_obj_req_create (dss_get_module_info ()-> dmi_ctx , & tgt_ep ,
13951435 DAOS_OBJ_RPC_EC_AGGREGATE , & rpc );
13961436 if (rc ) {
@@ -1470,13 +1510,20 @@ agg_peer_update_ult(void *arg)
14701510 rc = ec_agg_out -> ea_status ;
14711511 if (rc == - DER_OVERLOAD_RETRY ) {
14721512 enqueue_id = ec_agg_out -> ea_comm_out .req_out_enqueue_id ;
1473- overloaded = true;
1513+ peer_retry = true;
14741514 }
14751515 D_CDEBUG (rc == 0 , DB_TRACE , DLOG_ERR ,
14761516 "update parity[%d] to %d:%d, status = " DF_RC "\n" , peer ,
14771517 tgt_ep .ep_rank , tgt_ep .ep_tag , DP_RC (rc ));
14781518 peer_updated += rc == 0 ;
14791519 }
1520+ if (rc != 0 && peer_updated && agg_peer_retryable_err (rc ) &&
1521+ !agg_peer_failed (agg_param , & entry -> ae_peer_pshards [peer ])) {
1522+ DL_INFO (rc , DF_UOID " pidx %d to parity[%d] will retry." ,
1523+ DP_UOID (entry -> ae_oid ), pidx , peer );
1524+ peer_retry = true;
1525+ }
1526+
14801527next :
14811528 if (bulk_hdl )
14821529 crt_bulk_free (bulk_hdl );
@@ -1487,7 +1534,7 @@ agg_peer_update_ult(void *arg)
14871534 rpc = NULL ;
14881535 bulk_hdl = NULL ;
14891536 iod_csums = NULL ;
1490- if (overloaded ) {
1537+ if (peer_retry ) {
14911538 dss_sleep (daos_rpc_rand_delay (max_delay ) << 10 );
14921539 goto retry ;
14931540 }
@@ -1665,13 +1712,13 @@ agg_process_holes_ult(void *arg)
16651712 for (peer = 0 ; peer < p ; peer ++ ) {
16661713 uint64_t enqueue_id = 0 ;
16671714 uint32_t peer_shard ;
1668- bool overloaded ;
1715+ bool peer_retry ;
16691716
16701717 if (pidx == peer )
16711718 continue ;
16721719
16731720retry :
1674- overloaded = false;
1721+ peer_retry = false;
16751722 D_ASSERT (entry -> ae_peer_pshards [peer ].sd_rank != DAOS_TGT_IGNORE );
16761723 tgt_ep .ep_rank = entry -> ae_peer_pshards [peer ].sd_rank ;
16771724 tgt_ep .ep_tag = entry -> ae_peer_pshards [peer ].sd_tgt_idx ;
@@ -1719,7 +1766,7 @@ agg_process_holes_ult(void *arg)
17191766 rc = ec_rep_out -> er_status ;
17201767 if (rc == - DER_OVERLOAD_RETRY ) {
17211768 enqueue_id = ec_rep_out -> er_comm_out .req_out_enqueue_id ;
1722- overloaded = true;
1769+ peer_retry = true;
17231770 }
17241771 D_CDEBUG (rc == 0 , DB_TRACE , DLOG_ERR ,
17251772 DF_UOID " parity[%d] er_status = " DF_RC "\n" ,
@@ -1728,7 +1775,13 @@ agg_process_holes_ult(void *arg)
17281775 }
17291776 crt_req_decref (rpc );
17301777 rpc = NULL ;
1731- if (overloaded ) {
1778+ if (rc != 0 && peer_updated && agg_peer_retryable_err (rc ) &&
1779+ !agg_peer_failed (agg_param , & entry -> ae_peer_pshards [peer ])) {
1780+ DL_INFO (rc , DF_UOID " pidx %d to parity[%d] will retry." ,
1781+ DP_UOID (entry -> ae_oid ), pidx , peer );
1782+ peer_retry = true;
1783+ }
1784+ if (peer_retry ) {
17321785 dss_sleep (daos_rpc_rand_delay (max_delay ) << 10 );
17331786 goto retry ;
17341787 }
0 commit comments