Skip to content

Commit 56751dc

Browse files
committed
DAOS-18368 object: refine EC agg peer update
Some failures need to be retried. Signed-off-by: Xuezhao Liu <xuezhao.liu@hpe.com>
1 parent 601df3c commit 56751dc

File tree

1 file changed

+62
-9
lines changed

1 file changed

+62
-9
lines changed

src/object/srv_ec_aggregate.c

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/**
22
* (C) Copyright 2020-2024 Intel Corporation.
33
* (C) Copyright 2025 Google LLC
4-
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
4+
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
55
*
66
* SPDX-License-Identifier: BSD-2-Clause-Patent
77
*/
@@ -1278,6 +1278,40 @@ agg_process_partial_stripe(struct ec_agg_entry *entry)
12781278
return rc;
12791279
}
12801280

1281+
static bool
1282+
agg_peer_failed(struct ec_agg_param *agg_param, struct daos_shard_loc *peer_loc)
1283+
{
1284+
struct pool_target *targets = NULL;
1285+
uint32_t failed_tgts_cnt = 0;
1286+
int i;
1287+
int rc;
1288+
1289+
rc = pool_map_find_failed_tgts(agg_param->ap_pool_info.api_pool->sp_map, &targets,
1290+
&failed_tgts_cnt);
1291+
if (rc) {
1292+
DL_ERROR(rc, DF_CONT " pool_map_find_failed_tgts failed.",
1293+
DP_CONT(agg_param->ap_pool_info.api_pool_uuid,
1294+
agg_param->ap_pool_info.api_cont_uuid));
1295+
return false;
1296+
}
1297+
1298+
if (targets == NULL || failed_tgts_cnt == 0)
1299+
return false;
1300+
1301+
for (i = 0; i < failed_tgts_cnt; i++) {
1302+
if (targets[i].ta_comp.co_rank == peer_loc->sd_rank &&
1303+
targets[i].ta_comp.co_index == peer_loc->sd_tgt_idx) {
1304+
D_DEBUG(DB_EPC, DF_CONT " peer parity tgt failed rank %d, tgt_idx %d.\n",
1305+
DP_CONT(agg_param->ap_pool_info.api_pool_uuid,
1306+
agg_param->ap_pool_info.api_cont_uuid),
1307+
peer_loc->sd_rank, peer_loc->sd_tgt_idx);
1308+
return true;
1309+
}
1310+
}
1311+
1312+
return false;
1313+
}
1314+
12811315
int
12821316
agg_peer_check_avail(struct ec_agg_param *agg_param, struct ec_agg_entry *entry)
12831317
{
@@ -1334,6 +1368,12 @@ agg_peer_check_avail(struct ec_agg_param *agg_param, struct ec_agg_entry *entry)
13341368
return rc;
13351369
}
13361370

1371+
static bool
1372+
agg_peer_retryable_err(int err)
1373+
{
1374+
return err == -DER_STALE || err == -DER_TIMEDOUT || daos_crt_network_error(err);
1375+
}
1376+
13371377
/* Sends the generated parity and the stripe number to the peer
13381378
* parity target. Handler writes the parity and deletes the replicas
13391379
* for the stripe.
@@ -1382,15 +1422,15 @@ agg_peer_update_ult(void *arg)
13821422
obj = obj_hdl2ptr(entry->ae_obj_hdl);
13831423
for (peer = 0; peer < p; peer++) {
13841424
uint64_t enqueue_id = 0;
1385-
bool overloaded;
1425+
bool peer_retry;
13861426

13871427
if (peer == pidx)
13881428
continue;
13891429
D_ASSERT(entry->ae_peer_pshards[peer].sd_rank != DAOS_TGT_IGNORE);
13901430
tgt_ep.ep_rank = entry->ae_peer_pshards[peer].sd_rank;
13911431
tgt_ep.ep_tag = entry->ae_peer_pshards[peer].sd_tgt_idx;
13921432
retry:
1393-
overloaded = false;
1433+
peer_retry = false;
13941434
rc = ds_obj_req_create(dss_get_module_info()->dmi_ctx, &tgt_ep,
13951435
DAOS_OBJ_RPC_EC_AGGREGATE, &rpc);
13961436
if (rc) {
@@ -1470,13 +1510,20 @@ agg_peer_update_ult(void *arg)
14701510
rc = ec_agg_out->ea_status;
14711511
if (rc == -DER_OVERLOAD_RETRY) {
14721512
enqueue_id = ec_agg_out->ea_comm_out.req_out_enqueue_id;
1473-
overloaded = true;
1513+
peer_retry = true;
14741514
}
14751515
D_CDEBUG(rc == 0, DB_TRACE, DLOG_ERR,
14761516
"update parity[%d] to %d:%d, status = " DF_RC "\n", peer,
14771517
tgt_ep.ep_rank, tgt_ep.ep_tag, DP_RC(rc));
14781518
peer_updated += rc == 0;
14791519
}
1520+
if (rc != 0 && peer_updated && agg_peer_retryable_err(rc) &&
1521+
!agg_peer_failed(agg_param, &entry->ae_peer_pshards[peer])) {
1522+
DL_INFO(rc, DF_UOID " pidx %d to parity[%d] will retry.",
1523+
DP_UOID(entry->ae_oid), pidx, peer);
1524+
peer_retry = true;
1525+
}
1526+
14801527
next:
14811528
if (bulk_hdl)
14821529
crt_bulk_free(bulk_hdl);
@@ -1487,7 +1534,7 @@ agg_peer_update_ult(void *arg)
14871534
rpc = NULL;
14881535
bulk_hdl = NULL;
14891536
iod_csums = NULL;
1490-
if (overloaded) {
1537+
if (peer_retry) {
14911538
dss_sleep(daos_rpc_rand_delay(max_delay) << 10);
14921539
goto retry;
14931540
}
@@ -1665,13 +1712,13 @@ agg_process_holes_ult(void *arg)
16651712
for (peer = 0; peer < p; peer++) {
16661713
uint64_t enqueue_id = 0;
16671714
uint32_t peer_shard;
1668-
bool overloaded;
1715+
bool peer_retry;
16691716

16701717
if (pidx == peer)
16711718
continue;
16721719

16731720
retry:
1674-
overloaded = false;
1721+
peer_retry = false;
16751722
D_ASSERT(entry->ae_peer_pshards[peer].sd_rank != DAOS_TGT_IGNORE);
16761723
tgt_ep.ep_rank = entry->ae_peer_pshards[peer].sd_rank;
16771724
tgt_ep.ep_tag = entry->ae_peer_pshards[peer].sd_tgt_idx;
@@ -1719,7 +1766,7 @@ agg_process_holes_ult(void *arg)
17191766
rc = ec_rep_out->er_status;
17201767
if (rc == -DER_OVERLOAD_RETRY) {
17211768
enqueue_id = ec_rep_out->er_comm_out.req_out_enqueue_id;
1722-
overloaded = true;
1769+
peer_retry = true;
17231770
}
17241771
D_CDEBUG(rc == 0, DB_TRACE, DLOG_ERR,
17251772
DF_UOID " parity[%d] er_status = " DF_RC "\n",
@@ -1728,7 +1775,13 @@ agg_process_holes_ult(void *arg)
17281775
}
17291776
crt_req_decref(rpc);
17301777
rpc = NULL;
1731-
if (overloaded) {
1778+
if (rc != 0 && peer_updated && agg_peer_retryable_err(rc) &&
1779+
!agg_peer_failed(agg_param, &entry->ae_peer_pshards[peer])) {
1780+
DL_INFO(rc, DF_UOID " pidx %d to parity[%d] will retry.",
1781+
DP_UOID(entry->ae_oid), pidx, peer);
1782+
peer_retry = true;
1783+
}
1784+
if (peer_retry) {
17321785
dss_sleep(daos_rpc_rand_delay(max_delay) << 10);
17331786
goto retry;
17341787
}

0 commit comments

Comments
 (0)