Skip to content

Commit e717e1f

Browse files
authored
DAOS-18368 rebuild: fix bug of ec_agg_boundary and agg peer update (#17324)
1. Fix a bug where ec_agg_boundary was used before checking that it is valid. 2. Add more logs for rebuild fetches that get a zero iod_size, to provide hints about the layout information. 3. Fix a bug in the EC agg peer update: some failed updates need to be retried to avoid data corruption. 4. Refine some details of how dtx_resync waits for the rebuild scan. Signed-off-by: Xuezhao Liu <[email protected]>
1 parent 8a24f84 commit e717e1f

File tree

8 files changed

+298
-94
lines changed

8 files changed

+298
-94
lines changed

src/include/daos_srv/pool.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* (C) Copyright 2016-2024 Intel Corporation.
3-
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
3+
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
44
*
55
* SPDX-License-Identifier: BSD-2-Clause-Patent
66
*/
@@ -80,6 +80,7 @@ struct ds_pool {
8080
struct sched_request *sp_ec_ephs_req;
8181

8282
uint32_t sp_dtx_resync_version;
83+
uint32_t sp_gl_dtx_resync_version; /* global DTX resync version */
8384
/* Special pool/container handle uuid, which are
8485
* created on the pool leader step up, and propagated
8586
* to all servers by IV. Then they will be used by server

src/object/obj_internal.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/**
22
* (C) Copyright 2016-2024 Intel Corporation.
3-
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
3+
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
44
*
55
* SPDX-License-Identifier: BSD-2-Clause-Patent
66
*/
@@ -1181,6 +1181,8 @@ iov_alloc_for_csum_info(d_iov_t *iov, struct dcs_csum_info *csum_info);
11811181
/* obj_layout.c */
11821182
int
11831183
obj_pl_grp_idx(uint32_t layout_gl_ver, uint64_t hash, uint32_t grp_nr);
1184+
void
1185+
obj_dump_grp_layout(daos_handle_t oh, uint32_t shard);
11841186

11851187
int
11861188
obj_pl_place(struct pl_map *map, uint16_t layout_ver, struct daos_obj_md *md,

src/object/obj_layout.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*
22
* (C) Copyright 2016-2023 Intel Corporation.
3+
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
34
*
45
* SPDX-License-Identifier: BSD-2-Clause-Patent
56
*/
@@ -95,3 +96,36 @@ obj_layout_diff(struct pl_map *map, daos_unit_oid_t oid, uint32_t new_ver, uint3
9596

9697
return rc;
9798
}
99+
100+
/*
 * Debug helper: log the layout of the shard group that contains \a shard for
 * the object behind handle \a oh.
 *
 * The handle is resolved with obj_hdl2ptr(); if that yields NULL the function
 * logs and returns. If \a shard is not below cob_shards_nr an error is logged
 * and nothing is dumped. Otherwise the group index is shard / cob_grp_size and
 * one D_INFO line is emitted per shard of that group (cob_grp_size entries
 * starting at grp_idx * cob_grp_size).
 *
 * \param[in] oh	object handle to look up
 * \param[in] shard	shard index whose group should be dumped
 *
 * NOTE(review): shard, grp_idx, i and cob-size values are printed with %d
 * although the locals here are uint32_t, and none of the format strings in
 * this function end with a newline -- confirm both against the logging
 * conventions used elsewhere in the tree.
 */
void
101+
obj_dump_grp_layout(daos_handle_t oh, uint32_t shard)
102+
{
103+
struct dc_object *obj;
104+
struct dc_obj_shard *obj_shard;
105+
uint32_t grp_idx, i, nr;
106+
107+
obj = obj_hdl2ptr(oh);
108+
if (obj == NULL) {
109+
D_INFO("invalid oh");
110+
return;
111+
}
112+
if (shard >= obj->cob_shards_nr) {
113+
D_ERROR("bad shard %d, cob_shards_nr %d", shard, obj->cob_shards_nr);
114+
goto out;
115+
}
116+
117+
grp_idx = shard / obj->cob_grp_size;
118+
D_INFO(DF_OID " shard %d, grp_idx %d, grp_size %d", DP_OID(obj->cob_md.omd_id), shard,
119+
grp_idx, obj->cob_grp_size);
120+
/* i walks the absolute shard index, nr counts entries within the group. */
for (i = grp_idx * obj->cob_grp_size, nr = 0; nr < obj->cob_grp_size; i++, nr++) {
121+
obj_shard = &obj->cob_shards->do_shards[i];
122+
D_INFO("shard %d/%d/%d, tgt_id %d, rank %d, tgt_idx %d, "
123+
"rebuilding %d, reintegrating %d, fseq %d",
124+
i, obj_shard->do_shard_idx, obj_shard->do_shard, obj_shard->do_target_id,
125+
obj_shard->do_target_rank, obj_shard->do_target_idx,
126+
obj_shard->do_rebuilding, obj_shard->do_reintegrating, obj_shard->do_fseq);
127+
}
128+
129+
out:
130+
/* Presumably drops the reference obtained via obj_hdl2ptr() -- TODO confirm. */
obj_decref(obj);
131+
}

src/object/srv_ec_aggregate.c

Lines changed: 64 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/**
22
* (C) Copyright 2020-2024 Intel Corporation.
33
* (C) Copyright 2025 Google LLC
4-
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
4+
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
55
*
66
* SPDX-License-Identifier: BSD-2-Clause-Patent
77
*/
@@ -1278,6 +1278,42 @@ agg_process_partial_stripe(struct ec_agg_entry *entry)
12781278
return rc;
12791279
}
12801280

1281+
/*
 * Check whether the peer parity target described by \a peer_loc is listed
 * among the targets returned by pool_map_find_failed_tgts() for the pool map
 * of the aggregation's pool.
 *
 * \param[in] agg_param	EC aggregation parameters; supplies the pool map and
 *			the pool/container UUIDs used in log messages
 * \param[in] peer_loc	rank (sd_rank) and target index (sd_tgt_idx) of the peer
 *
 * \return	true if an entry with matching co_rank and co_index is found;
 *		false if no entry matches, if the returned list is NULL/empty,
 *		or if pool_map_find_failed_tgts() itself returns an error
 *		(the error is logged and the peer is then reported as not
 *		failed).
 *
 * The array returned through \a targets is released with D_FREE() on the
 * paths that scan it.
 */
static bool
1282+
agg_peer_failed(struct ec_agg_param *agg_param, struct daos_shard_loc *peer_loc)
1283+
{
1284+
struct pool_target *targets = NULL;
1285+
uint32_t failed_tgts_cnt = 0;
1286+
/* NOTE(review): i is int but is compared against the uint32_t count below. */
int i;
1287+
int rc;
1288+
1289+
rc = pool_map_find_failed_tgts(agg_param->ap_pool_info.api_pool->sp_map, &targets,
1290+
&failed_tgts_cnt);
1291+
if (rc) {
1292+
DL_ERROR(rc, DF_CONT " pool_map_find_failed_tgts failed.",
1293+
DP_CONT(agg_param->ap_pool_info.api_pool_uuid,
1294+
agg_param->ap_pool_info.api_cont_uuid));
1295+
return false;
1296+
}
1297+
1298+
/*
 * NOTE(review): this returns without D_FREE(targets). That is only safe if
 * pool_map_find_failed_tgts() never hands back a non-NULL array together
 * with a zero count -- confirm against its contract.
 */
if (targets == NULL || failed_tgts_cnt == 0)
1299+
return false;
1300+
1301+
for (i = 0; i < failed_tgts_cnt; i++) {
1302+
if (targets[i].ta_comp.co_rank == peer_loc->sd_rank &&
1303+
targets[i].ta_comp.co_index == peer_loc->sd_tgt_idx) {
1304+
D_DEBUG(DB_EPC, DF_CONT " peer parity tgt failed rank %d, tgt_idx %d.\n",
1305+
DP_CONT(agg_param->ap_pool_info.api_pool_uuid,
1306+
agg_param->ap_pool_info.api_cont_uuid),
1307+
peer_loc->sd_rank, peer_loc->sd_tgt_idx);
1308+
D_FREE(targets);
1309+
return true;
1310+
}
1311+
}
1312+
1313+
D_FREE(targets);
1314+
return false;
1315+
}
1316+
12811317
int
12821318
agg_peer_check_avail(struct ec_agg_param *agg_param, struct ec_agg_entry *entry)
12831319
{
@@ -1334,6 +1370,12 @@ agg_peer_check_avail(struct ec_agg_param *agg_param, struct ec_agg_entry *entry)
13341370
return rc;
13351371
}
13361372

1373+
/*
 * Classify an error from a peer parity update as retryable.
 *
 * \param[in] err	error code to classify
 *
 * \return	true when \a err is -DER_STALE or -DER_TIMEDOUT, or when
 *		daos_crt_network_error() reports it as a network error;
 *		false otherwise.
 */
static bool
1374+
agg_peer_retryable_err(int err)
1375+
{
1376+
return err == -DER_STALE || err == -DER_TIMEDOUT || daos_crt_network_error(err);
1377+
}
1378+
13371379
/* Sends the generated parity and the stripe number to the peer
13381380
* parity target. Handler writes the parity and deletes the replicas
13391381
* for the stripe.
@@ -1382,15 +1424,15 @@ agg_peer_update_ult(void *arg)
13821424
obj = obj_hdl2ptr(entry->ae_obj_hdl);
13831425
for (peer = 0; peer < p; peer++) {
13841426
uint64_t enqueue_id = 0;
1385-
bool overloaded;
1427+
bool peer_retry;
13861428

13871429
if (peer == pidx)
13881430
continue;
13891431
D_ASSERT(entry->ae_peer_pshards[peer].sd_rank != DAOS_TGT_IGNORE);
13901432
tgt_ep.ep_rank = entry->ae_peer_pshards[peer].sd_rank;
13911433
tgt_ep.ep_tag = entry->ae_peer_pshards[peer].sd_tgt_idx;
13921434
retry:
1393-
overloaded = false;
1435+
peer_retry = false;
13941436
rc = ds_obj_req_create(dss_get_module_info()->dmi_ctx, &tgt_ep,
13951437
DAOS_OBJ_RPC_EC_AGGREGATE, &rpc);
13961438
if (rc) {
@@ -1470,13 +1512,20 @@ agg_peer_update_ult(void *arg)
14701512
rc = ec_agg_out->ea_status;
14711513
if (rc == -DER_OVERLOAD_RETRY) {
14721514
enqueue_id = ec_agg_out->ea_comm_out.req_out_enqueue_id;
1473-
overloaded = true;
1515+
peer_retry = true;
14741516
}
14751517
D_CDEBUG(rc == 0, DB_TRACE, DLOG_ERR,
14761518
"update parity[%d] to %d:%d, status = " DF_RC "\n", peer,
14771519
tgt_ep.ep_rank, tgt_ep.ep_tag, DP_RC(rc));
14781520
peer_updated += rc == 0;
14791521
}
1522+
if (rc != 0 && peer_updated && agg_peer_retryable_err(rc) &&
1523+
!agg_peer_failed(agg_param, &entry->ae_peer_pshards[peer])) {
1524+
DL_INFO(rc, DF_UOID " pidx %d to parity[%d] will retry.",
1525+
DP_UOID(entry->ae_oid), pidx, peer);
1526+
peer_retry = true;
1527+
}
1528+
14801529
next:
14811530
if (bulk_hdl)
14821531
crt_bulk_free(bulk_hdl);
@@ -1487,7 +1536,7 @@ agg_peer_update_ult(void *arg)
14871536
rpc = NULL;
14881537
bulk_hdl = NULL;
14891538
iod_csums = NULL;
1490-
if (overloaded) {
1539+
if (peer_retry) {
14911540
dss_sleep(daos_rpc_rand_delay(max_delay) << 10);
14921541
goto retry;
14931542
}
@@ -1665,13 +1714,13 @@ agg_process_holes_ult(void *arg)
16651714
for (peer = 0; peer < p; peer++) {
16661715
uint64_t enqueue_id = 0;
16671716
uint32_t peer_shard;
1668-
bool overloaded;
1717+
bool peer_retry;
16691718

16701719
if (pidx == peer)
16711720
continue;
16721721

16731722
retry:
1674-
overloaded = false;
1723+
peer_retry = false;
16751724
D_ASSERT(entry->ae_peer_pshards[peer].sd_rank != DAOS_TGT_IGNORE);
16761725
tgt_ep.ep_rank = entry->ae_peer_pshards[peer].sd_rank;
16771726
tgt_ep.ep_tag = entry->ae_peer_pshards[peer].sd_tgt_idx;
@@ -1719,7 +1768,7 @@ agg_process_holes_ult(void *arg)
17191768
rc = ec_rep_out->er_status;
17201769
if (rc == -DER_OVERLOAD_RETRY) {
17211770
enqueue_id = ec_rep_out->er_comm_out.req_out_enqueue_id;
1722-
overloaded = true;
1771+
peer_retry = true;
17231772
}
17241773
D_CDEBUG(rc == 0, DB_TRACE, DLOG_ERR,
17251774
DF_UOID " parity[%d] er_status = " DF_RC "\n",
@@ -1728,7 +1777,13 @@ agg_process_holes_ult(void *arg)
17281777
}
17291778
crt_req_decref(rpc);
17301779
rpc = NULL;
1731-
if (overloaded) {
1780+
if (rc != 0 && peer_updated && agg_peer_retryable_err(rc) &&
1781+
!agg_peer_failed(agg_param, &entry->ae_peer_pshards[peer])) {
1782+
DL_INFO(rc, DF_UOID " pidx %d to parity[%d] will retry.",
1783+
DP_UOID(entry->ae_oid), pidx, peer);
1784+
peer_retry = true;
1785+
}
1786+
if (peer_retry) {
17321787
dss_sleep(daos_rpc_rand_delay(max_delay) << 10);
17331788
goto retry;
17341789
}

src/object/srv_obj.c

Lines changed: 61 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/**
22
* (C) Copyright 2016-2024 Intel Corporation.
33
* (C) Copyright 2025 Google LLC
4-
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
4+
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
55
*
66
* SPDX-License-Identifier: BSD-2-Clause-Patent
77
*/
@@ -701,6 +701,22 @@ obj_set_reply_sizes(crt_rpc_t *rpc, daos_iod_t *iods, int iod_nr, uint8_t *skips
701701
sizes[i] = iods[idx].iod_size;
702702
D_DEBUG(DB_IO, DF_UOID" %d:"DF_U64"\n", DP_UOID(orw->orw_oid),
703703
i, iods[idx].iod_size);
704+
if ((orw->orw_flags & ORF_FOR_MIGRATION) && sizes[i] == 0) {
705+
D_DEBUG(DB_REBUILD,
706+
DF_CONT " obj " DF_UOID "rebuild fetch zero iod_size, "
707+
"i:%d/idx:%d, iod_nr %d, orw_epoch " DF_X64
708+
", orw_epoch_first " DF_X64 " may cause DER_DATA_LOSS",
709+
DP_CONT(orw->orw_pool_uuid, orw->orw_co_uuid),
710+
DP_UOID(orw->orw_oid), i, idx, iods[idx].iod_nr, orw->orw_epoch,
711+
orw->orw_epoch_first);
712+
if (iods[idx].iod_type == DAOS_IOD_ARRAY) {
713+
int j;
714+
715+
for (j = 0; j < min(8, iods[idx].iod_nr); j++)
716+
D_DEBUG(DB_REBUILD, "recx[%d] - " DF_RECX, j,
717+
DP_RECX(iods[idx].iod_recxs[j]));
718+
}
719+
}
704720
idx++;
705721
}
706722

@@ -1368,7 +1384,7 @@ struct ec_agg_boundary_arg {
13681384
};
13691385

13701386
static int
1371-
obj_fetch_ec_agg_boundary(void *data)
1387+
obj_fetch_ec_agg_boundary_ult(void *data)
13721388
{
13731389
struct ec_agg_boundary_arg *arg = data;
13741390
int rc;
@@ -1381,6 +1397,33 @@ obj_fetch_ec_agg_boundary(void *data)
13811397
return rc;
13821398
}
13831399

1400+
/*
 * Fetch the container's EC aggregation epoch boundary by running
 * obj_fetch_ec_agg_boundary_ult() through dss_ult_execute() with DSS_XS_SYS,
 * then verify that the container now reports a valid boundary.
 *
 * \param[in] ioc	I/O context; its ioc_coc supplies the pool, the container
 *			UUIDs and the sc_ec_agg_eph_valid/boundary fields
 * \param[in] uoid	object ID, used only in log messages
 *
 * \return	0 when sc_ec_agg_eph_valid is non-zero after the fetch;
 *		the dss_ult_execute() error code if the fetch fails;
 *		-DER_FETCH_AGAIN if the fetch succeeded but
 *		sc_ec_agg_eph_valid is still 0.
 *
 * NOTE(review): both failure paths already log here, and the two callers
 * visible in this change log the returned error again with DL_ERROR.
 */
static int
1401+
obj_fetch_ec_agg_boundary(struct obj_io_context *ioc, daos_unit_oid_t *uoid)
1402+
{
1403+
struct ec_agg_boundary_arg arg;
1404+
int rc;
1405+
1406+
arg.eab_pool = ioc->ioc_coc->sc_pool->spc_pool;
1407+
uuid_copy(arg.eab_co_uuid, ioc->ioc_coc->sc_uuid);
1408+
rc = dss_ult_execute(obj_fetch_ec_agg_boundary_ult, &arg, NULL, NULL, DSS_XS_SYS, 0, 0);
1409+
if (rc) {
1410+
DL_ERROR(rc, DF_CONT ", " DF_UOID " fetch ec_agg_boundary failed.",
1411+
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid),
1412+
DP_UOID(*uoid));
1413+
return rc;
1414+
}
1415+
/* The fetch itself succeeded but the boundary is still flagged invalid. */
if (ioc->ioc_coc->sc_ec_agg_eph_valid == 0) {
1416+
rc = -DER_FETCH_AGAIN;
1417+
DL_INFO(rc, DF_CONT ", " DF_UOID " zero ec_agg_boundary.",
1418+
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(*uoid));
1419+
return rc;
1420+
}
1421+
D_DEBUG(DB_IO, DF_CONT ", " DF_UOID " fetched ec_agg_eph_boundary " DF_X64 "\n",
1422+
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(*uoid),
1423+
ioc->ioc_coc->sc_ec_agg_eph_boundary);
1424+
return 0;
1425+
}
1426+
13841427
static int
13851428
obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *iods,
13861429
struct dcs_iod_csums *iod_csums, uint64_t *offs, uint8_t *skips,
@@ -1503,29 +1546,14 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io
15031546
}
15041547
if ((ec_deg_fetch || (ec_recov && get_parity_list)) &&
15051548
ioc->ioc_coc->sc_ec_agg_eph_valid == 0) {
1506-
struct ec_agg_boundary_arg arg;
1507-
1508-
arg.eab_pool = ioc->ioc_coc->sc_pool->spc_pool;
1509-
uuid_copy(arg.eab_co_uuid, ioc->ioc_coc->sc_uuid);
1510-
rc = dss_ult_execute(obj_fetch_ec_agg_boundary, &arg, NULL, NULL,
1511-
DSS_XS_SYS, 0, 0);
1549+
rc = obj_fetch_ec_agg_boundary(ioc, &orw->orw_oid);
15121550
if (rc) {
15131551
DL_ERROR(rc, DF_CONT ", " DF_UOID " fetch ec_agg_boundary failed.",
15141552
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid),
15151553
DP_UOID(orw->orw_oid));
15161554
goto out;
15171555
}
1518-
if (ioc->ioc_coc->sc_ec_agg_eph_valid == 0) {
1519-
rc = -DER_FETCH_AGAIN;
1520-
DL_INFO(rc, DF_CONT ", " DF_UOID " zero ec_agg_boundary.",
1521-
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid),
1522-
DP_UOID(orw->orw_oid));
1523-
goto out;
1524-
}
1525-
D_DEBUG(DB_IO,
1526-
DF_CONT ", " DF_UOID " fetched ec_agg_eph_boundary " DF_X64 "\n",
1527-
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid),
1528-
DP_UOID(orw->orw_oid), ioc->ioc_coc->sc_ec_agg_eph_boundary);
1556+
D_ASSERT(ioc->ioc_coc->sc_ec_agg_eph_valid);
15291557
}
15301558
if (get_parity_list) {
15311559
D_ASSERT(!ec_deg_fetch);
@@ -3030,6 +3058,20 @@ ds_obj_rw_handler(crt_rpc_t *rpc)
30303058
if (orw->orw_flags & ORF_FETCH_EPOCH_EC_AGG_BOUNDARY) {
30313059
uint64_t rebuild_epoch;
30323060

3061+
if (ioc.ioc_coc->sc_ec_agg_eph_valid == 0) {
3062+
rc = obj_fetch_ec_agg_boundary(&ioc, &orw->orw_oid);
3063+
if (rc) {
3064+
DL_ERROR(rc,
3065+
DF_CONT ", " DF_UOID " fetch ec_agg_boundary "
3066+
"failed.",
3067+
DP_CONT(ioc.ioc_coc->sc_pool_uuid,
3068+
ioc.ioc_coc->sc_uuid),
3069+
DP_UOID(orw->orw_oid));
3070+
goto out;
3071+
}
3072+
D_ASSERT(ioc.ioc_coc->sc_ec_agg_eph_valid);
3073+
}
3074+
30333075
D_ASSERTF(orw->orw_epoch <= orw->orw_epoch_first,
30343076
"bad orw_epoch " DF_X64 ", orw_epoch_first " DF_X64 "\n",
30353077
orw->orw_epoch, orw->orw_epoch_first);

0 commit comments

Comments
 (0)