From a2876786d65cc08ee5c55c87000d0a9c40910d66 Mon Sep 17 00:00:00 2001 From: Xuezhao Liu Date: Mon, 29 Dec 2025 04:00:15 +0000 Subject: [PATCH 1/5] DAOS-18368 rebuild: fix use before check of ec_agg_boundary 1. Fix a bug where ec_agg_boundary was used before checking that it is valid. 2. Add more logs for a rebuild fetch that gets a zero iod_size, to provide hints about the layout information. Signed-off-by: Xuezhao Liu --- src/object/obj_internal.h | 2 + src/object/obj_layout.c | 34 ++++++++++++++++ src/object/srv_obj.c | 76 +++++++++++++++++++++++++++--------- src/object/srv_obj_migrate.c | 56 +++++++++++++++++++++----- 4 files changed, 140 insertions(+), 28 deletions(-) diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index 598c37644ee..3f630795f56 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -1181,6 +1181,8 @@ iov_alloc_for_csum_info(d_iov_t *iov, struct dcs_csum_info *csum_info); /* obj_layout.c */ int obj_pl_grp_idx(uint32_t layout_gl_ver, uint64_t hash, uint32_t grp_nr); +void +obj_dump_grp_layout(daos_handle_t oh, uint32_t shard); int obj_pl_place(struct pl_map *map, uint16_t layout_ver, struct daos_obj_md *md, diff --git a/src/object/obj_layout.c b/src/object/obj_layout.c index 189261ad31e..337adab92c4 100644 --- a/src/object/obj_layout.c +++ b/src/object/obj_layout.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2016-2023 Intel Corporation. 
+ * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -95,3 +96,36 @@ obj_layout_diff(struct pl_map *map, daos_unit_oid_t oid, uint32_t new_ver, uint3 return rc; } + +void +obj_dump_grp_layout(daos_handle_t oh, uint32_t shard) +{ + struct dc_object *obj; + struct dc_obj_shard *obj_shard; + uint32_t grp_idx, i, nr; + + obj = obj_hdl2ptr(oh); + if (obj == NULL) { + D_INFO("invalid oh"); + return; + } + if (shard >= obj->cob_shards_nr) { + D_ERROR("bad shard %d, cob_shards_nr %d", shard, obj->cob_shards_nr); + goto out; + } + + grp_idx = shard / obj->cob_grp_size; + D_INFO(DF_OID " shard %d, grp_idx %d, grp_size %d", DP_OID(obj->cob_md.omd_id), shard, + grp_idx, obj->cob_grp_size); + for (i = grp_idx * obj->cob_grp_size, nr = 0; nr < obj->cob_grp_size; i++, nr++) { + obj_shard = &obj->cob_shards->do_shards[i]; + D_INFO("shard %d/%d/%d, tgt_id %d, rank %d, tgt_idx %d, " + "rebuilding %d, reintegrating %d, fseq %d", + i, obj_shard->do_shard_idx, obj_shard->do_shard, obj_shard->do_target_id, + obj_shard->do_target_rank, obj_shard->do_target_idx, + obj_shard->do_rebuilding, obj_shard->do_reintegrating, obj_shard->do_fseq); + } + +out: + obj_decref(obj); +} diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 6ea4bb63ab6..78989187fe9 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -701,6 +701,20 @@ obj_set_reply_sizes(crt_rpc_t *rpc, daos_iod_t *iods, int iod_nr, uint8_t *skips sizes[i] = iods[idx].iod_size; D_DEBUG(DB_IO, DF_UOID" %d:"DF_U64"\n", DP_UOID(orw->orw_oid), i, iods[idx].iod_size); + if ((orw->orw_flags & ORF_FOR_MIGRATION) && sizes[i] == 0) { + D_INFO(DF_CONT " obj " DF_UOID "rebuild fetch zero iod_size, i:%d/idx:%d, " + "iod_nr %d, orw_epoch " DF_X64 ", orw_epoch_first " DF_X64 + " may cause DER_DATA_LOSS", + DP_CONT(orw->orw_pool_uuid, orw->orw_co_uuid), DP_UOID(orw->orw_oid), + i, idx, iods[idx].iod_nr, orw->orw_epoch, orw->orw_epoch_first); + if (iods[idx].iod_type 
== DAOS_IOD_ARRAY) { + int j; + + for (j = 0; j < min(8, iods[idx].iod_nr); j++) + D_INFO("recx[%d] - " DF_RECX, j, + DP_RECX(iods[idx].iod_recxs[j])); + } + } idx++; } @@ -1368,7 +1382,7 @@ struct ec_agg_boundary_arg { }; static int -obj_fetch_ec_agg_boundary(void *data) +obj_fetch_ec_agg_boundary_ult(void *data) { struct ec_agg_boundary_arg *arg = data; int rc; @@ -1381,6 +1395,33 @@ obj_fetch_ec_agg_boundary(void *data) return rc; } +static int +obj_fetch_ec_agg_boundary(struct obj_io_context *ioc, daos_unit_oid_t *uoid) +{ + struct ec_agg_boundary_arg arg; + int rc; + + arg.eab_pool = ioc->ioc_coc->sc_pool->spc_pool; + uuid_copy(arg.eab_co_uuid, ioc->ioc_coc->sc_uuid); + rc = dss_ult_execute(obj_fetch_ec_agg_boundary_ult, &arg, NULL, NULL, DSS_XS_SYS, 0, 0); + if (rc) { + DL_ERROR(rc, DF_CONT ", " DF_UOID " fetch ec_agg_boundary failed.", + DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), + DP_UOID(*uoid)); + return rc; + } + if (ioc->ioc_coc->sc_ec_agg_eph_valid == 0) { + rc = -DER_FETCH_AGAIN; + DL_INFO(rc, DF_CONT ", " DF_UOID " zero ec_agg_boundary.", + DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(*uoid)); + return rc; + } + D_DEBUG(DB_IO, DF_CONT ", " DF_UOID " fetched ec_agg_eph_boundary " DF_X64 "\n", + DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(*uoid), + ioc->ioc_coc->sc_ec_agg_eph_boundary); + return 0; +} + static int obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *iods, struct dcs_iod_csums *iod_csums, uint64_t *offs, uint8_t *skips, @@ -1503,29 +1544,14 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io } if ((ec_deg_fetch || (ec_recov && get_parity_list)) && ioc->ioc_coc->sc_ec_agg_eph_valid == 0) { - struct ec_agg_boundary_arg arg; - - arg.eab_pool = ioc->ioc_coc->sc_pool->spc_pool; - uuid_copy(arg.eab_co_uuid, ioc->ioc_coc->sc_uuid); - rc = dss_ult_execute(obj_fetch_ec_agg_boundary, &arg, NULL, NULL, - DSS_XS_SYS, 0, 0); + rc 
= obj_fetch_ec_agg_boundary(ioc, &orw->orw_oid); if (rc) { DL_ERROR(rc, DF_CONT ", " DF_UOID " fetch ec_agg_boundary failed.", DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(orw->orw_oid)); goto out; } - if (ioc->ioc_coc->sc_ec_agg_eph_valid == 0) { - rc = -DER_FETCH_AGAIN; - DL_INFO(rc, DF_CONT ", " DF_UOID " zero ec_agg_boundary.", - DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), - DP_UOID(orw->orw_oid)); - goto out; - } - D_DEBUG(DB_IO, - DF_CONT ", " DF_UOID " fetched ec_agg_eph_boundary " DF_X64 "\n", - DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), - DP_UOID(orw->orw_oid), ioc->ioc_coc->sc_ec_agg_eph_boundary); + D_ASSERT(ioc->ioc_coc->sc_ec_agg_eph_valid); } if (get_parity_list) { D_ASSERT(!ec_deg_fetch); @@ -3030,6 +3056,20 @@ ds_obj_rw_handler(crt_rpc_t *rpc) if (orw->orw_flags & ORF_FETCH_EPOCH_EC_AGG_BOUNDARY) { uint64_t rebuild_epoch; + if (ioc.ioc_coc->sc_ec_agg_eph_valid == 0) { + rc = obj_fetch_ec_agg_boundary(&ioc, &orw->orw_oid); + if (rc) { + DL_ERROR(rc, + DF_CONT ", " DF_UOID " fetch ec_agg_boundary " + "failed.", + DP_CONT(ioc.ioc_coc->sc_pool_uuid, + ioc.ioc_coc->sc_uuid), + DP_UOID(orw->orw_oid)); + goto out; + } + D_ASSERT(ioc.ioc_coc->sc_ec_agg_eph_valid); + } + D_ASSERTF(orw->orw_epoch <= orw->orw_epoch_first, "bad orw_epoch " DF_X64 ", orw_epoch_first " DF_X64 "\n", orw->orw_epoch, orw->orw_epoch_first); diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index d2a95aa3c79..3809780f93e 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -1156,6 +1156,28 @@ migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh, return rc; } +static void +mrone_dump_info(struct migrate_one *mrone, daos_handle_t oh, daos_iod_t *iod) +{ + int i; + + if (daos_is_dkey_uint64(mrone->mo_oid.id_pub) && mrone->mo_dkey.iov_len == 8) + D_INFO(DF_RB ": " DF_UOID " int dkey " DF_U64 ", akey " DF_KEY ", iod_type %d, " + " iod_nr %d, iod_size " DF_U64, + 
DP_RB_MPT(mrone->mo_tls), DP_UOID(mrone->mo_oid), + *(uint64_t *)mrone->mo_dkey.iov_buf, DP_KEY(&iod->iod_name), iod->iod_type, + iod->iod_nr, iod->iod_size); + else + D_INFO(DF_RB ": " DF_UOID " dkey " DF_KEY ", akey " DF_KEY ", iod_type %d, " + " iod_nr %d, iod_size " DF_U64, + DP_RB_MPT(mrone->mo_tls), DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey), + DP_KEY(&iod->iod_name), iod->iod_type, iod->iod_nr, iod->iod_size); + if (iod->iod_type == DAOS_IOD_ARRAY) + for (i = 0; i < min(8, iod->iod_nr); i++) + D_INFO("recxs[%d] - " DF_RECX, i, DP_RECX(iod->iod_recxs[i])); + obj_dump_grp_layout(oh, mrone->mo_oid.id_shard); +} + static int migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, struct ds_cont_child *ds_cont) @@ -1224,6 +1246,8 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, daos_iod_t *iod = &mrone->mo_iods[i]; if (mrone->mo_iods[i].iod_size == 0) { + static __thread int log_nr; + /* zero size iod will cause assertion failure * in VOS, so let's check here. 
* So the object is being destroyed between @@ -1236,11 +1260,16 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, */ rc = -DER_DATA_LOSS; D_DEBUG(DB_REBUILD, - DF_RB ": " DF_UOID " %p dkey " DF_KEY " " DF_KEY - " nr %d/%d eph " DF_U64 " " DF_RC "\n", - DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone, - DP_KEY(&mrone->mo_dkey), DP_KEY(&mrone->mo_iods[i].iod_name), - mrone->mo_iod_num, i, mrone->mo_epoch, DP_RC(rc)); + DF_RB ": cont " DF_UUID " obj " DF_UOID " dkey " DF_KEY " " DF_KEY + " nr %d/%d eph " DF_X64 " " DF_RC "\n", + DP_RB_MRO(mrone), DP_UUID(mrone->mo_cont_uuid), + DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey), + DP_KEY(&mrone->mo_iods[i].iod_name), mrone->mo_iod_num, i, + mrone->mo_epoch, DP_RC(rc)); + if (log_nr <= 128) { + mrone_dump_info(mrone, oh, &mrone->mo_iods[i]); + log_nr++; + } D_GOTO(out, rc); } @@ -1407,6 +1436,8 @@ __migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, for (i = 0; rc == 0 && i < iod_num; i++) { if (iods[i].iod_size == 0) { + static __thread int log_nr; + /* zero size iod will cause assertion failure * in VOS, so let's check here. * So the object is being destroyed between @@ -1418,11 +1449,16 @@ __migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh, * the rebuild and retry. 
*/ rc = -DER_DATA_LOSS; - D_INFO(DF_RB ": " DF_UOID " %p dkey " DF_KEY " " DF_KEY - " nr %d/%d eph " DF_U64 " " DF_RC "\n", - DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone, - DP_KEY(&mrone->mo_dkey), DP_KEY(&iods[i].iod_name), iod_num, i, - mrone->mo_epoch, DP_RC(rc)); + DL_INFO(rc, + DF_RB ": cont " DF_UUID " obj " DF_UOID " dkey " DF_KEY " " DF_KEY + " nr %d/%d mo_epoch " DF_X64 " fetch_eph " DF_X64, + DP_RB_MRO(mrone), DP_UUID(mrone->mo_cont_uuid), + DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey), + DP_KEY(&iods[i].iod_name), iod_num, i, mrone->mo_epoch, fetch_eph); + if (log_nr <= 128) { + mrone_dump_info(mrone, oh, &iods[i]); + log_nr++; + } D_GOTO(end, rc); } } From da0699cde703f978c1d70286399390da26920606 Mon Sep 17 00:00:00 2001 From: Xuezhao Liu Date: Sun, 4 Jan 2026 10:32:44 +0000 Subject: [PATCH 2/5] DAOS-18368 rebuild: a few log change Signed-off-by: Xuezhao Liu --- src/object/obj_internal.h | 2 +- src/object/obj_layout.c | 2 +- src/object/srv_obj.c | 18 ++++++++++-------- src/object/srv_obj_migrate.c | 8 ++++---- src/rebuild/scan.c | 8 ++++---- 5 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index 3f630795f56..ba3191e761b 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ diff --git a/src/object/obj_layout.c b/src/object/obj_layout.c index 337adab92c4..87958b70a11 100644 --- a/src/object/obj_layout.c +++ b/src/object/obj_layout.c @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2023 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 78989187fe9..b08b8981dee 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -1,7 +1,7 @@ /** * (C) Copyright 2016-2024 Intel Corporation. * (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -702,17 +702,19 @@ obj_set_reply_sizes(crt_rpc_t *rpc, daos_iod_t *iods, int iod_nr, uint8_t *skips D_DEBUG(DB_IO, DF_UOID" %d:"DF_U64"\n", DP_UOID(orw->orw_oid), i, iods[idx].iod_size); if ((orw->orw_flags & ORF_FOR_MIGRATION) && sizes[i] == 0) { - D_INFO(DF_CONT " obj " DF_UOID "rebuild fetch zero iod_size, i:%d/idx:%d, " - "iod_nr %d, orw_epoch " DF_X64 ", orw_epoch_first " DF_X64 - " may cause DER_DATA_LOSS", - DP_CONT(orw->orw_pool_uuid, orw->orw_co_uuid), DP_UOID(orw->orw_oid), - i, idx, iods[idx].iod_nr, orw->orw_epoch, orw->orw_epoch_first); + D_DEBUG(DB_REBUILD, + DF_CONT " obj " DF_UOID " rebuild fetch zero iod_size, " + "i:%d/idx:%d, iod_nr %d, orw_epoch " DF_X64 + ", orw_epoch_first " DF_X64 " may cause DER_DATA_LOSS", + DP_CONT(orw->orw_pool_uuid, orw->orw_co_uuid), + DP_UOID(orw->orw_oid), i, idx, iods[idx].iod_nr, orw->orw_epoch, + orw->orw_epoch_first); if (iods[idx].iod_type == DAOS_IOD_ARRAY) { int j; for (j = 0; j < min(8, iods[idx].iod_nr); j++) - D_INFO("recx[%d] - " DF_RECX, j, - DP_RECX(iods[idx].iod_recxs[j])); + D_DEBUG(DB_REBUILD, "recx[%d] - " DF_RECX, j, + DP_RECX(iods[idx].iod_recxs[j])); } } idx++; diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 3809780f93e..26bcf9cd1ca 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -790,7 +790,7 @@ 
migrate_fetch_update_inline(struct migrate_one *mrone, daos_handle_t oh, struct dcs_iod_csums *iod_csums = NULL; int iod_cnt = 0; int start; - char iov_buf[OBJ_ENUM_UNPACK_MAX_IODS][MAX_BUF_SIZE]; + char iov_buf[OBJ_ENUM_UNPACK_MAX_IODS][MAX_BUF_SIZE]; bool fetch = false; int i; int rc = 0; @@ -1259,13 +1259,13 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh, * the rebuild and retry. */ rc = -DER_DATA_LOSS; - D_DEBUG(DB_REBUILD, + DL_INFO(rc, DF_RB ": cont " DF_UUID " obj " DF_UOID " dkey " DF_KEY " " DF_KEY - " nr %d/%d eph " DF_X64 " " DF_RC "\n", + " nr %d/%d eph " DF_X64, DP_RB_MRO(mrone), DP_UUID(mrone->mo_cont_uuid), DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey), DP_KEY(&mrone->mo_iods[i].iod_name), mrone->mo_iod_num, i, - mrone->mo_epoch, DP_RC(rc)); + mrone->mo_epoch); if (log_nr <= 128) { mrone_dump_info(mrone, oh, &mrone->mo_iods[i]); log_nr++; diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index e38e9d73e00..75fbc692702 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -68,9 +68,9 @@ rebuild_obj_fill_buf(daos_handle_t ih, d_iov_t *key_iov, shards[count] = obj_val->shard; arg->count++; - D_DEBUG(DB_REBUILD, "send oid/con "DF_UOID"/"DF_UUID" ephs "DF_U64 - "shard %d cnt %d tgt_id %d\n", DP_UOID(oids[count]), - DP_UUID(arg->cont_uuid), obj_val->eph, shards[count], + D_DEBUG(DB_REBUILD, + "send oid/con " DF_UOID "/" DF_UUID " ephs " DF_X64 " shard %d cnt %d tgt_id %d\n", + DP_UOID(oids[count]), DP_UUID(arg->cont_uuid), obj_val->eph, shards[count], arg->count, arg->tgt_id); rc = dbtree_iter_delete(ih, NULL); From aac91d7523266d6492b93eb6fecb6a3d589c287a Mon Sep 17 00:00:00 2001 From: Xuezhao Liu Date: Wed, 7 Jan 2026 07:16:34 +0000 Subject: [PATCH 3/5] DAOS-18368 rebuild: refine rebuild_leader_status_check Signed-off-by: Xuezhao Liu --- src/object/srv_obj_migrate.c | 4 +-- src/rebuild/srv.c | 47 +++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 26bcf9cd1ca..c17da3a9527 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -3070,8 +3070,8 @@ migrate_one_epoch_object(daos_epoch_range_t *epr, struct migrate_pool_tls *tls, /* Each object enumeration RPC will at least one OID */ if (num < minimum_nr && (enum_flags & DIOF_TO_SPEC_GROUP)) { - D_DEBUG(DB_REBUILD, DF_RB ": enumeration buffer %u empty" DF_UOID "\n", - DP_RB_MPT(tls), num, DP_UOID(arg->oid)); + D_INFO(DF_RB ": enumeration buffer %u empty" DF_UOID, DP_RB_MPT(tls), num, + DP_UOID(arg->oid)); break; } diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 88e6b53c851..28941d1df43 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -958,53 +958,63 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, char sbuf[RBLD_SBUF_LEN]; double now; char *str; - d_rank_list_t excluded = {0}; + d_rank_list_t rank_list = {0}; bool rebuild_abort = false; int i; + now = ABT_get_wtime(); ABT_rwlock_rdlock(pool->sp_lock); rc = map_ranks_init(pool->sp_map, - PO_COMP_ST_UP | PO_COMP_ST_DOWN | - PO_COMP_ST_DOWNOUT | PO_COMP_ST_NEW, - &excluded); + PO_COMP_ST_UP | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | + PO_COMP_ST_NEW, + &rank_list); if (rc != 0) { D_INFO(DF_RB ": get rank list: %d\n", DP_RB_RGT(rgt), rc); ABT_rwlock_unlock(pool->sp_lock); goto sleep; } - for (i = 0; i < excluded.rl_nr; i++) { + for (i = 0; i < rank_list.rl_nr; i++) { struct pool_domain *dom; - dom = pool_map_find_dom_by_rank(pool->sp_map, excluded.rl_ranks[i]); + dom = pool_map_find_dom_by_rank(pool->sp_map, rank_list.rl_ranks[i]); D_ASSERT(dom != NULL); if (rgt->rgt_opc == RB_OP_REBUILD) { if (dom->do_comp.co_status == PO_COMP_ST_UP) { if (dom->do_comp.co_in_ver > rgt->rgt_rebuild_ver) { - D_INFO(DF_RB ": cancel rebuild co_in_ver=%u\n", - DP_RB_RGT(rgt), dom->do_comp.co_in_ver); + D_INFO(DF_RB ": cancel rebuild due to new REINT, " + "co_rank %d, co_in_ver %u\n", + DP_RB_RGT(rgt), dom->do_comp.co_rank, + dom->do_comp.co_in_ver); rebuild_abort = true; break; - } else { - continue; } } else if (dom->do_comp.co_status == PO_COMP_ST_DOWN) { if (dom->do_comp.co_fseq > rgt->rgt_rebuild_ver) { - D_INFO(DF_RB ": cancel rebuild co_fseq=%u\n", - DP_RB_RGT(rgt), dom->do_comp.co_fseq); + D_INFO(DF_RB ": cancel rebuild due to new DOWN, " + "co_rank %d, co_fseq %u\n", + DP_RB_RGT(rgt), dom->do_comp.co_rank, + dom->do_comp.co_fseq); rebuild_abort = true; break; } } } - D_INFO(DF_RB " exclude rank %d/%x.\n", DP_RB_RGT(rgt), dom->do_comp.co_rank, - 
dom->do_comp.co_status); - rebuild_leader_set_status(rgt, dom->do_comp.co_rank, - -1, SCAN_DONE | PULL_DONE); + + if (now - last_print > 20) + D_INFO(DF_RB " rank %d, status 0x%x.\n", DP_RB_RGT(rgt), + dom->do_comp.co_rank, dom->do_comp.co_status); + + /* for PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | PO_COMP_ST_NEW ranks + * set the completion as no progress/completion will be reported from them. + */ + if (dom->do_comp.co_status != PO_COMP_ST_UP) + rebuild_leader_set_status(rgt, dom->do_comp.co_rank, -1, + SCAN_DONE | PULL_DONE); } ABT_rwlock_unlock(pool->sp_lock); - map_ranks_fini(&excluded); + map_ranks_fini(&rank_list); if (rebuild_abort) { rgt->rgt_abort = 1; @@ -1048,7 +1058,6 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, break; } - now = ABT_get_wtime(); /* print something at least for each 10 seconds */ if (now - last_print > 10) { last_print = now; From d1200f27cae09ae4d3826a5efbb1d581835fed63 Mon Sep 17 00:00:00 2001 From: Xuezhao Liu Date: Thu, 8 Jan 2026 12:30:21 +0000 Subject: [PATCH 4/5] DAOS-18368 object: refine EC agg peer update Some failures need to be retried. Signed-off-by: Xuezhao Liu --- src/object/srv_ec_aggregate.c | 73 ++++++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 9 deletions(-) diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index 96abd078284..708f77540a8 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -1,7 +1,7 @@ /** * (C) Copyright 2020-2024 Intel Corporation. 
* (C) Copyright 2025 Google LLC - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1278,6 +1278,42 @@ agg_process_partial_stripe(struct ec_agg_entry *entry) return rc; } +static bool +agg_peer_failed(struct ec_agg_param *agg_param, struct daos_shard_loc *peer_loc) +{ + struct pool_target *targets = NULL; + uint32_t failed_tgts_cnt = 0; + int i; + int rc; + + rc = pool_map_find_failed_tgts(agg_param->ap_pool_info.api_pool->sp_map, &targets, + &failed_tgts_cnt); + if (rc) { + DL_ERROR(rc, DF_CONT " pool_map_find_failed_tgts failed.", + DP_CONT(agg_param->ap_pool_info.api_pool_uuid, + agg_param->ap_pool_info.api_cont_uuid)); + return false; + } + + if (targets == NULL || failed_tgts_cnt == 0) + return false; + + for (i = 0; i < failed_tgts_cnt; i++) { + if (targets[i].ta_comp.co_rank == peer_loc->sd_rank && + targets[i].ta_comp.co_index == peer_loc->sd_tgt_idx) { + D_DEBUG(DB_EPC, DF_CONT " peer parity tgt failed rank %d, tgt_idx %d.\n", + DP_CONT(agg_param->ap_pool_info.api_pool_uuid, + agg_param->ap_pool_info.api_cont_uuid), + peer_loc->sd_rank, peer_loc->sd_tgt_idx); + D_FREE(targets); + return true; + } + } + + D_FREE(targets); + return false; +} + int agg_peer_check_avail(struct ec_agg_param *agg_param, struct ec_agg_entry *entry) { @@ -1334,6 +1370,12 @@ agg_peer_check_avail(struct ec_agg_param *agg_param, struct ec_agg_entry *entry) return rc; } +static bool +agg_peer_retryable_err(int err) +{ + return err == -DER_STALE || err == -DER_TIMEDOUT || daos_crt_network_error(err); +} + /* Sends the generated parity and the stripe number to the peer * parity target. Handler writes the parity and deletes the replicas * for the stripe. 
@@ -1382,7 +1424,7 @@ agg_peer_update_ult(void *arg) obj = obj_hdl2ptr(entry->ae_obj_hdl); for (peer = 0; peer < p; peer++) { uint64_t enqueue_id = 0; - bool overloaded; + bool peer_retry; if (peer == pidx) continue; @@ -1390,7 +1432,7 @@ agg_peer_update_ult(void *arg) tgt_ep.ep_rank = entry->ae_peer_pshards[peer].sd_rank; tgt_ep.ep_tag = entry->ae_peer_pshards[peer].sd_tgt_idx; retry: - overloaded = false; + peer_retry = false; rc = ds_obj_req_create(dss_get_module_info()->dmi_ctx, &tgt_ep, DAOS_OBJ_RPC_EC_AGGREGATE, &rpc); if (rc) { @@ -1470,13 +1512,20 @@ agg_peer_update_ult(void *arg) rc = ec_agg_out->ea_status; if (rc == -DER_OVERLOAD_RETRY) { enqueue_id = ec_agg_out->ea_comm_out.req_out_enqueue_id; - overloaded = true; + peer_retry = true; } D_CDEBUG(rc == 0, DB_TRACE, DLOG_ERR, "update parity[%d] to %d:%d, status = " DF_RC "\n", peer, tgt_ep.ep_rank, tgt_ep.ep_tag, DP_RC(rc)); peer_updated += rc == 0; } + if (rc != 0 && peer_updated && agg_peer_retryable_err(rc) && + !agg_peer_failed(agg_param, &entry->ae_peer_pshards[peer])) { + DL_INFO(rc, DF_UOID " pidx %d to parity[%d] will retry.", + DP_UOID(entry->ae_oid), pidx, peer); + peer_retry = true; + } + next: if (bulk_hdl) crt_bulk_free(bulk_hdl); @@ -1487,7 +1536,7 @@ agg_peer_update_ult(void *arg) rpc = NULL; bulk_hdl = NULL; iod_csums = NULL; - if (overloaded) { + if (peer_retry) { dss_sleep(daos_rpc_rand_delay(max_delay) << 10); goto retry; } @@ -1665,13 +1714,13 @@ agg_process_holes_ult(void *arg) for (peer = 0; peer < p; peer++) { uint64_t enqueue_id = 0; uint32_t peer_shard; - bool overloaded; + bool peer_retry; if (pidx == peer) continue; retry: - overloaded = false; + peer_retry = false; D_ASSERT(entry->ae_peer_pshards[peer].sd_rank != DAOS_TGT_IGNORE); tgt_ep.ep_rank = entry->ae_peer_pshards[peer].sd_rank; tgt_ep.ep_tag = entry->ae_peer_pshards[peer].sd_tgt_idx; @@ -1719,7 +1768,7 @@ agg_process_holes_ult(void *arg) rc = ec_rep_out->er_status; if (rc == -DER_OVERLOAD_RETRY) { enqueue_id = 
ec_rep_out->er_comm_out.req_out_enqueue_id; - overloaded = true; + peer_retry = true; } D_CDEBUG(rc == 0, DB_TRACE, DLOG_ERR, DF_UOID " parity[%d] er_status = " DF_RC "\n", @@ -1728,7 +1777,13 @@ agg_process_holes_ult(void *arg) } crt_req_decref(rpc); rpc = NULL; - if (overloaded) { + if (rc != 0 && peer_updated && agg_peer_retryable_err(rc) && + !agg_peer_failed(agg_param, &entry->ae_peer_pshards[peer])) { + DL_INFO(rc, DF_UOID " pidx %d to parity[%d] will retry.", + DP_UOID(entry->ae_oid), pidx, peer); + peer_retry = true; + } + if (peer_retry) { dss_sleep(daos_rpc_rand_delay(max_delay) << 10); goto retry; } From 90de18f4c0a3a1bd7b95f40ec9082f20016484ce Mon Sep 17 00:00:00 2001 From: Xuezhao Liu Date: Fri, 9 Jan 2026 08:40:54 +0000 Subject: [PATCH 5/5] DAOS-18368 rebuild: fix a possible rebuild stuck A reintegrating (UP) rank is excluded from rebuild/reclaim if its co_in_ver exceeds the rebuild version; the rebuild leader should mark such a rank as completed to avoid a possible hang. Refine the dtx_resync wait handling: there is no need to wait again if the resync was already done previously. Add some logs. Signed-off-by: Xuezhao Liu --- src/include/daos_srv/pool.h | 3 ++- src/rebuild/scan.c | 41 ++++++++++++++++++--------------- src/rebuild/srv.c | 46 ++++++++++++++++++++++++++----------- 3 files changed, 58 insertions(+), 32 deletions(-) diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index e27f0bd89a4..9ad67d3e170 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -80,6 +80,7 @@ struct ds_pool { struct sched_request *sp_ec_ephs_req; uint32_t sp_dtx_resync_version; + uint32_t sp_gl_dtx_resync_version; /* global DTX resync version */ /* Special pool/container handle uuid, which are * created on the pool leader step up, and propagated * to all servers by IV. Then they will be used by server diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 75fbc692702..61f8d86680c 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1078,13 +1078,21 @@ static void rebuild_scan_leader(void *data) { struct rebuild_tgt_pool_tracker *rpt = data; - struct rebuild_pool_tls *tls; - int rc; - bool wait = false; - - D_DEBUG(DB_REBUILD, DF_RB " check resync %u/%u < %u\n", DP_RB_RPT(rpt), - rpt->rt_pool->sp_dtx_resync_version, rpt->rt_global_dtx_resync_version, - rpt->rt_rebuild_ver); + struct rebuild_pool_tls *tls; + int rc; + + if (rpt->rt_pool->sp_gl_dtx_resync_version >= rpt->rt_rebuild_ver) { + D_DEBUG(DB_REBUILD, DF_RB " sp_gl_dtx_resync_version %d exceed rt_rebuild_ver %d.", + DP_RB_RPT(rpt), rpt->rt_pool->sp_gl_dtx_resync_version, + rpt->rt_rebuild_ver); + if (rpt->rt_global_dtx_resync_version < rpt->rt_pool->sp_gl_dtx_resync_version) + rpt->rt_global_dtx_resync_version = rpt->rt_pool->sp_gl_dtx_resync_version; + goto do_scan; + } else { + D_DEBUG(DB_REBUILD, DF_RB " check resync %u/%u < %u\n", DP_RB_RPT(rpt), + rpt->rt_pool->sp_dtx_resync_version, rpt->rt_global_dtx_resync_version, + rpt->rt_rebuild_ver); + } /* Wait for dtx resync to finish */ while (rpt->rt_global_dtx_resync_version < rpt->rt_rebuild_ver) { @@ -1093,7 +1101,6 @@ rebuild_scan_leader(void *data) if (rpt->rt_global_dtx_resync_version < rpt->rt_rebuild_ver) { D_INFO(DF_RB " wait for global dtx %u\n", DP_RB_RPT(rpt), rpt->rt_global_dtx_resync_version); - wait = true; 
ABT_cond_wait(rpt->rt_global_dtx_wait_cond, rpt->rt_lock); } ABT_mutex_unlock(rpt->rt_lock); @@ -1103,23 +1110,21 @@ rebuild_scan_leader(void *data) D_GOTO(out, rc = -DER_SHUTDOWN); } } + if (rpt->rt_pool->sp_gl_dtx_resync_version < rpt->rt_global_dtx_resync_version) { + rpt->rt_pool->sp_gl_dtx_resync_version = rpt->rt_global_dtx_resync_version; + D_INFO(DF_RB " update sp_gl_dtx_resync_version to %d", DP_RB_RPT(rpt), + rpt->rt_pool->sp_gl_dtx_resync_version); + } - if (wait) - D_INFO(DF_RB " scan collective begin\n", DP_RB_RPT(rpt)); - else - D_DEBUG(DB_REBUILD, DF_RB " scan collective begin\n", DP_RB_RPT(rpt)); - +do_scan: + D_INFO(DF_RB " scan collective begin\n", DP_RB_RPT(rpt)); rc = ds_pool_thread_collective(rpt->rt_pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, rebuild_scanner, rpt, DSS_ULT_DEEP_STACK); if (rc) D_GOTO(out, rc); - if (wait) - D_INFO(DF_RB " rebuild scan collective done\n", DP_RB_RPT(rpt)); - else - D_DEBUG(DB_REBUILD, DF_RB "rebuild scan collective done\n", DP_RB_RPT(rpt)); - + D_INFO(DF_RB " rebuild scan collective done\n", DP_RB_RPT(rpt)); ABT_mutex_lock(rpt->rt_lock); rc = ds_pool_task_collective(rpt->rt_pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, rebuild_scan_done, rpt, 0); diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 28941d1df43..5dad600030e 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -288,6 +288,9 @@ rebuild_leader_set_status(struct rebuild_global_pool_tracker *rgt, return; } + if (status->dtx_resync_version != resync_ver) + D_INFO(DF_RB " rank %d, update dtx_resync_version from %d to %d", DP_RB_RGT(rgt), + rank, status->dtx_resync_version, resync_ver); status->dtx_resync_version = resync_ver; if (flags & SCAN_DONE) status->scan_done = 1; @@ -309,6 +312,7 @@ rebuild_leader_set_update_time(struct rebuild_global_pool_tracker *rgt, d_rank_t D_INFO("rank %u is not included in this rebuild.\n", rank); } +#define RB_DTX_RESYNC_VER_SKIP ((uint32_t)-1) static uint32_t 
rebuild_get_global_dtx_resync_ver(struct rebuild_global_pool_tracker *rgt) { @@ -318,7 +322,7 @@ rebuild_get_global_dtx_resync_ver(struct rebuild_global_pool_tracker *rgt) D_ASSERT(rgt->rgt_servers_number > 0); D_ASSERT(rgt->rgt_servers != NULL); for (i = 0; i < rgt->rgt_servers_number; i++) { - if (rgt->rgt_servers[i].dtx_resync_version == (uint32_t)(-1)) + if (rgt->rgt_servers[i].dtx_resync_version == RB_DTX_RESYNC_VER_SKIP) continue; if (min > rgt->rgt_servers[i].dtx_resync_version) @@ -1006,11 +1010,17 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, D_INFO(DF_RB " rank %d, status 0x%x.\n", DP_RB_RGT(rgt), dom->do_comp.co_rank, dom->do_comp.co_status); - /* for PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | PO_COMP_ST_NEW ranks - * set the completion as no progress/completion will be reported from them. + /* Some engines don't participate the rebuild that will not report + * progress/completion or dtx resync version through IV, mark the complete/ + * skip. + * 1) PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | PO_COMP_ST_NEW ranks + * 2) PO_COMP_ST_UP but co_in_ver > rebuild_ver also will be excluded from + * rebuild request, see rebuild_scan_broadcast(). 
*/ - if (dom->do_comp.co_status != PO_COMP_ST_UP) - rebuild_leader_set_status(rgt, dom->do_comp.co_rank, -1, + if (dom->do_comp.co_status != PO_COMP_ST_UP || + dom->do_comp.co_in_ver > rgt->rgt_rebuild_ver) + rebuild_leader_set_status(rgt, dom->do_comp.co_rank, + RB_DTX_RESYNC_VER_SKIP, SCAN_DONE | PULL_DONE); } ABT_rwlock_unlock(pool->sp_lock); @@ -1313,11 +1323,15 @@ rebuild_scan_broadcast(struct ds_pool *pool, struct rebuild_global_pool_tracker dom = pool_map_find_dom_by_rank(pool->sp_map, up_ranks.rl_ranks[i]); D_ASSERT(dom != NULL); - D_DEBUG(DB_REBUILD, DF_RB " rank %u co_in_ver %u\n", DP_RB_RGT(rgt), - up_ranks.rl_ranks[i], dom->do_comp.co_in_ver); - if (dom->do_comp.co_in_ver < rgt->rgt_rebuild_ver) + D_DEBUG(DB_REBUILD, DF_RB " rank %u co_in_ver %u, rebuild_ver %u.\n", + DP_RB_RGT(rgt), up_ranks.rl_ranks[i], dom->do_comp.co_in_ver, + rgt->rgt_rebuild_ver); + if (dom->do_comp.co_in_ver <= rgt->rgt_rebuild_ver) continue; + D_INFO(DF_RB " bypass UP rank %u co_in_ver %u exceed rebuild_ver %u\n", + DP_RB_RGT(rgt), up_ranks.rl_ranks[i], dom->do_comp.co_in_ver, + rgt->rgt_rebuild_ver); excluded->rl_ranks[nr++] = up_ranks.rl_ranks[i]; } excluded->rl_nr = nr; @@ -1327,13 +1341,11 @@ rebuild_scan_broadcast(struct ds_pool *pool, struct rebuild_global_pool_tracker rc = ds_pool_bcast_create(dss_get_module_info()->dmi_ctx, pool, DAOS_REBUILD_MODULE, REBUILD_OBJECTS_SCAN, rebuild_ver, &rpc, NULL, excluded, NULL); if (rc != 0) { - DL_ERROR(rc, DF_RB " pool map broadcast failed", DP_RB_RGT(rgt)); + DL_ERROR(rc, DF_RB " failed to create scan broadcast request", DP_RB_RGT(rgt)); D_GOTO(out, rc); } rsi = crt_req_get(rpc); - D_DEBUG(DB_REBUILD, DF_RB " scan broadcast\n", DP_RB_RGT(rgt)); - uuid_copy(rsi->rsi_pool_uuid, pool->sp_uuid); rsi->rsi_ns_id = pool->sp_iv_ns->iv_ns_id; rsi->rsi_leader_term = rgt->rgt_leader_term; @@ -1352,11 +1364,13 @@ rebuild_scan_broadcast(struct ds_pool *pool, struct rebuild_global_pool_tracker rso = crt_reply_get(rpc); if (rc == 0) rc = 
rso->rso_status; + else + DL_ERROR(rc, DF_RB " scan broadcast send failed.", DP_RB_RGT(rgt)); rgt->rgt_init_scan = 1; rgt->rgt_stable_epoch = rso->rso_stable_epoch; - D_DEBUG(DB_REBUILD, DF_RB " " DF_RC " got stable/reclaim epoch " DF_X64 "/" DF_X64 "\n", - DP_RB_RGT(rgt), DP_RC(rc), rgt->rgt_stable_epoch, rgt->rgt_reclaim_epoch); + DL_INFO(rc, DF_RB " got stable/reclaim epoch " DF_X64 "/" DF_X64, DP_RB_RGT(rgt), + rgt->rgt_stable_epoch, rgt->rgt_reclaim_epoch); crt_req_decref(rpc); out: if (excluded) @@ -2787,6 +2801,7 @@ rebuild_tgt_status_check_ult(void *arg) { struct rebuild_tgt_pool_tracker *rpt = arg; struct sched_req_attr attr = { 0 }; + uint32_t reported_dtx_resyc_ver = 0; D_ASSERT(rpt != NULL); sched_req_attr_init(&attr, SCHED_REQ_MIGRATE, &rpt->rt_pool_uuid); @@ -2890,6 +2905,11 @@ rebuild_tgt_status_check_ult(void *arg) rpt->rt_reported_obj_cnt = status.obj_count; rpt->rt_reported_rec_cnt = status.rec_count; rpt->rt_reported_size = status.size; + if (iv.riv_dtx_resyc_version > reported_dtx_resyc_ver) { + D_INFO(DF_RB " reported riv_dtx_resyc_version %d", + DP_RB_RPT(rpt), iv.riv_dtx_resyc_version); + reported_dtx_resyc_ver = iv.riv_dtx_resyc_version; + } } else { DL_WARN(rc, DF_RB " rebuild iv update failed", DP_RB_RPT(rpt)); /* Already finished rebuild, cannot find rebuild status on leader