Skip to content

Commit 10fa58e

Browse files
committed
DAOS-18368 rebuild: fix use before check of ec_agg_boundary
1. Fix a bug where ec_agg_boundary was used before checking that it is valid. 2. Add more logs for the case where a rebuild fetch gets a zero iod_size, to provide hints about the layout information. Signed-off-by: Xuezhao Liu <[email protected]>
1 parent 4a8bc72 commit 10fa58e

File tree

4 files changed

+140
-28
lines changed

4 files changed

+140
-28
lines changed

src/object/obj_internal.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,6 +1181,8 @@ iov_alloc_for_csum_info(d_iov_t *iov, struct dcs_csum_info *csum_info);
11811181
/* obj_layout.c */
11821182
int
11831183
obj_pl_grp_idx(uint32_t layout_gl_ver, uint64_t hash, uint32_t grp_nr);
1184+
void
1185+
obj_dump_grp_layout(daos_handle_t oh, uint32_t shard);
11841186

11851187
int
11861188
obj_pl_place(struct pl_map *map, uint16_t layout_ver, struct daos_obj_md *md,

src/object/obj_layout.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*
22
* (C) Copyright 2016-2023 Intel Corporation.
3+
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
34
*
45
* SPDX-License-Identifier: BSD-2-Clause-Patent
56
*/
@@ -95,3 +96,36 @@ obj_layout_diff(struct pl_map *map, daos_unit_oid_t oid, uint32_t new_ver, uint3
9596

9697
return rc;
9798
}
99+
100+
void
101+
obj_dump_grp_layout(daos_handle_t oh, uint32_t shard)
102+
{
103+
struct dc_object *obj;
104+
struct dc_obj_shard *obj_shard;
105+
uint32_t grp_idx, i, nr;
106+
107+
obj = obj_hdl2ptr(oh);
108+
if (obj == NULL) {
109+
D_INFO("invalid oh");
110+
return;
111+
}
112+
if (shard >= obj->cob_shards_nr) {
113+
D_ERROR("bad shard %d, cob_shards_nr %d", shard, obj->cob_shards_nr);
114+
goto out;
115+
}
116+
117+
grp_idx = shard / obj->cob_grp_size;
118+
D_INFO(DF_OID " shard %d, grp_idx %d, grp_size %d", DP_OID(obj->cob_md.omd_id), shard,
119+
grp_idx, obj->cob_grp_size);
120+
for (i = grp_idx * obj->cob_grp_size, nr = 0; nr < obj->cob_grp_size; i++, nr++) {
121+
obj_shard = &obj->cob_shards->do_shards[i];
122+
D_INFO("shard %d/%d/%d, tgt_id %d, rank %d, tgt_idx %d, "
123+
"rebuilding %d, reintegrating %d, fseq %d",
124+
i, obj_shard->do_shard_idx, obj_shard->do_shard, obj_shard->do_target_id,
125+
obj_shard->do_target_rank, obj_shard->do_target_idx,
126+
obj_shard->do_rebuilding, obj_shard->do_reintegrating, obj_shard->do_fseq);
127+
}
128+
129+
out:
130+
obj_decref(obj);
131+
}

src/object/srv_obj.c

Lines changed: 58 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -701,6 +701,20 @@ obj_set_reply_sizes(crt_rpc_t *rpc, daos_iod_t *iods, int iod_nr, uint8_t *skips
701701
sizes[i] = iods[idx].iod_size;
702702
D_DEBUG(DB_IO, DF_UOID" %d:"DF_U64"\n", DP_UOID(orw->orw_oid),
703703
i, iods[idx].iod_size);
704+
if ((orw->orw_flags & ORF_FOR_MIGRATION) && sizes[i] == 0) {
705+
D_INFO(DF_CONT " obj " DF_UOID "rebuild fetch zero iod_size, i:%d/idx:%d, "
706+
"iod_nr %d, orw_epoch " DF_X64 ", orw_epoch_first " DF_X64
707+
" may cause DER_DATA_LOSS",
708+
DP_CONT(orw->orw_pool_uuid, orw->orw_co_uuid), DP_UOID(orw->orw_oid),
709+
i, idx, iods[idx].iod_nr, orw->orw_epoch, orw->orw_epoch_first);
710+
if (iods[idx].iod_type == DAOS_IOD_ARRAY) {
711+
int j;
712+
713+
for (j = 0; j < min(8, iods[idx].iod_nr); j++)
714+
D_INFO("recx[%d] - " DF_RECX, j,
715+
DP_RECX(iods[idx].iod_recxs[j]));
716+
}
717+
}
704718
idx++;
705719
}
706720

@@ -1368,7 +1382,7 @@ struct ec_agg_boundary_arg {
13681382
};
13691383

13701384
static int
1371-
obj_fetch_ec_agg_boundary(void *data)
1385+
obj_fetch_ec_agg_boundary_ult(void *data)
13721386
{
13731387
struct ec_agg_boundary_arg *arg = data;
13741388
int rc;
@@ -1381,6 +1395,33 @@ obj_fetch_ec_agg_boundary(void *data)
13811395
return rc;
13821396
}
13831397

1398+
static int
1399+
obj_fetch_ec_agg_boundary(struct obj_io_context *ioc, daos_unit_oid_t *uoid)
1400+
{
1401+
struct ec_agg_boundary_arg arg;
1402+
int rc;
1403+
1404+
arg.eab_pool = ioc->ioc_coc->sc_pool->spc_pool;
1405+
uuid_copy(arg.eab_co_uuid, ioc->ioc_coc->sc_uuid);
1406+
rc = dss_ult_execute(obj_fetch_ec_agg_boundary_ult, &arg, NULL, NULL, DSS_XS_SYS, 0, 0);
1407+
if (rc) {
1408+
DL_ERROR(rc, DF_CONT ", " DF_UOID " fetch ec_agg_boundary failed.",
1409+
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid),
1410+
DP_UOID(*uoid));
1411+
return rc;
1412+
}
1413+
if (ioc->ioc_coc->sc_ec_agg_eph_valid == 0) {
1414+
rc = -DER_FETCH_AGAIN;
1415+
DL_INFO(rc, DF_CONT ", " DF_UOID " zero ec_agg_boundary.",
1416+
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(*uoid));
1417+
return rc;
1418+
}
1419+
D_DEBUG(DB_IO, DF_CONT ", " DF_UOID " fetched ec_agg_eph_boundary " DF_X64 "\n",
1420+
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(*uoid),
1421+
ioc->ioc_coc->sc_ec_agg_eph_boundary);
1422+
return 0;
1423+
}
1424+
13841425
static int
13851426
obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *iods,
13861427
struct dcs_iod_csums *iod_csums, uint64_t *offs, uint8_t *skips,
@@ -1503,29 +1544,14 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io
15031544
}
15041545
if ((ec_deg_fetch || (ec_recov && get_parity_list)) &&
15051546
ioc->ioc_coc->sc_ec_agg_eph_valid == 0) {
1506-
struct ec_agg_boundary_arg arg;
1507-
1508-
arg.eab_pool = ioc->ioc_coc->sc_pool->spc_pool;
1509-
uuid_copy(arg.eab_co_uuid, ioc->ioc_coc->sc_uuid);
1510-
rc = dss_ult_execute(obj_fetch_ec_agg_boundary, &arg, NULL, NULL,
1511-
DSS_XS_SYS, 0, 0);
1547+
rc = obj_fetch_ec_agg_boundary(ioc, &orw->orw_oid);
15121548
if (rc) {
15131549
DL_ERROR(rc, DF_CONT ", " DF_UOID " fetch ec_agg_boundary failed.",
15141550
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid),
15151551
DP_UOID(orw->orw_oid));
15161552
goto out;
15171553
}
1518-
if (ioc->ioc_coc->sc_ec_agg_eph_valid == 0) {
1519-
rc = -DER_FETCH_AGAIN;
1520-
DL_INFO(rc, DF_CONT ", " DF_UOID " zero ec_agg_boundary.",
1521-
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid),
1522-
DP_UOID(orw->orw_oid));
1523-
goto out;
1524-
}
1525-
D_DEBUG(DB_IO,
1526-
DF_CONT ", " DF_UOID " fetched ec_agg_eph_boundary " DF_X64 "\n",
1527-
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid),
1528-
DP_UOID(orw->orw_oid), ioc->ioc_coc->sc_ec_agg_eph_boundary);
1554+
D_ASSERT(ioc->ioc_coc->sc_ec_agg_eph_valid);
15291555
}
15301556
if (get_parity_list) {
15311557
D_ASSERT(!ec_deg_fetch);
@@ -3030,6 +3056,20 @@ ds_obj_rw_handler(crt_rpc_t *rpc)
30303056
if (orw->orw_flags & ORF_FETCH_EPOCH_EC_AGG_BOUNDARY) {
30313057
uint64_t rebuild_epoch;
30323058

3059+
if (ioc.ioc_coc->sc_ec_agg_eph_valid == 0) {
3060+
rc = obj_fetch_ec_agg_boundary(&ioc, &orw->orw_oid);
3061+
if (rc) {
3062+
DL_ERROR(rc,
3063+
DF_CONT ", " DF_UOID " fetch ec_agg_boundary "
3064+
"failed.",
3065+
DP_CONT(ioc.ioc_coc->sc_pool_uuid,
3066+
ioc.ioc_coc->sc_uuid),
3067+
DP_UOID(orw->orw_oid));
3068+
goto out;
3069+
}
3070+
D_ASSERT(ioc.ioc_coc->sc_ec_agg_eph_valid);
3071+
}
3072+
30333073
D_ASSERTF(orw->orw_epoch <= orw->orw_epoch_first,
30343074
"bad orw_epoch " DF_X64 ", orw_epoch_first " DF_X64 "\n",
30353075
orw->orw_epoch, orw->orw_epoch_first);

src/object/srv_obj_migrate.c

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1156,6 +1156,28 @@ migrate_fetch_update_parity(struct migrate_one *mrone, daos_handle_t oh,
11561156
return rc;
11571157
}
11581158

1159+
static void
1160+
mrone_dump_info(struct migrate_one *mrone, daos_handle_t oh, daos_iod_t *iod)
1161+
{
1162+
int i;
1163+
1164+
if (daos_is_dkey_uint64(mrone->mo_oid.id_pub) && mrone->mo_dkey.iov_len == 8)
1165+
D_INFO(DF_RB ": " DF_UOID " int dkey " DF_U64 ", akey " DF_KEY ", iod_type %d, "
1166+
" iod_nr %d, iod_size " DF_U64,
1167+
DP_RB_MPT(mrone->mo_tls), DP_UOID(mrone->mo_oid),
1168+
*(uint64_t *)mrone->mo_dkey.iov_buf, DP_KEY(&iod->iod_name), iod->iod_type,
1169+
iod->iod_nr, iod->iod_size);
1170+
else
1171+
D_INFO(DF_RB ": " DF_UOID " dkey " DF_KEY ", akey " DF_KEY ", iod_type %d, "
1172+
" iod_nr %d, iod_size " DF_U64,
1173+
DP_RB_MPT(mrone->mo_tls), DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey),
1174+
DP_KEY(&iod->iod_name), iod->iod_type, iod->iod_nr, iod->iod_size);
1175+
if (iod->iod_type == DAOS_IOD_ARRAY)
1176+
for (i = 0; i < min(8, iod->iod_nr); i++)
1177+
D_INFO("recxs[%d] - " DF_RECX, i, DP_RECX(iod->iod_recxs[i]));
1178+
obj_dump_grp_layout(oh, mrone->mo_oid.id_shard);
1179+
}
1180+
11591181
static int
11601182
migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh,
11611183
struct ds_cont_child *ds_cont)
@@ -1224,6 +1246,8 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh,
12241246
daos_iod_t *iod = &mrone->mo_iods[i];
12251247

12261248
if (mrone->mo_iods[i].iod_size == 0) {
1249+
static __thread int log_nr;
1250+
12271251
/* zero size iod will cause assertion failure
12281252
* in VOS, so let's check here.
12291253
* So the object is being destroyed between
@@ -1236,11 +1260,16 @@ migrate_fetch_update_single(struct migrate_one *mrone, daos_handle_t oh,
12361260
*/
12371261
rc = -DER_DATA_LOSS;
12381262
D_DEBUG(DB_REBUILD,
1239-
DF_RB ": " DF_UOID " %p dkey " DF_KEY " " DF_KEY
1240-
" nr %d/%d eph " DF_U64 " " DF_RC "\n",
1241-
DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone,
1242-
DP_KEY(&mrone->mo_dkey), DP_KEY(&mrone->mo_iods[i].iod_name),
1243-
mrone->mo_iod_num, i, mrone->mo_epoch, DP_RC(rc));
1263+
DF_RB ": cont " DF_UUID " obj " DF_UOID " dkey " DF_KEY " " DF_KEY
1264+
" nr %d/%d eph " DF_X64 " " DF_RC "\n",
1265+
DP_RB_MRO(mrone), DP_UUID(mrone->mo_cont_uuid),
1266+
DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey),
1267+
DP_KEY(&mrone->mo_iods[i].iod_name), mrone->mo_iod_num, i,
1268+
mrone->mo_epoch, DP_RC(rc));
1269+
if (log_nr <= 128) {
1270+
mrone_dump_info(mrone, oh, &mrone->mo_iods[i]);
1271+
log_nr++;
1272+
}
12441273
D_GOTO(out, rc);
12451274
}
12461275

@@ -1407,6 +1436,8 @@ __migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh,
14071436

14081437
for (i = 0; rc == 0 && i < iod_num; i++) {
14091438
if (iods[i].iod_size == 0) {
1439+
static __thread int log_nr;
1440+
14101441
/* zero size iod will cause assertion failure
14111442
* in VOS, so let's check here.
14121443
* So the object is being destroyed between
@@ -1418,11 +1449,16 @@ __migrate_fetch_update_bulk(struct migrate_one *mrone, daos_handle_t oh,
14181449
* the rebuild and retry.
14191450
*/
14201451
rc = -DER_DATA_LOSS;
1421-
D_INFO(DF_RB ": " DF_UOID " %p dkey " DF_KEY " " DF_KEY
1422-
" nr %d/%d eph " DF_U64 " " DF_RC "\n",
1423-
DP_RB_MRO(mrone), DP_UOID(mrone->mo_oid), mrone,
1424-
DP_KEY(&mrone->mo_dkey), DP_KEY(&iods[i].iod_name), iod_num, i,
1425-
mrone->mo_epoch, DP_RC(rc));
1452+
DL_INFO(rc,
1453+
DF_RB ": cont " DF_UUID " obj " DF_UOID " dkey " DF_KEY " " DF_KEY
1454+
" nr %d/%d mo_epoch " DF_X64 " fetch_eph " DF_X64,
1455+
DP_RB_MRO(mrone), DP_UUID(mrone->mo_cont_uuid),
1456+
DP_UOID(mrone->mo_oid), DP_KEY(&mrone->mo_dkey),
1457+
DP_KEY(&iods[i].iod_name), iod_num, i, mrone->mo_epoch, fetch_eph);
1458+
if (log_nr <= 128) {
1459+
mrone_dump_info(mrone, oh, &mrone->mo_iods[i]);
1460+
log_nr++;
1461+
}
14261462
D_GOTO(end, rc);
14271463
}
14281464
}

0 commit comments

Comments
 (0)