Skip to content

Commit 4a8bc72

Browse files
committed
DAOS-17843 rebuild: add some logs for rebuild enumerate
1. Add some DEBUG logs for rebuild enumerate. 2. Add a WARN log if some engines did not report EC aggregation epoch progress within 600 seconds. 3. Fix a typo in the sub_anchors->sa_nr comparison. Signed-off-by: Xuezhao Liu <[email protected]>
1 parent f5686ae commit 4a8bc72

File tree

6 files changed

+96
-16
lines changed

6 files changed

+96
-16
lines changed

src/container/srv_container.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1734,6 +1734,7 @@ cont_track_eph_leader_alloc(struct cont_svc *cont_svc, uuid_t cont_uuid,
17341734
eph_ldr->cte_server_ephs[i].re_rank = doms[i].do_comp.co_rank;
17351735
eph_ldr->cte_server_ephs[i].re_ec_agg_eph = 0;
17361736
eph_ldr->cte_server_ephs[i].re_stable_eph = 0;
1737+
eph_ldr->cte_server_ephs[i].re_ec_agg_eph_update_ts = daos_gettime_coarse();
17371738
}
17381739
d_list_add(&eph_ldr->cte_list, &cont_svc->cs_cont_ephs_leader_list);
17391740
*leader_p = eph_ldr;
@@ -1790,8 +1791,11 @@ ds_cont_leader_update_track_eph(uuid_t pool_uuid, uuid_t cont_uuid, d_rank_t ran
17901791

17911792
for (i = 0; i < eph_ldr->cte_servers_num; i++) {
17921793
if (eph_ldr->cte_server_ephs[i].re_rank == rank) {
1793-
if (eph_ldr->cte_server_ephs[i].re_ec_agg_eph < ec_agg_eph)
1794+
if (eph_ldr->cte_server_ephs[i].re_ec_agg_eph < ec_agg_eph) {
17941795
eph_ldr->cte_server_ephs[i].re_ec_agg_eph = ec_agg_eph;
1796+
eph_ldr->cte_server_ephs[i].re_ec_agg_eph_update_ts =
1797+
daos_gettime_coarse();
1798+
}
17951799
if (eph_ldr->cte_server_ephs[i].re_stable_eph < stable_eph)
17961800
eph_ldr->cte_server_ephs[i].re_stable_eph = stable_eph;
17971801
break;
@@ -2056,6 +2060,7 @@ cont_agg_eph_sync(struct ds_pool *pool, struct cont_svc *svc)
20562060
uint64_t cur_eph, new_eph;
20572061
daos_epoch_t min_ec_agg_eph;
20582062
daos_epoch_t min_stable_eph;
2063+
uint64_t cur_ts;
20592064
int i;
20602065
int rc = 0;
20612066

@@ -2090,6 +2095,7 @@ cont_agg_eph_sync(struct ds_pool *pool, struct cont_svc *svc)
20902095

20912096
min_ec_agg_eph = DAOS_EPOCH_MAX;
20922097
min_stable_eph = DAOS_EPOCH_MAX;
2098+
cur_ts = daos_gettime_coarse();
20932099
for (i = 0; i < eph_ldr->cte_servers_num; i++) {
20942100
d_rank_t rank = eph_ldr->cte_server_ephs[i].re_rank;
20952101

@@ -2099,6 +2105,14 @@ cont_agg_eph_sync(struct ds_pool *pool, struct cont_svc *svc)
20992105
continue;
21002106
}
21012107

2108+
if (pool->sp_reclaim != DAOS_RECLAIM_DISABLED &&
2109+
cur_ts > eph_ldr->cte_server_ephs[i].re_ec_agg_eph_update_ts + 600)
2110+
D_WARN(DF_CONT ": Sluggish EC boundary report from rank %d, " DF_U64
2111+
" Seconds.",
2112+
DP_CONT(svc->cs_pool_uuid, eph_ldr->cte_cont_uuid), rank,
2113+
cur_ts -
2114+
eph_ldr->cte_server_ephs[i].re_ec_agg_eph_update_ts);
2115+
21022116
if (eph_ldr->cte_server_ephs[i].re_ec_agg_eph < min_ec_agg_eph)
21032117
min_ec_agg_eph = eph_ldr->cte_server_ephs[i].re_ec_agg_eph;
21042118
if (eph_ldr->cte_server_ephs[i].re_stable_eph < min_stable_eph)

src/container/srv_internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ struct rank_eph {
6262
d_rank_t re_rank;
6363
daos_epoch_t re_ec_agg_eph;
6464
daos_epoch_t re_stable_eph;
65+
uint64_t re_ec_agg_eph_update_ts; /* re_ec_agg_eph update timestamp */
6566
};
6667

6768
/* container EC aggregation epoch and stable epoch control descriptor, which is only on leader */

src/object/cli_obj.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4138,8 +4138,10 @@ anchor_update_check_eof(struct obj_auxi_args *obj_auxi, daos_anchor_t *anchor)
41384138
obj_auxi_shards_iterate(obj_auxi, update_sub_anchor_cb, NULL);
41394139

41404140
sub_anchors = (struct shard_anchors *)anchor->da_sub_anchors;
4141-
if (!d_list_empty(&sub_anchors->sa_merged_list))
4141+
if (!d_list_empty(&sub_anchors->sa_merged_list)) {
4142+
D_ASSERT(obj_auxi->opc != DAOS_OBJ_RPC_ENUMERATE);
41424143
return;
4144+
}
41434145

41444146
if (sub_anchors_is_eof(sub_anchors)) {
41454147
daos_obj_list_t *obj_args;
@@ -4148,6 +4150,18 @@ anchor_update_check_eof(struct obj_auxi_args *obj_auxi, daos_anchor_t *anchor)
41484150

41494151
obj_args = dc_task_get_args(obj_auxi->obj_task);
41504152
sub_anchors_free(obj_args, obj_auxi->opc);
4153+
} else if (obj_auxi->opc == DAOS_OBJ_RPC_ENUMERATE) {
4154+
for (int i = 0; i < sub_anchors->sa_anchors_nr; i++) {
4155+
daos_anchor_t *sub_anchor;
4156+
4157+
sub_anchor = &sub_anchors->sa_anchors[i].ssa_anchor;
4158+
if (!daos_anchor_is_eof(sub_anchor)) {
4159+
D_DEBUG(DB_REBUILD, "shard %d sub_anchor %d/%d non EOF",
4160+
sub_anchors->sa_anchors[i].ssa_shard, i,
4161+
sub_anchors->sa_anchors_nr);
4162+
break;
4163+
}
4164+
}
41514165
}
41524166
}
41534167

src/object/obj_enum.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*
22
* (C) Copyright 2018-2022 Intel Corporation.
3+
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
34
*
45
* SPDX-License-Identifier: BSD-2-Clause-Patent
56
*/
@@ -689,9 +690,8 @@ obj_enum_iterate(daos_key_desc_t *kdss, d_sg_list_t *sgl, int nr,
689690
ptr = sgl_indexed_byte(sgl, &sgl_idx);
690691
D_ASSERTF(ptr != NULL, "kds and sgl don't line up");
691692

692-
D_DEBUG(DB_REBUILD, "process %d, type %d, ptr %p, len "DF_U64
693-
", total %zd\n", i, kds->kd_val_type, ptr,
694-
kds->kd_key_len, sgl->sg_iovs[0].iov_len);
693+
D_DEBUG(DB_REBUILD, "process %d/%d, type %d, ptr %p, len " DF_U64 ", total %zd\n",
694+
i, nr, kds->kd_val_type, ptr, kds->kd_key_len, sgl->sg_iovs[0].iov_len);
695695
if (kds->kd_val_type == 0 ||
696696
(kds->kd_val_type != type && type != -1)) {
697697
sgl_move_forward(sgl, &sgl_idx, kds->kd_key_len);

src/object/srv_obj.c

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3246,6 +3246,27 @@ obj_enum_complete(crt_rpc_t *rpc, int status, int map_version,
32463246
D_FREE(oeo->oeo_csum_iov.iov_buf);
32473247
}
32483248

3249+
static void
3250+
dump_enum_anchor(daos_unit_oid_t uoid, daos_anchor_t *anchor, char *str)
3251+
{
3252+
int nr = DAOS_ANCHOR_BUF_MAX / 8;
3253+
int i;
3254+
uint64_t data[nr];
3255+
3256+
D_DEBUG(DB_REBUILD, DF_UOID "%s anchor -", DP_UOID(uoid), str);
3257+
D_DEBUG(DB_REBUILD, "type %d, shard %d, flags 0x%x\n", anchor->da_type, anchor->da_shard,
3258+
anchor->da_flags);
3259+
for (i = 0; i < nr; i++)
3260+
data[i] = *(uint64_t *)((char *)anchor->da_buf + i * 8);
3261+
if (nr >= 13)
3262+
D_DEBUG(DB_REBUILD,
3263+
"da_buf " DF_X64 "," DF_X64 "," DF_X64 "," DF_X64 "," DF_X64 "," DF_X64
3264+
"," DF_X64 "," DF_X64 "," DF_X64 "," DF_X64 "," DF_X64 "," DF_X64
3265+
"," DF_X64,
3266+
data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
3267+
data[8], data[9], data[10], data[11], data[12]);
3268+
}
3269+
32493270
static int
32503271
obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc,
32513272
struct vos_iter_anchors *anchors, struct ds_obj_enum_arg *enum_arg,
@@ -3314,6 +3335,8 @@ obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc,
33143335
D_ASSERT(opc == DAOS_OBJ_RPC_ENUMERATE);
33153336
type = VOS_ITER_DKEY;
33163337
param.ip_flags |= VOS_IT_RECX_VISIBLE;
3338+
dump_enum_anchor(oei->oei_oid, &anchors->ia_dkey, "dkey");
3339+
dump_enum_anchor(oei->oei_oid, &anchors->ia_akey, "akey");
33173340
if (daos_anchor_get_flags(&anchors->ia_dkey) &
33183341
DIOF_WITH_SPEC_EPOCH) {
33193342
/* For obj verification case. */
@@ -3331,7 +3354,12 @@ obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc,
33313354
enum_arg->chk_key2big = 1;
33323355
enum_arg->need_punch = 1;
33333356
enum_arg->copy_data_cb = vos_iter_copy;
3334-
fill_oid(oei->oei_oid, enum_arg);
3357+
rc = fill_oid(oei->oei_oid, enum_arg);
3358+
if (rc != 0) {
3359+
rc = -DER_KEY2BIG;
3360+
DL_ERROR(rc, DF_UOID "fill oid failed", DP_UOID(oei->oei_oid));
3361+
goto failed;
3362+
}
33353363
}
33363364

33373365
/*

src/object/srv_obj_migrate.c

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2583,9 +2583,18 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data)
25832583
if (rc == 1 &&
25842584
(is_ec_data_shard_by_tgt_off(unpack_tgt_off, &arg->oc_attr) ||
25852585
(io->ui_oid.id_layout_ver > 0 && io->ui_oid.id_shard != parity_shard))) {
2586-
D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID " ignore shard " DF_KEY "/%u/%d/%u/%d.\n",
2587-
DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey), shard,
2588-
(int)obj_ec_shard_off(obj, io->ui_dkey_hash, 0), parity_shard, rc);
2586+
if (daos_is_dkey_uint64(io->ui_oid.id_pub) && io->ui_dkey.iov_len == 8)
2587+
D_DEBUG(DB_REBUILD,
2588+
DF_RB ": " DF_UOID " ignore shard, int dkey " DF_U64
2589+
"/%u/%d/%u/%d.\n",
2590+
DP_RB_MPT(tls), DP_UOID(io->ui_oid),
2591+
*(uint64_t *)io->ui_dkey.iov_buf, shard,
2592+
(int)obj_ec_shard_off(obj, io->ui_dkey_hash, 0), parity_shard, rc);
2593+
else
2594+
D_DEBUG(DB_REBUILD,
2595+
DF_RB ": " DF_UOID " ignore shard " DF_KEY "/%u/%d/%u/%d.\n",
2596+
DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey), shard,
2597+
(int)obj_ec_shard_off(obj, io->ui_dkey_hash, 0), parity_shard, rc);
25892598
D_GOTO(put, rc = 0);
25902599
}
25912600
rc = 0;
@@ -2601,11 +2610,19 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data)
26012610
continue;
26022611
}
26032612

2604-
D_DEBUG(DB_REBUILD,
2605-
DF_RB ": " DF_UOID " unpack " DF_KEY " for shard "
2606-
"%u/%u/%u/" DF_X64 "/%u\n",
2607-
DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey), shard,
2608-
unpack_tgt_off, migrate_tgt_off, io->ui_dkey_hash, parity_shard);
2613+
if (daos_is_dkey_uint64(io->ui_oid.id_pub) && io->ui_dkey.iov_len == 8)
2614+
D_DEBUG(DB_REBUILD,
2615+
DF_RB ": " DF_UOID " unpack int dkey " DF_U64 " for shard "
2616+
"%u/%u/%u/" DF_X64 "/%u\n",
2617+
DP_RB_MPT(tls), DP_UOID(io->ui_oid),
2618+
*(uint64_t *)io->ui_dkey.iov_buf, shard, unpack_tgt_off,
2619+
migrate_tgt_off, io->ui_dkey_hash, parity_shard);
2620+
else
2621+
D_DEBUG(DB_REBUILD,
2622+
DF_RB ": " DF_UOID " unpack " DF_KEY " for shard "
2623+
"%u/%u/%u/" DF_X64 "/%u\n",
2624+
DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey), shard,
2625+
unpack_tgt_off, migrate_tgt_off, io->ui_dkey_hash, parity_shard);
26092626

26102627
/**
26112628
* Since we do not need split the rebuild into parity rebuild
@@ -2643,8 +2660,14 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data)
26432660
if (!create_migrate_one) {
26442661
struct ds_cont_child *cont = NULL;
26452662

2646-
D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID "/" DF_KEY " does not need rebuild.\n",
2647-
DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey));
2663+
if (daos_is_dkey_uint64(io->ui_oid.id_pub) && io->ui_dkey.iov_len == 8)
2664+
D_DEBUG(DB_REBUILD,
2665+
DF_RB ": " DF_UOID "/int dkey: " DF_U64 " does not need rebuild.",
2666+
DP_RB_MPT(tls), DP_UOID(io->ui_oid),
2667+
*(uint64_t *)io->ui_dkey.iov_buf);
2668+
else
2669+
D_DEBUG(DB_REBUILD, DF_RB ": " DF_UOID "/" DF_KEY " does not need rebuild.",
2670+
DP_RB_MPT(tls), DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey));
26482671

26492672
/* Create the vos container when no record need to be rebuilt for this shard,
26502673
* for the case of reintegrate the container was discarded ahead.

0 commit comments

Comments
 (0)