Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/include/daos_srv/pool.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -80,6 +80,7 @@ struct ds_pool {
struct sched_request *sp_ec_ephs_req;

uint32_t sp_dtx_resync_version;
uint32_t sp_gl_dtx_resync_version; /* global DTX resync version */
/* Special pool/container handle uuid, which are
* created on the pool leader step up, and propagated
* to all servers by IV. Then they will be used by server
Expand Down
4 changes: 3 additions & 1 deletion src/object/obj_internal.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1181,6 +1181,8 @@ iov_alloc_for_csum_info(d_iov_t *iov, struct dcs_csum_info *csum_info);
/* obj_layout.c */
int
obj_pl_grp_idx(uint32_t layout_gl_ver, uint64_t hash, uint32_t grp_nr);
void
obj_dump_grp_layout(daos_handle_t oh, uint32_t shard);

int
obj_pl_place(struct pl_map *map, uint16_t layout_ver, struct daos_obj_md *md,
Expand Down
34 changes: 34 additions & 0 deletions src/object/obj_layout.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -95,3 +96,36 @@ obj_layout_diff(struct pl_map *map, daos_unit_oid_t oid, uint32_t new_ver, uint3

return rc;
}

void
obj_dump_grp_layout(daos_handle_t oh, uint32_t shard)
{
struct dc_object *obj;
struct dc_obj_shard *obj_shard;
uint32_t grp_idx, i, nr;

obj = obj_hdl2ptr(oh);
if (obj == NULL) {
D_INFO("invalid oh");
return;
}
if (shard >= obj->cob_shards_nr) {
D_ERROR("bad shard %d, cob_shards_nr %d", shard, obj->cob_shards_nr);
goto out;
}

grp_idx = shard / obj->cob_grp_size;
D_INFO(DF_OID " shard %d, grp_idx %d, grp_size %d", DP_OID(obj->cob_md.omd_id), shard,
grp_idx, obj->cob_grp_size);
for (i = grp_idx * obj->cob_grp_size, nr = 0; nr < obj->cob_grp_size; i++, nr++) {
obj_shard = &obj->cob_shards->do_shards[i];
D_INFO("shard %d/%d/%d, tgt_id %d, rank %d, tgt_idx %d, "
"rebuilding %d, reintegrating %d, fseq %d",
i, obj_shard->do_shard_idx, obj_shard->do_shard, obj_shard->do_target_id,
obj_shard->do_target_rank, obj_shard->do_target_idx,
obj_shard->do_rebuilding, obj_shard->do_reintegrating, obj_shard->do_fseq);
}

out:
obj_decref(obj);
}
73 changes: 64 additions & 9 deletions src/object/srv_ec_aggregate.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* (C) Copyright 2020-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1278,6 +1278,42 @@ agg_process_partial_stripe(struct ec_agg_entry *entry)
return rc;
}

static bool
agg_peer_failed(struct ec_agg_param *agg_param, struct daos_shard_loc *peer_loc)
{
struct pool_target *targets = NULL;
uint32_t failed_tgts_cnt = 0;
int i;
int rc;

rc = pool_map_find_failed_tgts(agg_param->ap_pool_info.api_pool->sp_map, &targets,
&failed_tgts_cnt);
if (rc) {
DL_ERROR(rc, DF_CONT " pool_map_find_failed_tgts failed.",
DP_CONT(agg_param->ap_pool_info.api_pool_uuid,
agg_param->ap_pool_info.api_cont_uuid));
return false;
}

if (targets == NULL || failed_tgts_cnt == 0)
return false;

for (i = 0; i < failed_tgts_cnt; i++) {
if (targets[i].ta_comp.co_rank == peer_loc->sd_rank &&
targets[i].ta_comp.co_index == peer_loc->sd_tgt_idx) {
D_DEBUG(DB_EPC, DF_CONT " peer parity tgt failed rank %d, tgt_idx %d.\n",
DP_CONT(agg_param->ap_pool_info.api_pool_uuid,
agg_param->ap_pool_info.api_cont_uuid),
peer_loc->sd_rank, peer_loc->sd_tgt_idx);
D_FREE(targets);
return true;
}
}

D_FREE(targets);
return false;
}

int
agg_peer_check_avail(struct ec_agg_param *agg_param, struct ec_agg_entry *entry)
{
Expand Down Expand Up @@ -1334,6 +1370,12 @@ agg_peer_check_avail(struct ec_agg_param *agg_param, struct ec_agg_entry *entry)
return rc;
}

static bool
agg_peer_retryable_err(int err)
{
return err == -DER_STALE || err == -DER_TIMEDOUT || daos_crt_network_error(err);
}

/* Sends the generated parity and the stripe number to the peer
* parity target. Handler writes the parity and deletes the replicas
* for the stripe.
Expand Down Expand Up @@ -1382,15 +1424,15 @@ agg_peer_update_ult(void *arg)
obj = obj_hdl2ptr(entry->ae_obj_hdl);
for (peer = 0; peer < p; peer++) {
uint64_t enqueue_id = 0;
bool overloaded;
bool peer_retry;

if (peer == pidx)
continue;
D_ASSERT(entry->ae_peer_pshards[peer].sd_rank != DAOS_TGT_IGNORE);
tgt_ep.ep_rank = entry->ae_peer_pshards[peer].sd_rank;
tgt_ep.ep_tag = entry->ae_peer_pshards[peer].sd_tgt_idx;
retry:
overloaded = false;
peer_retry = false;
rc = ds_obj_req_create(dss_get_module_info()->dmi_ctx, &tgt_ep,
DAOS_OBJ_RPC_EC_AGGREGATE, &rpc);
if (rc) {
Expand Down Expand Up @@ -1470,13 +1512,20 @@ agg_peer_update_ult(void *arg)
rc = ec_agg_out->ea_status;
if (rc == -DER_OVERLOAD_RETRY) {
enqueue_id = ec_agg_out->ea_comm_out.req_out_enqueue_id;
overloaded = true;
peer_retry = true;
}
D_CDEBUG(rc == 0, DB_TRACE, DLOG_ERR,
"update parity[%d] to %d:%d, status = " DF_RC "\n", peer,
tgt_ep.ep_rank, tgt_ep.ep_tag, DP_RC(rc));
peer_updated += rc == 0;
}
if (rc != 0 && peer_updated && agg_peer_retryable_err(rc) &&
!agg_peer_failed(agg_param, &entry->ae_peer_pshards[peer])) {
DL_INFO(rc, DF_UOID " pidx %d to parity[%d] will retry.",
DP_UOID(entry->ae_oid), pidx, peer);
peer_retry = true;
}

next:
if (bulk_hdl)
crt_bulk_free(bulk_hdl);
Expand All @@ -1487,7 +1536,7 @@ agg_peer_update_ult(void *arg)
rpc = NULL;
bulk_hdl = NULL;
iod_csums = NULL;
if (overloaded) {
if (peer_retry) {
dss_sleep(daos_rpc_rand_delay(max_delay) << 10);
goto retry;
}
Expand Down Expand Up @@ -1665,13 +1714,13 @@ agg_process_holes_ult(void *arg)
for (peer = 0; peer < p; peer++) {
uint64_t enqueue_id = 0;
uint32_t peer_shard;
bool overloaded;
bool peer_retry;

if (pidx == peer)
continue;

retry:
overloaded = false;
peer_retry = false;
D_ASSERT(entry->ae_peer_pshards[peer].sd_rank != DAOS_TGT_IGNORE);
tgt_ep.ep_rank = entry->ae_peer_pshards[peer].sd_rank;
tgt_ep.ep_tag = entry->ae_peer_pshards[peer].sd_tgt_idx;
Expand Down Expand Up @@ -1719,7 +1768,7 @@ agg_process_holes_ult(void *arg)
rc = ec_rep_out->er_status;
if (rc == -DER_OVERLOAD_RETRY) {
enqueue_id = ec_rep_out->er_comm_out.req_out_enqueue_id;
overloaded = true;
peer_retry = true;
}
D_CDEBUG(rc == 0, DB_TRACE, DLOG_ERR,
DF_UOID " parity[%d] er_status = " DF_RC "\n",
Expand All @@ -1728,7 +1777,13 @@ agg_process_holes_ult(void *arg)
}
crt_req_decref(rpc);
rpc = NULL;
if (overloaded) {
if (rc != 0 && peer_updated && agg_peer_retryable_err(rc) &&
!agg_peer_failed(agg_param, &entry->ae_peer_pshards[peer])) {
DL_INFO(rc, DF_UOID " pidx %d to parity[%d] will retry.",
DP_UOID(entry->ae_oid), pidx, peer);
peer_retry = true;
}
if (peer_retry) {
dss_sleep(daos_rpc_rand_delay(max_delay) << 10);
goto retry;
}
Expand Down
80 changes: 61 additions & 19 deletions src/object/srv_obj.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -701,6 +701,22 @@ obj_set_reply_sizes(crt_rpc_t *rpc, daos_iod_t *iods, int iod_nr, uint8_t *skips
sizes[i] = iods[idx].iod_size;
D_DEBUG(DB_IO, DF_UOID" %d:"DF_U64"\n", DP_UOID(orw->orw_oid),
i, iods[idx].iod_size);
if ((orw->orw_flags & ORF_FOR_MIGRATION) && sizes[i] == 0) {
D_DEBUG(DB_REBUILD,
DF_CONT " obj " DF_UOID "rebuild fetch zero iod_size, "
"i:%d/idx:%d, iod_nr %d, orw_epoch " DF_X64
", orw_epoch_first " DF_X64 " may cause DER_DATA_LOSS",
DP_CONT(orw->orw_pool_uuid, orw->orw_co_uuid),
DP_UOID(orw->orw_oid), i, idx, iods[idx].iod_nr, orw->orw_epoch,
orw->orw_epoch_first);
if (iods[idx].iod_type == DAOS_IOD_ARRAY) {
int j;

for (j = 0; j < min(8, iods[idx].iod_nr); j++)
D_DEBUG(DB_REBUILD, "recx[%d] - " DF_RECX, j,
DP_RECX(iods[idx].iod_recxs[j]));
}
}
idx++;
}

Expand Down Expand Up @@ -1368,7 +1384,7 @@ struct ec_agg_boundary_arg {
};

static int
obj_fetch_ec_agg_boundary(void *data)
obj_fetch_ec_agg_boundary_ult(void *data)
{
struct ec_agg_boundary_arg *arg = data;
int rc;
Expand All @@ -1381,6 +1397,33 @@ obj_fetch_ec_agg_boundary(void *data)
return rc;
}

static int
obj_fetch_ec_agg_boundary(struct obj_io_context *ioc, daos_unit_oid_t *uoid)
{
struct ec_agg_boundary_arg arg;
int rc;

arg.eab_pool = ioc->ioc_coc->sc_pool->spc_pool;
uuid_copy(arg.eab_co_uuid, ioc->ioc_coc->sc_uuid);
rc = dss_ult_execute(obj_fetch_ec_agg_boundary_ult, &arg, NULL, NULL, DSS_XS_SYS, 0, 0);
if (rc) {
DL_ERROR(rc, DF_CONT ", " DF_UOID " fetch ec_agg_boundary failed.",
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid),
DP_UOID(*uoid));
return rc;
}
if (ioc->ioc_coc->sc_ec_agg_eph_valid == 0) {
rc = -DER_FETCH_AGAIN;
DL_INFO(rc, DF_CONT ", " DF_UOID " zero ec_agg_boundary.",
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(*uoid));
return rc;
}
D_DEBUG(DB_IO, DF_CONT ", " DF_UOID " fetched ec_agg_eph_boundary " DF_X64 "\n",
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid), DP_UOID(*uoid),
ioc->ioc_coc->sc_ec_agg_eph_boundary);
return 0;
}

static int
obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *iods,
struct dcs_iod_csums *iod_csums, uint64_t *offs, uint8_t *skips,
Expand Down Expand Up @@ -1503,29 +1546,14 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io
}
if ((ec_deg_fetch || (ec_recov && get_parity_list)) &&
ioc->ioc_coc->sc_ec_agg_eph_valid == 0) {
struct ec_agg_boundary_arg arg;

arg.eab_pool = ioc->ioc_coc->sc_pool->spc_pool;
uuid_copy(arg.eab_co_uuid, ioc->ioc_coc->sc_uuid);
rc = dss_ult_execute(obj_fetch_ec_agg_boundary, &arg, NULL, NULL,
DSS_XS_SYS, 0, 0);
rc = obj_fetch_ec_agg_boundary(ioc, &orw->orw_oid);
if (rc) {
DL_ERROR(rc, DF_CONT ", " DF_UOID " fetch ec_agg_boundary failed.",
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid),
DP_UOID(orw->orw_oid));
goto out;
}
if (ioc->ioc_coc->sc_ec_agg_eph_valid == 0) {
rc = -DER_FETCH_AGAIN;
DL_INFO(rc, DF_CONT ", " DF_UOID " zero ec_agg_boundary.",
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid),
DP_UOID(orw->orw_oid));
goto out;
}
D_DEBUG(DB_IO,
DF_CONT ", " DF_UOID " fetched ec_agg_eph_boundary " DF_X64 "\n",
DP_CONT(ioc->ioc_coc->sc_pool_uuid, ioc->ioc_coc->sc_uuid),
DP_UOID(orw->orw_oid), ioc->ioc_coc->sc_ec_agg_eph_boundary);
D_ASSERT(ioc->ioc_coc->sc_ec_agg_eph_valid);
}
if (get_parity_list) {
D_ASSERT(!ec_deg_fetch);
Expand Down Expand Up @@ -3030,6 +3058,20 @@ ds_obj_rw_handler(crt_rpc_t *rpc)
if (orw->orw_flags & ORF_FETCH_EPOCH_EC_AGG_BOUNDARY) {
uint64_t rebuild_epoch;

if (ioc.ioc_coc->sc_ec_agg_eph_valid == 0) {
rc = obj_fetch_ec_agg_boundary(&ioc, &orw->orw_oid);
if (rc) {
DL_ERROR(rc,
DF_CONT ", " DF_UOID " fetch ec_agg_boundary "
"failed.",
DP_CONT(ioc.ioc_coc->sc_pool_uuid,
ioc.ioc_coc->sc_uuid),
DP_UOID(orw->orw_oid));
goto out;
}
D_ASSERT(ioc.ioc_coc->sc_ec_agg_eph_valid);
}

D_ASSERTF(orw->orw_epoch <= orw->orw_epoch_first,
"bad orw_epoch " DF_X64 ", orw_epoch_first " DF_X64 "\n",
orw->orw_epoch, orw->orw_epoch_first);
Expand Down
Loading
Loading