From 20b5e0ea6cc0a6f3bfd374d3d0ec9f41ad3f76f1 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Thu, 9 Oct 2025 14:34:52 -0500 Subject: [PATCH] btl/ofi: fault tolerance Signed-off-by: Matthew Whitlock --- opal/mca/btl/ofi/btl_ofi.h | 2 + opal/mca/btl/ofi/btl_ofi_context.c | 126 +++++++++++++++++------------ opal/mca/btl/ofi/btl_ofi_frag.c | 6 +- opal/mca/btl/ofi/btl_ofi_frag.h | 3 +- opal/mca/btl/ofi/btl_ofi_module.c | 9 +++ 5 files changed, 91 insertions(+), 55 deletions(-) diff --git a/opal/mca/btl/ofi/btl_ofi.h b/opal/mca/btl/ofi/btl_ofi.h index e12d490b390..20345d02c6b 100644 --- a/opal/mca/btl/ofi/btl_ofi.h +++ b/opal/mca/btl/ofi/btl_ofi.h @@ -139,6 +139,8 @@ struct mca_btl_ofi_module_t { /** registration cache */ mca_rcache_base_module_t *rcache; + + mca_btl_base_module_error_cb_fn_t ofi_error_cb; }; typedef struct mca_btl_ofi_module_t mca_btl_ofi_module_t; diff --git a/opal/mca/btl/ofi/btl_ofi_context.c b/opal/mca/btl/ofi/btl_ofi_context.c index 2b9a5fb6905..4a774145744 100644 --- a/opal/mca/btl/ofi/btl_ofi_context.c +++ b/opal/mca/btl/ofi/btl_ofi_context.c @@ -310,6 +310,56 @@ mca_btl_ofi_context_t *get_ofi_context_rr(mca_btl_ofi_module_t *btl) return &btl->contexts[rr_num++ % btl->num_contexts]; } +static void inline complete_op_context(mca_btl_ofi_context_t* context, + void *op_context, int rc) +{ + mca_btl_ofi_completion_context_t *c_ctx = + (mca_btl_ofi_completion_context_t*) op_context; + /* We are casting to every type here just for simplicity. */ + mca_btl_ofi_base_completion_t *comp = + (mca_btl_ofi_base_completion_t *) c_ctx->comp; + mca_btl_ofi_frag_completion_t *frag_comp = + (mca_btl_ofi_frag_completion_t *) c_ctx->comp; + mca_btl_ofi_rdma_completion_t *rdma_comp + = (mca_btl_ofi_rdma_completion_t *) c_ctx->comp; + + switch (comp->type) { + case MCA_BTL_OFI_TYPE_GET: + case MCA_BTL_OFI_TYPE_PUT: + case MCA_BTL_OFI_TYPE_AOP: + case MCA_BTL_OFI_TYPE_AFOP: + case MCA_BTL_OFI_TYPE_CSWAP: + /* call the callback */ + if (rdma_comp->cbfunc) { + rdma_comp->cbfunc(comp->btl, comp->endpoint, rdma_comp->local_address, + rdma_comp->local_handle, rdma_comp->cbcontext, + rdma_comp->cbdata, rc); + } + + MCA_BTL_OFI_NUM_RDMA_DEC((mca_btl_ofi_module_t *) comp->btl); + break; + + case MCA_BTL_OFI_TYPE_RECV: + mca_btl_ofi_recv_frag((mca_btl_ofi_module_t *) comp->btl, + (mca_btl_ofi_endpoint_t *) comp->endpoint, context, + frag_comp->frag, rc); + break; + + case MCA_BTL_OFI_TYPE_SEND: + MCA_BTL_OFI_NUM_SEND_DEC((mca_btl_ofi_module_t *) comp->btl); + mca_btl_ofi_frag_complete(frag_comp->frag, rc); + break; + + default: + /* catasthrophic */ + BTL_ERROR(("unknown completion type")); + MCA_BTL_OFI_ABORT(); + } + + /* return the completion handler */ + opal_free_list_return(comp->my_list, (opal_free_list_item_t *) comp); +} + int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) { @@ -319,11 +369,6 @@ int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) struct fi_cq_entry cq_entry[MCA_BTL_OFI_DEFAULT_MAX_CQE]; struct fi_cq_err_entry cqerr = {0}; - mca_btl_ofi_completion_context_t *c_ctx; - mca_btl_ofi_base_completion_t *comp; - mca_btl_ofi_rdma_completion_t *rdma_comp; - mca_btl_ofi_frag_completion_t *frag_comp; - ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read); if (0 < ret) { @@ -331,49 +376,7 @@ int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) for (int i = 0; i < events_read; i++) { if (NULL != cq_entry[i].op_context) { ++events; - - c_ctx = (mca_btl_ofi_completion_context_t *) cq_entry[i].op_context; - - /* We are casting to every type here just for simplicity. */ - comp = (mca_btl_ofi_base_completion_t *) c_ctx->comp; - frag_comp = (mca_btl_ofi_frag_completion_t *) c_ctx->comp; - rdma_comp = (mca_btl_ofi_rdma_completion_t *) c_ctx->comp; - - switch (comp->type) { - case MCA_BTL_OFI_TYPE_GET: - case MCA_BTL_OFI_TYPE_PUT: - case MCA_BTL_OFI_TYPE_AOP: - case MCA_BTL_OFI_TYPE_AFOP: - case MCA_BTL_OFI_TYPE_CSWAP: - /* call the callback */ - if (rdma_comp->cbfunc) { - rdma_comp->cbfunc(comp->btl, comp->endpoint, rdma_comp->local_address, - rdma_comp->local_handle, rdma_comp->cbcontext, - rdma_comp->cbdata, OPAL_SUCCESS); - } - - MCA_BTL_OFI_NUM_RDMA_DEC((mca_btl_ofi_module_t *) comp->btl); - break; - - case MCA_BTL_OFI_TYPE_RECV: - mca_btl_ofi_recv_frag((mca_btl_ofi_module_t *) comp->btl, - (mca_btl_ofi_endpoint_t *) comp->endpoint, context, - frag_comp->frag); - break; - - case MCA_BTL_OFI_TYPE_SEND: - MCA_BTL_OFI_NUM_SEND_DEC((mca_btl_ofi_module_t *) comp->btl); - mca_btl_ofi_frag_complete(frag_comp->frag, OPAL_SUCCESS); - break; - - default: - /* catasthrophic */ - BTL_ERROR(("unknown completion type")); - MCA_BTL_OFI_ABORT(); - } - - /* return the completion handler */ - opal_free_list_return(comp->my_list, (opal_free_list_item_t *) comp); + complete_op_context(context, cq_entry[i].op_context, OPAL_SUCCESS); } } } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) { @@ -383,10 +386,31 @@ int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) if (0 > ret) { BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)", __FILE__, __LINE__, fi_strerror(-ret), ret)); - } else { - BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n", cqerr.prov_errno)); + MCA_BTL_OFI_ABORT(); + } else if(NULL != cqerr.op_context){ + switch(cqerr.err) { + case -FI_EIO: { + mca_btl_ofi_completion_context_t *c_ctx = + (mca_btl_ofi_completion_context_t*) cqerr.op_context; + mca_btl_ofi_base_completion_t *comp = + (mca_btl_ofi_base_completion_t*) c_ctx->comp; + mca_btl_ofi_module_t *ofi_btl = + (mca_btl_ofi_module_t*) comp->btl; + if(ofi_btl->ofi_error_cb){ + ofi_btl->ofi_error_cb(comp->btl, 0, comp->endpoint->ep_proc, + "IO error reported by libfabric"); + } + + ++events; + complete_op_context(context, cqerr.op_context, OPAL_ERR_UNREACH); + break; + } + default: + BTL_ERROR(("fi_cq_readerr: %s(%d) (provider err_code = %d)\n", + fi_strerror(-cqerr.err), cqerr.err, cqerr.prov_errno)); + MCA_BTL_OFI_ABORT(); + } } - MCA_BTL_OFI_ABORT(); } #ifdef FI_EINTR /* sometimes, sockets provider complain about interrupt. We do nothing. */ diff --git a/opal/mca/btl/ofi/btl_ofi_frag.c b/opal/mca/btl/ofi/btl_ofi_frag.c index 25433c0b6a3..e325dd34ccf 100644 --- a/opal/mca/btl/ofi/btl_ofi_frag.c +++ b/opal/mca/btl/ofi/btl_ofi_frag.c @@ -145,9 +145,9 @@ int mca_btl_ofi_send(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi } int mca_btl_ofi_recv_frag(mca_btl_ofi_module_t *ofi_btl, mca_btl_base_endpoint_t *endpoint, - mca_btl_ofi_context_t *context, mca_btl_ofi_base_frag_t *frag) + mca_btl_ofi_context_t *context, mca_btl_ofi_base_frag_t *frag, + int rc) { - int rc; mca_btl_active_message_callback_t *reg = mca_btl_base_active_message_trigger + frag->hdr.tag; mca_btl_base_segment_t segment = {.seg_addr.pval = (void *) (frag + 1), .seg_len = frag->hdr.len}; @@ -160,7 +160,7 @@ int mca_btl_ofi_recv_frag(mca_btl_ofi_module_t *ofi_btl, mca_btl_base_endpoint_t /* call the callback */ reg->cbfunc(&ofi_btl->super, &recv_desc); - mca_btl_ofi_frag_complete(frag, OPAL_SUCCESS); + mca_btl_ofi_frag_complete(frag, rc); /* repost the recv */ rc = mca_btl_ofi_post_recvs((mca_btl_base_module_t *) ofi_btl, context, 1); diff --git a/opal/mca/btl/ofi/btl_ofi_frag.h b/opal/mca/btl/ofi/btl_ofi_frag.h index 3afa8866265..786fafa3bbe 100644 --- a/opal/mca/btl/ofi/btl_ofi_frag.h +++ b/opal/mca/btl/ofi/btl_ofi_frag.h @@ -38,7 +38,8 @@ int mca_btl_ofi_send(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi mca_btl_base_descriptor_t *descriptor, mca_btl_base_tag_t tag); int mca_btl_ofi_recv_frag(mca_btl_ofi_module_t *ofi_btl, mca_btl_base_endpoint_t *endpoint, - mca_btl_ofi_context_t *context, mca_btl_ofi_base_frag_t *frag); + mca_btl_ofi_context_t *context, mca_btl_ofi_base_frag_t *frag, + int rc); struct mca_btl_base_descriptor_t *mca_btl_ofi_prepare_src(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, diff --git a/opal/mca/btl/ofi/btl_ofi_module.c b/opal/mca/btl/ofi/btl_ofi_module.c index 19a9064540c..503ba9d4bca 100644 --- a/opal/mca/btl/ofi/btl_ofi_module.c +++ b/opal/mca/btl/ofi/btl_ofi_module.c @@ -143,6 +143,14 @@ static int mca_btl_ofi_del_procs(mca_btl_base_module_t *btl, size_t nprocs, opal return OPAL_SUCCESS; } +static int mca_btl_ofi_register_error(mca_btl_base_module_t *btl, + mca_btl_base_module_error_cb_fn_t cb) +{ + mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl; + ofi_btl->ofi_error_cb = cb; + return OPAL_SUCCESS; +} + void mca_btl_ofi_rcache_init(mca_btl_ofi_module_t *module) { if (!module->initialized) { @@ -515,4 +523,5 @@ mca_btl_ofi_module_t mca_btl_ofi_module_template = { .btl_add_procs = mca_btl_ofi_add_procs, .btl_del_procs = mca_btl_ofi_del_procs, .btl_finalize = mca_btl_ofi_finalize, + .btl_register_error = mca_btl_ofi_register_error, }};