Skip to content

Commit 15c3ed9

Browse files
UCP/EP: UCP_ERR_HANDLING_MODE_FAILOVER fallback implementation (#11067)
* UCP/EP: UCP_EP_INIT_ERR_MODE_FAILOVER fallback impl - fallback to UCP_EP_INIT_ERR_MODE_PEER_FAILURE - refactoring of err_mode getters and comparatort * UCP/EP: fix wire compatibility of ERR_MODE_FAILOVER
1 parent 170437a commit 15c3ed9

File tree

10 files changed

+104
-67
lines changed

10 files changed

+104
-67
lines changed

src/ucp/core/ucp_ep.c

Lines changed: 43 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ static int ucp_ep_shall_use_indirect_id(ucp_context_h context,
267267
return !(ep_init_flags & UCP_EP_INIT_FLAG_INTERNAL) &&
268268
((context->config.ext.proto_indirect_id == UCS_CONFIG_ON) ||
269269
((context->config.ext.proto_indirect_id == UCS_CONFIG_AUTO) &&
270-
(ep_init_flags & UCP_EP_INIT_ERR_MODE_PEER_FAILURE)));
270+
(ep_init_flags & UCP_EP_INIT_ERR_MODE_FAILOVER_MASK)));
271271
}
272272

273273
void ucp_ep_peer_mem_destroy(ucp_context_h context,
@@ -573,8 +573,13 @@ void ucp_ep_release_id(ucp_ep_h ep)
573573
void ucp_ep_config_key_set_err_mode(ucp_ep_config_key_t *key,
574574
unsigned ep_init_flags)
575575
{
576-
key->err_mode = (ep_init_flags & UCP_EP_INIT_ERR_MODE_PEER_FAILURE) ?
577-
UCP_ERR_HANDLING_MODE_PEER : UCP_ERR_HANDLING_MODE_NONE;
576+
if (ep_init_flags & UCP_EP_INIT_ERR_MODE_FAILOVER) {
577+
key->err_mode = UCP_ERR_HANDLING_MODE_FAILOVER;
578+
} else if (ep_init_flags & UCP_EP_INIT_ERR_MODE_PEER_FAILURE) {
579+
key->err_mode = UCP_ERR_HANDLING_MODE_PEER;
580+
} else {
581+
key->err_mode = UCP_ERR_HANDLING_MODE_NONE;
582+
}
578583
}
579584

580585
void ucp_ep_config_key_init_flags(ucp_ep_config_key_t *key,
@@ -790,7 +795,7 @@ static ucs_status_t ucp_ep_init_create_wireup(ucp_ep_h ep,
790795
if (ucp_ep_init_flags_has_cm(ep_init_flags)) {
791796
key.cm_lane = 0;
792797
/* Send keepalive on wireup_ep (which will send on aux_ep) */
793-
if (ep_init_flags & UCP_EP_INIT_ERR_MODE_PEER_FAILURE) {
798+
if (ep_init_flags & UCP_EP_INIT_ERR_MODE_FAILOVER_MASK) {
794799
key.keepalive_lane = 0;
795800
}
796801
} else {
@@ -931,8 +936,7 @@ ucp_sa_data_v1_unpack(const ucp_wireup_sockaddr_data_base_t *sa_data,
931936
return UCS_ERR_UNSUPPORTED;
932937
}
933938

934-
*ep_init_flags_p = (sa_data->header == UCP_ERR_HANDLING_MODE_PEER) ?
935-
UCP_EP_INIT_ERR_MODE_PEER_FAILURE : 0;
939+
*ep_init_flags_p = ucp_ep_err_mode_init_flags(sa_data->header);
936940
*worker_addr_p = sa_data_v1 + 1;
937941
return UCS_OK;
938942
}
@@ -942,8 +946,15 @@ ucp_sa_data_v2_unpack(const ucp_wireup_sockaddr_data_base_t *sa_data,
942946
unsigned *ep_init_flags_p,
943947
const void** worker_addr_p)
944948
{
945-
*ep_init_flags_p = (sa_data->header & UCP_SA_DATA_FLAG_ERR_MODE_PEER) ?
946-
UCP_EP_INIT_ERR_MODE_PEER_FAILURE : 0;
949+
if (sa_data->header & UCP_SA_DATA_FLAG_ERR_MODE_FAILOVER) {
950+
ucs_assert(sa_data->header & UCP_SA_DATA_FLAG_ERR_MODE_PEER);
951+
*ep_init_flags_p = UCP_EP_INIT_ERR_MODE_FAILOVER_MASK;
952+
} else if (sa_data->header & UCP_SA_DATA_FLAG_ERR_MODE_PEER) {
953+
*ep_init_flags_p = UCP_EP_INIT_ERR_MODE_PEER_FAILURE;
954+
} else {
955+
*ep_init_flags_p = 0;
956+
}
957+
947958
*worker_addr_p = sa_data + 1;
948959
return UCS_OK;
949960
}
@@ -1142,7 +1153,7 @@ ucp_ep_create_api_to_worker_addr(ucp_worker_h worker,
11421153
out_resolve_remote_id:
11431154
if ((context->config.ext.resolve_remote_ep_id == UCS_CONFIG_ON) ||
11441155
((context->config.ext.resolve_remote_ep_id == UCS_CONFIG_AUTO) &&
1145-
(ep_init_flags & UCP_EP_INIT_ERR_MODE_PEER_FAILURE) &&
1156+
(ep_init_flags & UCP_EP_INIT_ERR_MODE_FAILOVER_MASK) &&
11461157
ucp_worker_keepalive_is_enabled(worker))) {
11471158
/* If resolving remote ID forced by configuration or PEER_FAILURE
11481159
* and keepalive were requested, resolve remote endpoint ID prior to
@@ -1169,11 +1180,7 @@ ucp_ep_create_api_to_worker_addr(ucp_worker_h worker,
11691180
static void ucp_ep_params_check_err_handling(ucp_ep_h ep,
11701181
const ucp_ep_params_t *params)
11711182
{
1172-
ucp_err_handling_mode_t err_mode =
1173-
UCP_PARAM_VALUE(EP, params, err_mode, ERR_HANDLING_MODE,
1174-
UCP_ERR_HANDLING_MODE_NONE);
1175-
1176-
if (err_mode == UCP_ERR_HANDLING_MODE_NONE) {
1183+
if (ucp_ep_params_err_handling_mode(params) == UCP_ERR_HANDLING_MODE_NONE) {
11771184
return;
11781185
}
11791186

@@ -1193,13 +1200,6 @@ ucs_status_t ucp_ep_create(ucp_worker_h worker, const ucp_ep_params_t *params,
11931200
unsigned flags = UCP_PARAM_VALUE(EP, params, flags, FLAGS, 0);
11941201
ucs_status_t status;
11951202

1196-
/* TODO: Implement failover error handling mode */
1197-
if (UCP_PARAM_VALUE(EP, params, err_mode, ERR_HANDLING_MODE,
1198-
UCP_ERR_HANDLING_MODE_NONE) == UCP_ERR_HANDLING_MODE_FAILOVER) {
1199-
ucs_error("failover error handling mode is not implemented");
1200-
return UCS_ERR_NOT_IMPLEMENTED;
1201-
}
1202-
12031203
UCS_ASYNC_BLOCK(&worker->async);
12041204

12051205
if (flags & UCP_EP_PARAMS_FLAGS_CLIENT_SERVER) {
@@ -1413,10 +1413,9 @@ static void ucp_ep_failed_destroy(uct_ep_h ep)
14131413

14141414
static void ucp_ep_discard_lanes(ucp_ep_h ep, ucs_status_t discard_status)
14151415
{
1416-
unsigned ep_flush_flags = (ucp_ep_config(ep)->key.err_mode ==
1417-
UCP_ERR_HANDLING_MODE_NONE) ?
1418-
UCT_FLUSH_FLAG_LOCAL :
1419-
UCT_FLUSH_FLAG_CANCEL;
1416+
unsigned ep_flush_flags = ucp_ep_config_err_handling_enabled(ep) ?
1417+
UCT_FLUSH_FLAG_CANCEL :
1418+
UCT_FLUSH_FLAG_LOCAL;
14201419
uct_ep_h uct_eps[UCP_MAX_LANES] = { NULL };
14211420
ucp_ep_discard_lanes_arg_t *discard_arg;
14221421
ucs_status_t status;
@@ -1475,7 +1474,6 @@ ucp_ep_set_failed(ucp_ep_h ucp_ep, ucp_lane_index_t lane, ucs_status_t status)
14751474
{
14761475
UCS_STRING_BUFFER_ONSTACK(lane_info_strb, 64);
14771476
ucp_ep_ext_t *ep_ext = ucp_ep->ext;
1478-
ucp_err_handling_mode_t err_mode;
14791477
ucs_log_level_t log_level;
14801478
ucp_request_t *close_req;
14811479

@@ -1516,10 +1514,8 @@ ucp_ep_set_failed(ucp_ep_h ucp_ep, ucp_lane_index_t lane, ucs_status_t status)
15161514
} else if (ep_ext->err_cb == NULL) {
15171515
/* Print error if user requested error handling support but did not
15181516
install a valid error handling callback */
1519-
err_mode = ucp_ep_config(ucp_ep)->key.err_mode;
1520-
log_level = (err_mode == UCP_ERR_HANDLING_MODE_NONE) ?
1521-
UCS_LOG_LEVEL_DIAG :
1522-
UCS_LOG_LEVEL_ERROR;
1517+
log_level = ucp_ep_config_err_handling_enabled(ucp_ep) ?
1518+
UCS_LOG_LEVEL_ERROR : UCS_LOG_LEVEL_DIAG;
15231519

15241520
ucp_ep_get_lane_info_str(ucp_ep, lane, &lane_info_strb);
15251521
ucs_log(log_level,
@@ -1755,7 +1751,7 @@ ucs_status_ptr_t ucp_ep_close_nbx(ucp_ep_h ep, const ucp_request_param_t *param)
17551751
ucp_request_t *close_req;
17561752

17571753
if ((ucp_request_param_flags(param) & UCP_EP_CLOSE_FLAG_FORCE) &&
1758-
(ucp_ep_config(ep)->key.err_mode != UCP_ERR_HANDLING_MODE_PEER)) {
1754+
!ucp_ep_config_err_handling_enabled(ep)) {
17591755
return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM);
17601756
}
17611757

@@ -2683,7 +2679,8 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config,
26832679
config->md_index[lane] = context->tl_rscs[rsc_index].md_index;
26842680
if (ucp_ep_config_connect_p2p(worker, &config->key, rsc_index)) {
26852681
config->p2p_lanes |= UCS_BIT(lane);
2686-
} else if (config->key.err_mode == UCP_ERR_HANDLING_MODE_PEER) {
2682+
} else if ((config->key.err_mode == UCP_ERR_HANDLING_MODE_PEER) ||
2683+
(config->key.err_mode == UCP_ERR_HANDLING_MODE_FAILOVER)) {
26872684
config->uct_rkey_pack_flags |= UCT_MD_MKEY_PACK_FLAG_INVALIDATE_RMA;
26882685
}
26892686

@@ -3969,3 +3966,17 @@ void ucp_ep_set_cfg_index(ucp_ep_h ep, ucp_worker_cfg_index_t cfg_index)
39693966
ucp_ep_config_activate_worker_ifaces(ep->worker, cfg_index);
39703967
ucp_ep_config_proto_init(ep->worker, cfg_index);
39713968
}
3969+
3970+
unsigned ucp_ep_err_mode_init_flags(ucp_err_handling_mode_t err_mode)
3971+
{
3972+
switch (err_mode) {
3973+
case UCP_ERR_HANDLING_MODE_NONE:
3974+
return 0;
3975+
case UCP_ERR_HANDLING_MODE_PEER:
3976+
return UCP_EP_INIT_ERR_MODE_PEER_FAILURE;
3977+
case UCP_ERR_HANDLING_MODE_FAILOVER:
3978+
return UCP_EP_INIT_ERR_MODE_FAILOVER_MASK;
3979+
default:
3980+
ucs_fatal("invalid error handling mode: %d", err_mode);
3981+
}
3982+
}

src/ucp/core/ucp_ep.h

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,16 @@ enum {
165165
UCP_EP_INIT_CREATE_AM_LANE_ONLY = UCS_BIT(8), /**< Endpoint requires an AM lane only */
166166
UCP_EP_INIT_KA_FROM_EXIST_LANES = UCS_BIT(9), /**< Use only existing lanes to create
167167
keepalive lane */
168-
UCP_EP_INIT_ALLOW_AM_AUX_TL = UCS_BIT(10) /**< Endpoint allows selecting of auxiliary
168+
UCP_EP_INIT_ALLOW_AM_AUX_TL = UCS_BIT(10), /**< Endpoint allows selecting of auxiliary
169169
transports for AM lane */
170+
UCP_EP_INIT_ERR_MODE_FAILOVER = UCS_BIT(11), /**< Endpoint requires an
171+
@ref UCP_ERR_HANDLING_MODE_FAILOVER */
172+
173+
/**
174+
* For consistency with @ref UCP_SA_DATA_MASK_ERR_MODE_FAILOVER
175+
*/
176+
UCP_EP_INIT_ERR_MODE_FAILOVER_MASK = UCP_EP_INIT_ERR_MODE_PEER_FAILURE |
177+
UCP_EP_INIT_ERR_MODE_FAILOVER
170178
};
171179

172180

@@ -617,8 +625,19 @@ enum {
617625
* ucp_wireup_sockaddr_data_base_t structure.
618626
*/
619627
enum {
620-
/* Indicates support of UCP_ERR_HANDLING_MODE_PEER error mode. */
621-
UCP_SA_DATA_FLAG_ERR_MODE_PEER = UCS_BIT(0)
628+
/* Indicates support of @ref UCP_ERR_HANDLING_MODE_PEER error mode. */
629+
UCP_SA_DATA_FLAG_ERR_MODE_PEER = UCS_BIT(0),
630+
/* Indicates support of @ref UCP_ERR_HANDLING_MODE_FAILOVER error mode.
631+
* NOTE: use @ref UCP_SA_DATA_MASK_ERR_MODE_FAILOVER for backward
632+
* compatibility to fallback peer failure mode to
633+
* @ref UCP_ERR_HANDLING_MODE_PEER */
634+
UCP_SA_DATA_FLAG_ERR_MODE_FAILOVER = UCS_BIT(1),
635+
636+
/**
637+
* Backward compatibility mask
638+
*/
639+
UCP_SA_DATA_MASK_ERR_MODE_FAILOVER = UCP_SA_DATA_FLAG_ERR_MODE_PEER |
640+
UCP_SA_DATA_FLAG_ERR_MODE_FAILOVER
622641
};
623642

624643

src/ucp/core/ucp_ep.inl

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,4 +289,17 @@ ucp_ep_config_err_mode_eq(ucp_ep_h ep, ucp_err_handling_mode_t err_mode)
289289
return ucp_ep_config(ep)->key.err_mode == err_mode;
290290
}
291291

292+
static UCS_F_ALWAYS_INLINE ucp_err_handling_mode_t
293+
ucp_ep_params_err_handling_mode(const ucp_ep_params_t *params)
294+
{
295+
return UCP_PARAM_VALUE(EP, params, err_mode, ERR_HANDLING_MODE,
296+
UCP_ERR_HANDLING_MODE_NONE);
297+
}
298+
299+
static UCS_F_ALWAYS_INLINE int ucp_ep_config_err_handling_enabled(ucp_ep_h ep)
300+
{
301+
return ucp_ep_config_err_mode_eq(ep, UCP_ERR_HANDLING_MODE_PEER) ||
302+
ucp_ep_config_err_mode_eq(ep, UCP_ERR_HANDLING_MODE_FAILOVER);
303+
}
304+
292305
#endif

src/ucp/core/ucp_ep_vfs.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,9 @@ static const ucp_ep_vfs_attr_t ucp_ep_vfs_attrs[] = {
3434
};
3535

3636
static const char *ucp_err_handling_mode_names[] = {
37-
[UCP_ERR_HANDLING_MODE_NONE] = "none",
38-
[UCP_ERR_HANDLING_MODE_PEER] = "peer"
37+
[UCP_ERR_HANDLING_MODE_NONE] = "none",
38+
[UCP_ERR_HANDLING_MODE_PEER] = "peer",
39+
[UCP_ERR_HANDLING_MODE_FAILOVER] = "failover"
3940
};
4041

4142
static void ucp_ep_vfs_read_peer_name(void *obj, ucs_string_buffer_t *strb,

src/ucp/core/ucp_request.c

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -397,14 +397,13 @@ static ucp_md_map_t ucp_request_get_invalidation_map(ucp_ep_h ep)
397397

398398
int ucp_request_memh_invalidate(ucp_request_t *req, ucs_status_t status)
399399
{
400-
ucp_ep_h ep = req->send.ep;
401-
ucp_err_handling_mode_t err_mode = ucp_ep_config(ep)->key.err_mode;
402-
ucp_worker_h worker = ep->worker;
403-
ucp_context_h context = worker->context;
400+
ucp_ep_h ep = req->send.ep;
401+
ucp_worker_h worker = ep->worker;
402+
ucp_context_h context = worker->context;
404403
ucp_mem_h *memh_p;
405404
ucp_md_map_t invalidate_map;
406405

407-
if ((err_mode != UCP_ERR_HANDLING_MODE_PEER) ||
406+
if (ucp_ep_config_err_mode_eq(ep, UCP_ERR_HANDLING_MODE_NONE) ||
408407
!(req->flags & UCP_REQUEST_FLAG_RKEY_INUSE)) {
409408
return 0;
410409
}

src/ucp/proto/proto_common.inl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -359,8 +359,7 @@ ucp_proto_request_pack_rkey(ucp_request_t *req, ucp_md_map_t md_map,
359359
/* Since global VA registration doesn't support invalidation yet, and error
360360
* handling is enabled on this EP, we replace GVA registrations with
361361
* regular ones */
362-
if (ucp_ep_config_err_mode_eq(req->send.ep,
363-
UCP_ERR_HANDLING_MODE_PEER) &&
362+
if (ucp_ep_config_err_handling_enabled(req->send.ep) &&
364363
ucs_unlikely(memh->flags & UCP_MEMH_FLAG_HAS_AUTO_GVA)) {
365364
ucp_memh_disable_gva(memh, md_map);
366365
}

src/ucp/rma/flush.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,8 @@ ucp_ep_flush_request_update_uct_comp(ucp_request_t *req, int diff,
4444
static void ucp_ep_flush_error(ucp_request_t *req, ucp_lane_index_t lane,
4545
ucs_status_t status)
4646
{
47-
ucs_log_level_t level = (ucp_ep_config(req->send.ep)->key.err_mode ==
48-
UCP_ERR_HANDLING_MODE_PEER) ?
49-
UCS_LOG_LEVEL_TRACE_REQ : UCS_LOG_LEVEL_ERROR;
47+
ucs_log_level_t level = ucp_ep_config_err_handling_enabled(req->send.ep) ?
48+
UCS_LOG_LEVEL_TRACE_REQ : UCS_LOG_LEVEL_ERROR;
5049

5150
ucs_assertv(lane != UCP_NULL_LANE, "req=%p ep=%p lane=%d status=%s",
5251
req, req->send.ep, lane, ucs_status_string(status));

src/ucp/wireup/wireup.c

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -579,13 +579,6 @@ void ucp_wireup_remote_connected(ucp_ep_h ep)
579579
ucs_assert(ep->flags & UCP_EP_FLAG_REMOTE_ID);
580580
}
581581

582-
static UCS_F_ALWAYS_INLINE unsigned
583-
ucp_ep_err_mode_init_flags(ucp_err_handling_mode_t err_mode)
584-
{
585-
return (err_mode == UCP_ERR_HANDLING_MODE_PEER) ?
586-
UCP_EP_INIT_ERR_MODE_PEER_FAILURE : 0;
587-
}
588-
589582
static UCS_F_NOINLINE void
590583
ucp_wireup_process_pre_request(ucp_worker_h worker, ucp_ep_h ep,
591584
const ucp_wireup_msg_t *msg,
@@ -2228,13 +2221,6 @@ static void ucp_wireup_msg_dump(ucp_worker_h worker, uct_am_trace_type_t type,
22282221
ucs_free(unpacked_address.address_list);
22292222
}
22302223

2231-
static ucp_err_handling_mode_t
2232-
ucp_ep_params_err_handling_mode(const ucp_ep_params_t *params)
2233-
{
2234-
return (params->field_mask & UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE) ?
2235-
params->err_mode : UCP_ERR_HANDLING_MODE_NONE;
2236-
}
2237-
22382224
static unsigned
22392225
ucp_cm_ep_init_flags(const ucp_ep_params_t *params)
22402226
{
@@ -2259,11 +2245,8 @@ unsigned ucp_ep_init_flags(const ucp_worker_h worker,
22592245
flags |= UCP_EP_INIT_CREATE_AM_LANE;
22602246
}
22612247

2262-
if (ucp_ep_params_err_handling_mode(params) == UCP_ERR_HANDLING_MODE_PEER) {
2263-
flags |= UCP_EP_INIT_ERR_MODE_PEER_FAILURE;
2264-
}
2265-
2266-
return flags;
2248+
return flags |
2249+
ucp_ep_err_mode_init_flags(ucp_ep_params_err_handling_mode(params));
22672250
}
22682251

22692252
double ucp_wireup_iface_lat_distance_v1(const ucp_worker_iface_t *wiface)

src/ucp/wireup/wireup.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,17 @@ void ucp_wireup_remote_connected(ucp_ep_h ep);
202202
unsigned ucp_ep_init_flags(const ucp_worker_h worker,
203203
const ucp_ep_params_t *params);
204204

205+
206+
/**
207+
* @brief Convert error handling mode to endpoint initialization flags.
208+
*
209+
* @param [in] err_mode Error handling mode.
210+
*
211+
* @return Endpoint initialization flags.
212+
*/
213+
unsigned ucp_ep_err_mode_init_flags(ucp_err_handling_mode_t err_mode);
214+
215+
205216
int ucp_wireup_connect_p2p(ucp_worker_h worker, ucp_rsc_index_t rsc_index,
206217
int has_cm_lane);
207218

src/ucp/wireup/wireup_cm.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,8 +263,10 @@ ucp_cm_ep_sa_data_pack(ucp_ep_h ep, ucp_wireup_sockaddr_data_base_t *sa_data,
263263
"sa_data version: %u", sa_data_version);
264264
sa_data->header = UCP_OBJECT_VERSION_V2 <<
265265
UCP_SA_DATA_HEADER_VERSION_SHIFT;
266-
if (ucp_ep_config(ep)->key.err_mode == UCP_ERR_HANDLING_MODE_PEER) {
266+
if (ucp_ep_config_err_mode_eq(ep, UCP_ERR_HANDLING_MODE_PEER)) {
267267
sa_data->header |= UCP_SA_DATA_FLAG_ERR_MODE_PEER;
268+
} else if (ucp_ep_config_err_mode_eq(ep, UCP_ERR_HANDLING_MODE_FAILOVER)) {
269+
sa_data->header |= UCP_SA_DATA_MASK_ERR_MODE_FAILOVER;
268270
}
269271

270272
return sa_data + 1;

0 commit comments

Comments
 (0)