Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions man/fi_cxi.7.md
Original file line number Diff line number Diff line change
Expand Up @@ -1314,6 +1314,25 @@ The CXI provider checks for the following environment variables:
: Enable enforcement of triggered operation limit. Doing this can prevent
fi_control(FI_QUEUE_WORK) deadlocking at the cost of performance.

*FI_CXI_ENABLE_WRITEDATA*
: Controls provider support for the fi_writedata() and fi_inject_writedata() RMA
operations. When this variable is enabled and the domain attribute cq_data_size
is non-zero, the CXI provider generates solicited RMA completions that carry
the immediate data. These completions have FI_REMOTE_CQ_DATA set and report
source information when FI_SOURCE is enabled; if the source address cannot be
resolved, FI_SOURCE_ERR behavior is followed.

Note that the FI_RX_CQ_DATA mode bit is not required: writedata RMA operations
do not consume posted receive buffers on the target. The feature is available
only when the domain and endpoint support remote CQ data (for example, a
non-zero domain_attr->cq_data_size in the libfabric API). Internally, the
combination of domain and endpoint cq_data_size sets rma_cq_data_size. Only
provider MR keys (FI_MR_PROV_KEY) are supported. While the feature is active,
provider key caching and optimized MRs are disabled for the domain.

This option is disabled by default; enable it only when applications require
immediate-data delivery on RMA completions or for controlled testing and
debugging.

*FI_CXI_MR_CACHE_EVENTS_DISABLE_POLL_NSECS*
: Max amount of time to poll when disabling an MR configured with MR match events.

Expand Down
3 changes: 2 additions & 1 deletion prov/cxi/Makefile.include
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ nodist_prov_cxi_test_cxitest_SOURCES = \
prov/cxi/test/fork.c \
prov/cxi/test/mem_reg.c \
prov/cxi/test/nic.c \
prov/cxi/test/mr_cache.c
prov/cxi/test/mr_cache.c \
prov/cxi/test/writedata.c

prov_cxi_test_cxitest_CPPFLAGS = $(AM_CPPFLAGS) $(cxi_CPPFLAGS) \
$(cxitest_CPPFLAGS) $(PTHREAD_CFLAGS)
Expand Down
49 changes: 43 additions & 6 deletions prov/cxi/include/cxip.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@
#define CXIP_DEFAULT_RX_SIZE 1024U

#define CXIP_MAJOR_VERSION 0
#define CXIP_MINOR_VERSION 1
#define CXIP_MINOR_VERSION 2
#define CXIP_PROV_VERSION FI_VERSION(CXIP_MAJOR_VERSION, \
CXIP_MINOR_VERSION)
#define CXIP_FI_VERSION FI_VERSION(2, 4)
Expand Down Expand Up @@ -358,6 +358,7 @@ struct cxip_environment {
int force_dev_reg_copy;
enum cxip_mr_target_ordering mr_target_ordering;
int disable_cuda_sync_memops;
int enable_writedata;
};

extern struct cxip_environment cxip_env;
Expand Down Expand Up @@ -521,7 +522,8 @@ struct cxip_mr_key {
* it repeated.
*/
uint64_t id : 16; /* Unique - 64K MR */
uint64_t seqnum : 44; /* Sequence with random seed */
uint64_t seqnum : 43; /* Sequence with random seed */
uint64_t sol_event : 1; /* For FI_WRITEDATA dual entry */
uint64_t events : 1; /* Requires event generation */
uint64_t unused3: 2;
uint64_t is_prov: 1;
Expand Down Expand Up @@ -699,7 +701,24 @@ union cxip_match_bits {
uint64_t raw;
};
#define CXIP_IS_PROV_MR_KEY_BIT (1ULL << 63)
#define CXIP_KEY_MATCH_BITS(key) ((key) & ~CXIP_IS_PROV_MR_KEY_BIT)
#define CXIP_SOL_NUM_MR_KEY_BIT (1ULL << 59)
#define CXIP_EVENTS_MR_KEY_BIT (1ULL << 60) /* 'events' field - request comm event generation */
#define CXIP_KEY_MATCH_BITS(key) ((key) & ~(CXIP_IS_PROV_MR_KEY_BIT | CXIP_SOL_NUM_MR_KEY_BIT))

static inline uint64_t cxip_key_set_writedata(uint64_t key)
{
struct cxip_mr_key cxip_key = { .raw = key };

/* Provider keys only: non-cached provider keys support writedata.
* Set sol_event (bit 59) for writedata LE match and events (bit 60)
* for target comm event generation.
*/
if (cxip_key.is_prov && !cxip_key.cached) {
/* is_prov bit (63) masked out, preserving 60:59. */
return key | CXIP_SOL_NUM_MR_KEY_BIT | CXIP_EVENTS_MR_KEY_BIT;
}
return key;
}

/* libcxi Wrapper Structures */

Expand Down Expand Up @@ -908,6 +927,20 @@ struct cxip_domain {

uint32_t tclass;

/* CQ data sizes for remote CQ data support:
* - msg_cq_data_size: for messaging operations (FI_REMOTE_CQ_DATA in msg ops)
* - rma_cq_data_size: for RMA writedata operations (fi_writedata/fi_inject_writedata)
* These are set separately to allow messaging to use remote CQ data without
* forcing RMA to enable writedata support.
*/
size_t msg_cq_data_size;
size_t rma_cq_data_size;

/* Legacy cq_data_size field - now derived from msg_cq_data_size and rma_cq_data_size.
* Set to non-zero if either messaging or RMA supports remote CQ data.
*/
size_t cq_data_size;

struct cxip_eq *eq; //unused
struct cxip_eq *mr_eq; //unused

Expand Down Expand Up @@ -2698,9 +2731,10 @@ struct cxip_mr {
struct fi_mr_attr attr; // attributes
struct cxip_cntr *cntr; // if bound to cntr

/* Indicates if FI_RMA_EVENT was specified at creation and
* will be used to enable fi_writedata() and fi_inject_writedata()
* support for this MR (TODO).
/* Indicates if FI_RMA_EVENT was specified at creation.
* This enables remote counter events for this MR.
* Note: fi_writedata() support is controlled by domain->rma_cq_data_size,
* not by FI_RMA_EVENT or this flag.
*/
bool rma_events;

Expand All @@ -2720,9 +2754,12 @@ struct cxip_mr {
struct cxip_mr_util_ops *mr_util;
bool enabled;
struct cxip_pte *pte;
struct cxip_pte *writedata_pte; // Second PTE for FI_WRITEDATA dual entry
enum cxip_mr_state mr_state;
enum cxip_mr_state writedata_mr_state; // State for writedata PTE
int64_t mr_id; // Non-cached provider key uniqueness
struct cxip_ctrl_req req;
struct cxip_ctrl_req writedata_req; // Control req for writedata PTE
bool optimized;

void *buf; // memory buffer VA
Expand Down
54 changes: 52 additions & 2 deletions prov/cxi/src/cxip_dom.c
Original file line number Diff line number Diff line change
Expand Up @@ -389,8 +389,11 @@ int cxip_domain_prov_mr_id_alloc(struct cxip_domain *dom,
*/
key.events = mr->count_events || mr->rma_events || mr->cntr;

key.opt = dom->optimized_mrs &&
key.id < CXIP_PTL_IDX_PROV_MR_OPT_CNT;
/* Force unoptimized keys for RMA events (fi_writedata support).
* Optimized MRs do not support header_data delivery in target events.
*/
key.opt = mr->rma_events || mr->domain->rma_cq_data_size ? false :
(dom->optimized_mrs && key.id < CXIP_PTL_IDX_PROV_MR_OPT_CNT);
mr->key = key.raw;
ofi_spin_unlock(&dom->ctrl_id_lock);

Expand Down Expand Up @@ -2005,6 +2008,32 @@ int cxip_domain(struct fid_fabric *fabric, struct fi_info *info,
cxi_domain->tclass = FI_TC_BEST_EFFORT;
}

/* Initialize CQ data sizes for messaging and RMA separately.
* Both default to info->domain_attr->cq_data_size initially, but can be
* controlled independently. This allows messaging to use remote CQ data
* without forcing RMA writedata support.
*
* msg_cq_data_size: for FI_REMOTE_CQ_DATA in messaging operations
* rma_cq_data_size: for fi_writedata/fi_inject_writedata operations
*/
cxi_domain->msg_cq_data_size = info->domain_attr->cq_data_size;

if (cxip_env.enable_writedata && info->domain_attr->cq_data_size) {
if (cxi_domain->util_domain.mr_mode & FI_MR_PROV_KEY) {
cxi_domain->rma_cq_data_size = info->domain_attr->cq_data_size;
} else {
CXIP_WARN("FI_MR_PROV_KEY required for RMA CQ data\n");
cxi_domain->rma_cq_data_size = 0;
}
} else {
cxi_domain->rma_cq_data_size = 0;
}

/* Legacy cq_data_size: non-zero if either msg or RMA supports CQ data */
cxi_domain->cq_data_size = cxi_domain->msg_cq_data_size ||
cxi_domain->rma_cq_data_size ?
info->domain_attr->cq_data_size : 0;

cxi_domain->av_user_id =
!!(cxi_domain->util_domain.info_domain_caps & FI_AV_USER_ID);
cxi_domain->auth_key_entry_max = info->domain_attr->max_ep_auth_key;
Expand Down Expand Up @@ -2069,6 +2098,27 @@ int cxip_domain(struct fid_fabric *fabric, struct fi_info *info,
cxi_domain->rx_match_mode = cxip_env.rx_match_mode;
cxi_domain->msg_offload = cxip_env.msg_offload;
cxi_domain->req_buf_size = cxip_env.req_buf_size;

/* Disable provider key caching and optimized MRs for writedata support.
* Provider keys lack space for sol_event bit in cached encoding.
* Optimized MRs don't support header_data in target events.
* Writedata is only supported with provider keys.
*/
if (cxi_domain->rma_cq_data_size) {
bool disable_cache = cxi_domain->is_prov_key && cxi_domain->prov_key_cache;
bool disable_opt = cxi_domain->optimized_mrs;

if (disable_cache || disable_opt) {
CXIP_DBG("Disabling %s%s%s due to writedata support (rma_cq_data_size=%zu)\n",
disable_cache ? "provider key cache" : "",
(disable_cache && disable_opt) ? " and " : "",
disable_opt ? "optimized MRs" : "",
cxi_domain->rma_cq_data_size);
}

cxi_domain->prov_key_cache = false;
cxi_domain->optimized_mrs = false;
}
*dom = &cxi_domain->util_domain.domain_fid;

return 0;
Expand Down
64 changes: 52 additions & 12 deletions prov/cxi/src/cxip_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,25 @@

#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__)
#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__)
#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_EP_CTRL, __VA_ARGS__)

extern struct fi_ops_rma cxip_ep_rma_ops;
extern struct fi_ops_rma cxip_ep_rma_no_ops;
extern struct fi_ops_rma cxip_ep_rma_writedata_ops;

extern struct fi_ops_msg cxip_ep_msg_ops;
extern struct fi_ops_msg cxip_ep_msg_no_ops;
extern struct fi_ops_msg cxip_ep_msg_no_tx_ops;
extern struct fi_ops_msg cxip_ep_msg_no_rx_ops;
extern struct fi_ops_msg cxip_ep_msg_ops_no_writedata;
extern struct fi_ops_msg cxip_ep_msg_no_rx_ops_no_writedata;

extern struct fi_ops_tagged cxip_ep_tagged_ops;
extern struct fi_ops_tagged cxip_ep_tagged_no_ops;
extern struct fi_ops_tagged cxip_ep_tagged_no_tx_ops;
extern struct fi_ops_tagged cxip_ep_tagged_no_rx_ops;
extern struct fi_ops_tagged cxip_ep_tagged_ops_no_writedata;
extern struct fi_ops_tagged cxip_ep_tagged_no_rx_ops_no_writedata;

extern struct fi_ops_atomic cxip_ep_atomic_ops;
extern struct fi_ops_atomic cxip_ep_atomic_no_ops;
Expand Down Expand Up @@ -717,29 +723,62 @@ static int cxip_ep_enable(struct fid_ep *fid_ep)

/* Enable only appropriate API functions based on primary/secondary
* capabilities. Send/Receive requires FI_MSG or FI_TAGGED.
*
* For FI_TAGGED operations, check whether remote CQ data is supported.
* If tx_attr.caps includes FI_TAGGED and domain->msg_cq_data_size is non-zero,
* use the ops table that implements fi_tsenddata()/fi_tinjectdata().
* Otherwise, use the ops table that stubs them out with
* fi_no_tagged_senddata/fi_no_tagged_injectdata.
*/
if (ofi_send_allowed(ep->tx_attr.caps & ~FI_MSG) &&
ofi_recv_allowed(ep->rx_attr.caps & ~FI_MSG))
ep->ep.tagged = &cxip_ep_tagged_ops;
else if (ofi_send_allowed(ep->tx_attr.caps & ~FI_MSG))
ep->ep.tagged = &cxip_ep_tagged_no_rx_ops;
else if (ofi_recv_allowed(ep->rx_attr.caps & ~FI_MSG))
ofi_recv_allowed(ep->rx_attr.caps & ~FI_MSG)) {
if ((ep->tx_attr.caps & FI_TAGGED) && ep_obj->domain->msg_cq_data_size)
ep->ep.tagged = &cxip_ep_tagged_ops;
else
ep->ep.tagged = &cxip_ep_tagged_ops_no_writedata;
} else if (ofi_send_allowed(ep->tx_attr.caps & ~FI_MSG)) {
if ((ep->tx_attr.caps & FI_TAGGED) && ep_obj->domain->msg_cq_data_size)
ep->ep.tagged = &cxip_ep_tagged_no_rx_ops;
else
ep->ep.tagged = &cxip_ep_tagged_no_rx_ops_no_writedata;
} else if (ofi_recv_allowed(ep->rx_attr.caps & ~FI_MSG)) {
ep->ep.tagged = &cxip_ep_tagged_no_tx_ops;
}

/* For FI_MSG operations, check whether remote CQ data is supported.
* If tx_attr.caps includes FI_MSG and domain->msg_cq_data_size is non-zero,
* use the ops table that implements fi_senddata()/fi_injectdata().
* Otherwise, use the ops table that stubs them out with
* fi_no_msg_senddata/fi_no_msg_injectdata.
*/
if (ofi_send_allowed(ep->tx_attr.caps & ~FI_TAGGED) &&
ofi_recv_allowed(ep->rx_attr.caps & ~FI_TAGGED))
ep->ep.msg = &cxip_ep_msg_ops;
else if (ofi_send_allowed(ep->tx_attr.caps & ~FI_TAGGED))
ep->ep.msg = &cxip_ep_msg_no_rx_ops;
else if (ofi_recv_allowed(ep->rx_attr.caps & ~FI_TAGGED))
ofi_recv_allowed(ep->rx_attr.caps & ~FI_TAGGED)) {
if ((ep->tx_attr.caps & FI_MSG) && ep_obj->domain->msg_cq_data_size)
ep->ep.msg = &cxip_ep_msg_ops;
else
ep->ep.msg = &cxip_ep_msg_ops_no_writedata;
} else if (ofi_send_allowed(ep->tx_attr.caps & ~FI_TAGGED)) {
if ((ep->tx_attr.caps & FI_MSG) && ep_obj->domain->msg_cq_data_size)
ep->ep.msg = &cxip_ep_msg_no_rx_ops;
else
ep->ep.msg = &cxip_ep_msg_no_rx_ops_no_writedata;
} else if (ofi_recv_allowed(ep->rx_attr.caps & ~FI_TAGGED)) {
ep->ep.msg = &cxip_ep_msg_no_tx_ops;
}

/* Initiate requires FI_RMA or FI_ATOMIC */
if (ofi_rma_initiate_allowed(ep->tx_attr.caps & ~FI_RMA))
ep->ep.atomic = &cxip_ep_atomic_ops;

if (ofi_rma_initiate_allowed(ep->tx_attr.caps & ~FI_ATOMIC))
ep->ep.rma = &cxip_ep_rma_ops;
if (ofi_rma_initiate_allowed(ep->tx_attr.caps & ~FI_ATOMIC)) {
/* Select RMA ops variant. Enable writedata/injectdata only if:
* - FI_RMA is present in tx_attr.caps
* - domain has non-zero rma_cq_data_size (remote CQ data supported)
*/
if ((ep->tx_attr.caps & FI_RMA) && ep_obj->domain->rma_cq_data_size) {
ep->ep.rma = &cxip_ep_rma_writedata_ops;
} else {
ep->ep.rma = &cxip_ep_rma_ops;
}
}

ep_obj->enabled = true;
ofi_genlock_unlock(&ep_obj->lock);
Expand Down Expand Up @@ -1608,3 +1647,4 @@ int cxip_endpoint(struct fid_domain *domain, struct fi_info *info,

return FI_SUCCESS;
}

7 changes: 7 additions & 0 deletions prov/cxi/src/cxip_info.c
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,7 @@ struct cxip_environment cxip_env = {
.force_dev_reg_copy = false,
.mr_target_ordering = MR_ORDER_DEFAULT,
.disable_cuda_sync_memops = false,
.enable_writedata = false,
};

static void cxip_env_init(void)
Expand Down Expand Up @@ -957,6 +958,12 @@ static void cxip_env_init(void)
fi_param_get_bool(&cxip_prov, "mr_match_events",
&cxip_env.mr_match_events);

fi_param_define(&cxip_prov, "enable_writedata", FI_PARAM_BOOL,
"Enable dual MR entries for FI_WRITEDATA support (default %d).",
cxip_env.enable_writedata);
fi_param_get_bool(&cxip_prov, "enable_writedata",
&cxip_env.enable_writedata);

fi_param_define(&cxip_prov, "prov_key_cache", FI_PARAM_BOOL,
"Disable caching of FI_MR_PROV_KEY (default %lu).",
&cxip_env.prov_key_cache);
Expand Down
Loading