Skip to content

Commit db9232f

Browse files
authored
Merge pull request #3169 from hjelmn/btl_ugni_2_0
More btl/ugni updates
2 parents 37214ed + 6b210fa commit db9232f

File tree

6 files changed

+41
-10
lines changed

6 files changed

+41
-10
lines changed

opal/mca/btl/ugni/btl_ugni.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@
5353
/** number of rdma completion queue items to remove per progress loop */
5454
#define MCA_BTL_UGNI_COMPLETIONS_PER_LOOP 16
5555

56+
/** how often to check for connection requests (in microseconds) */
57+
#define MCA_BTL_UGNI_CONNECT_USEC 10
58+
5659
/**
5760
* Modex data
5861
*/
@@ -167,6 +170,9 @@ typedef struct mca_btl_ugni_module_t {
167170
gni_ep_handle_t wildcard_ep;
168171
struct mca_btl_base_endpoint_t *local_ep;
169172

173+
volatile int32_t active_datagrams;
174+
opal_event_t connection_event;
175+
170176
struct mca_btl_ugni_endpoint_attr_t wc_remote_attr, wc_local_attr;
171177

172178
gni_cq_handle_t smsg_remote_cq;
@@ -422,6 +428,7 @@ int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_
422428
int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
423429

424430
int mca_btl_ugni_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint);
431+
int mca_btl_ugni_progress_datagram (mca_btl_ugni_device_t *device);
425432

426433
mca_btl_base_descriptor_t *
427434
mca_btl_ugni_alloc(struct mca_btl_base_module_t *btl,

opal/mca/btl/ugni/btl_ugni_add_procs.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,8 @@ int mca_btl_ugni_add_procs (struct mca_btl_base_module_t* btl, size_t nprocs,
156156
mca_btl_ugni_spawn_progress_thread(btl);
157157
}
158158

159+
opal_event_evtimer_add (&ugni_module->connection_event, (&(struct timeval) {.tv_sec = 0, .tv_usec = MCA_BTL_UGNI_CONNECT_USEC}));
160+
159161
ugni_module->initialized = true;
160162
}
161163

opal/mca/btl/ugni/btl_ugni_component.c

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -499,9 +499,9 @@ mca_btl_ugni_component_init (int *num_btl_modules,
499499
return base_modules;
500500
}
501501

502-
static inline int
503-
mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device)
502+
int mca_btl_ugni_progress_datagram (mca_btl_ugni_device_t *device)
504503
{
504+
mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
505505
mca_btl_base_endpoint_t *ep;
506506
gni_ep_handle_t handle;
507507
int count = 0, rc;
@@ -542,6 +542,7 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni
542542
BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));
543543

544544
ep->dg_posted = false;
545+
(void) opal_atomic_add_32 (&ugni_module->active_datagrams, -1);
545546
}
546547

547548
(void) mca_btl_ugni_ep_connect_progress (ep);
@@ -705,16 +706,12 @@ mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
705706
static int mca_btl_ugni_component_progress (void)
706707
{
707708
mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
708-
static volatile int32_t call_count = 0;
709-
int32_t current_call;
710709
int count = 0;
711710

712-
current_call = OPAL_THREAD_ADD32(&call_count, 1);
713-
714711
count += mca_btl_ugni_progress_remote_smsg (ugni_module);
715712

716-
if ((current_call & 0x7) == 0) {
717-
count += mca_btl_ugni_progress_datagram (ugni_module, ugni_module->devices);
713+
if (ugni_module->active_datagrams) {
714+
count += mca_btl_ugni_progress_datagram (ugni_module->devices);
718715
}
719716

720717
for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {

opal/mca/btl/ugni/btl_ugni_endpoint.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,9 @@ static int mca_btl_ugni_directed_ep_post (mca_btl_base_endpoint_t *ep)
311311
rc = GNI_EpPostDataWId (ep->smsg_ep_handle->gni_handle, &ep->mailbox->attr, sizeof (ep->mailbox->attr),
312312
ep->remote_attr, sizeof (*ep->remote_attr),
313313
MCA_BTL_UGNI_CONNECT_DIRECTED_ID | ep->index);
314+
if (OPAL_LIKELY(GNI_RC_SUCCESS == rc)) {
315+
(void) opal_atomic_add_32 (&ugni_module->active_datagrams, 1);
316+
}
314317

315318
return mca_btl_rc_ugni_to_opal (rc);
316319
}

opal/mca/btl/ugni/btl_ugni_module.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,16 @@ mca_btl_ugni_module_t mca_btl_ugni_module = {
6161
}
6262
};
6363

64+
static void *mca_btl_ugni_datagram_event (int foo, short bar, void *arg)
65+
{
66+
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) arg;
67+
mca_btl_ugni_device_t *device = ugni_module->devices;
68+
69+
mca_btl_ugni_progress_datagram (device);
70+
71+
opal_event_evtimer_add (&ugni_module->connection_event, (&(struct timeval) {.tv_sec = 0, .tv_usec = MCA_BTL_UGNI_CONNECT_USEC}));
72+
}
73+
6474
int
6575
mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module)
6676
{
@@ -74,6 +84,10 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module)
7484
ugni_module->initialized = false;
7585
ugni_module->nlocal_procs = 0;
7686
ugni_module->connected_peer_count = 0;
87+
ugni_module->active_datagrams = 0;
88+
89+
opal_event_evtimer_set (opal_sync_event_base, &ugni_module->connection_event,
90+
mca_btl_ugni_datagram_event, ugni_module);
7791

7892
OBJ_CONSTRUCT(&ugni_module->failed_frags, opal_list_t);
7993
OBJ_CONSTRUCT(&ugni_module->failed_frags_lock, opal_mutex_t);
@@ -170,6 +184,8 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl)
170184
if (GNI_RC_SUCCESS != rc) {
171185
BTL_VERBOSE(("btl/ugni error destroying endpoint - %s",gni_err_str[rc]));
172186
}
187+
188+
opal_event_del (&ugni_module->connection_event);
173189
}
174190

175191
for (int i = 0 ; i < MCA_BTL_UGNI_LIST_MAX ; ++i) {

opal/mca/btl/ugni/btl_ugni_send.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,13 @@ int mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
118118
size_t packed_size = payload_size;
119119
int rc;
120120

121+
if (OPAL_UNLIKELY(opal_list_get_size (&endpoint->frag_wait_list))) {
122+
if (NULL != descriptor) {
123+
*descriptor = NULL;
124+
}
125+
return OPAL_ERR_OUT_OF_RESOURCE;
126+
}
127+
121128
do {
122129
BTL_VERBOSE(("btl/ugni isend sending fragment from %d -> %d. length = %" PRIu64
123130
" endoint state %d", OPAL_PROC_MY_NAME.vpid, endpoint->peer_proc->proc_name.vpid,
@@ -134,8 +141,7 @@ int mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl,
134141
}
135142

136143
assert (packed_size == payload_size);
137-
if (OPAL_UNLIKELY(NULL == frag || OPAL_SUCCESS != mca_btl_ugni_check_endpoint_state (endpoint) ||
138-
opal_list_get_size (&endpoint->frag_wait_list))) {
144+
if (OPAL_UNLIKELY(NULL == frag || OPAL_SUCCESS != mca_btl_ugni_check_endpoint_state (endpoint))) {
139145
break;
140146
}
141147

0 commit comments

Comments
 (0)