Skip to content

Commit ee85204

Browse files
Thananon PatinyasakdikulThananon Patinyasakdikul
authored andcommitted
Added MPI_THREAD_MULTIPLE support for btl/usnic.
1 parent 80e362d commit ee85204

File tree

8 files changed

+78
-6
lines changed

8 files changed

+78
-6
lines changed

opal/mca/btl/usnic/README.txt

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,3 +335,40 @@ libfabric abstractions:
335335
fi_fabric: corresponds to a VIC PF
336336
fi_domain: corresponds to a VIC VF
337337
fi_endpoint: resources inside the VIC VF (basically a QP)
338+
339+
======================================
340+
341+
MPI_THREAD_MULTIPLE support
342+
343+
In order to make the usnic BTL thread-safe, mutex locks are used
344+
to protect the critical path, i.e., libfabric routines, bookkeeping, etc.
345+
346+
The said lock is btl_usnic_lock. It is a RECURSIVE lock, meaning that
347+
the same thread can take the lock again even if it already holds the lock, to
348+
allow the callback function to post another segment right away if we know
349+
that the current segment is completed inline. (So we can call send in send
350+
without deadlocking)
351+
352+
These two functions take care of hotel checkin/checkout, and we
353+
have to protect that part. So we take the mutex lock before we enter the
354+
function.
355+
356+
- opal_btl_usnic_check_rts()
357+
- opal_btl_usnic_handle_ack()
358+
359+
The calls to libfabric routines in the following functions
360+
361+
- opal_btl_usnic_endpoint_send_segment() (fi_send)
362+
- opal_btl_usnic_recv_call() (fi_recvmsg)
363+
364+
have to be protected as well.
365+
366+
Also, cclient connectivity checking (opal_btl_usnic_connectivity_ping) has to be
367+
protected. This happens only at the beginning, but cclient communicates with cagent
368+
through opal_fd_read/write() and if two or more clients do opal_fd_write() at the
369+
same time, the data might be corrupted.
370+
371+
With this concept, many functions in btl/usnic that make calls to the
372+
listed functions are protected by OPAL_THREAD_LOCK macro which will only
373+
be active if the user calls MPI_Init_thread() with MPI_THREAD_MULTIPLE
374+
support.

opal/mca/btl/usnic/btl_usnic_cclient.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,8 @@ int opal_btl_usnic_connectivity_ping(uint32_t src_ipv4_addr, int src_port,
228228
uint32_t dest_netmask, int dest_port,
229229
char *dest_nodename,
230230
size_t max_msg_size)
231-
{
231+
{
232+
OPAL_THREAD_LOCK(&btl_usnic_lock);
232233
/* If connectivity checking is not enabled, do nothing */
233234
if (!mca_btl_usnic_component.connectivity_enabled) {
234235
return OPAL_SUCCESS;
@@ -259,6 +260,7 @@ int opal_btl_usnic_connectivity_ping(uint32_t src_ipv4_addr, int src_port,
259260
ABORT("usnic connectivity client IPC write failed");
260261
/* Will not return */
261262
}
263+
OPAL_THREAD_UNLOCK(&btl_usnic_lock);
262264

263265
return OPAL_SUCCESS;
264266
}

opal/mca/btl/usnic/btl_usnic_compat.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,7 @@ opal_btl_usnic_prepare_src(
509509
size_t* size,
510510
uint32_t flags)
511511
{
512+
OPAL_THREAD_LOCK(&btl_usnic_lock);
512513
opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module;
513514
opal_btl_usnic_send_frag_t *frag;
514515
uint32_t payload_len;
@@ -552,6 +553,7 @@ opal_btl_usnic_prepare_src(
552553
#endif
553554
#endif
554555

556+
OPAL_THREAD_UNLOCK(&btl_usnic_lock);
555557
return &frag->sf_base.uf_base;
556558
}
557559

opal/mca/btl/usnic/btl_usnic_component.c

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,9 @@
8686

8787
#define OPAL_BTL_USNIC_NUM_COMPLETIONS 500
8888

89+
/* MPI_THREAD_MULTIPLE_SUPPORT */
90+
opal_recursive_mutex_t btl_usnic_lock;
91+
8992
/* RNG buffer definition */
9093
opal_rng_buff_t opal_btl_usnic_rand_buff = {0};
9194

@@ -222,6 +225,8 @@ static int usnic_component_close(void)
222225
opal_btl_usnic_cleanup_tests();
223226
#endif
224227

228+
OBJ_DESTRUCT(&btl_usnic_lock);
229+
225230
return OPAL_SUCCESS;
226231
}
227232

@@ -615,13 +620,22 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
615620

616621
*num_btl_modules = 0;
617622

618-
/* Currently refuse to run if MPI_THREAD_MULTIPLE is enabled */
623+
/* MPI_THREAD_MULTIPLE is only supported in 2.0+ */
619624
if (want_mpi_threads && !mca_btl_base_thread_multiple_override) {
620-
opal_output_verbose(5, USNIC_OUT,
621-
"btl:usnic: MPI_THREAD_MULTIPLE not supported; skipping this component");
622-
return NULL;
625+
626+
if (OMPI_MAJOR_VERSION >= 2) {
627+
opal_output_verbose(5, USNIC_OUT,
628+
"btl:usnic: MPI_THREAD_MULTIPLE support is in testing phase.");
629+
}
630+
else {
631+
opal_output_verbose(5, USNIC_OUT,
632+
"btl:usnic: MPI_THREAD_MULTIPLE is not supported in version < 2.");
633+
return NULL;
634+
}
623635
}
624636

637+
OBJ_CONSTRUCT(&btl_usnic_lock, opal_recursive_mutex_t);
638+
625639
/* We only want providers named "usnic" that are of type EP_DGRAM */
626640
fabric_attr.prov_name = "usnic";
627641
ep_attr.type = FI_EP_DGRAM;
@@ -1151,6 +1165,8 @@ static int usnic_handle_completion(
11511165
/* Make the completion be Valgrind-defined */
11521166
opal_memchecker_base_mem_defined(seg, sizeof(*seg));
11531167

1168+
OPAL_THREAD_LOCK(&btl_usnic_lock);
1169+
11541170
/* Handle work completions */
11551171
switch(seg->us_type) {
11561172

@@ -1181,6 +1197,8 @@ static int usnic_handle_completion(
11811197
BTL_ERROR(("Unhandled completion segment type %d", seg->us_type));
11821198
break;
11831199
}
1200+
1201+
OPAL_THREAD_UNLOCK(&btl_usnic_lock);
11841202
return 1;
11851203
}
11861204

opal/mca/btl/usnic/btl_usnic_module.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1086,6 +1086,7 @@ opal_btl_usnic_module_progress_sends(
10861086
/*
10871087
* Handle all the retransmits we can
10881088
*/
1089+
OPAL_THREAD_LOCK(&btl_usnic_lock);
10891090
if (OPAL_UNLIKELY(!opal_list_is_empty(&module->pending_resend_segs))) {
10901091
usnic_do_resends(module);
10911092
}
@@ -1195,6 +1196,7 @@ opal_btl_usnic_module_progress_sends(
11951196

11961197
endpoint = next_endpoint;
11971198
}
1199+
OPAL_THREAD_UNLOCK(&btl_usnic_lock);
11981200
}
11991201

12001202
/*
@@ -1229,6 +1231,7 @@ usnic_send(
12291231
opal_btl_usnic_module_t *module;
12301232
opal_btl_usnic_send_segment_t *sseg;
12311233

1234+
OPAL_THREAD_LOCK(&btl_usnic_lock);
12321235
endpoint = (opal_btl_usnic_endpoint_t *)base_endpoint;
12331236
module = (opal_btl_usnic_module_t *)base_module;
12341237
frag = (opal_btl_usnic_send_frag_t*) descriptor;
@@ -1337,6 +1340,7 @@ usnic_send(
13371340

13381341
++module->stats.pml_module_sends;
13391342

1343+
OPAL_THREAD_UNLOCK(&btl_usnic_lock);
13401344
return rc;
13411345
}
13421346

opal/mca/btl/usnic/btl_usnic_module.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,12 @@
5353

5454
BEGIN_C_DECLS
5555

56+
/*
57+
* MPI_THREAD_MULTIPLE support
58+
*/
59+
extern opal_recursive_mutex_t btl_usnic_lock;
60+
61+
5662
/*
5763
* Forward declarations to avoid include loops
5864
*/

opal/mca/btl/usnic/btl_usnic_recv.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -340,8 +340,9 @@ void opal_btl_usnic_recv_call(opal_btl_usnic_module_t *module,
340340
opal_output(0, " Received ACK for sequence number %" UDSEQ " from %s to %s\n",
341341
bseg->us_btl_header->ack_seq, remote_ip, local_ip);
342342
#endif
343+
OPAL_THREAD_LOCK(&btl_usnic_lock);
343344
opal_btl_usnic_handle_ack(endpoint, ack_seq);
344-
345+
OPAL_THREAD_UNLOCK(&btl_usnic_lock);
345346
goto repost;
346347
}
347348

opal/mca/btl/usnic/btl_usnic_recv.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,8 +157,10 @@ opal_btl_usnic_check_rx_seq(
157157
#if MSGDEBUG1
158158
opal_output(0, "Handle piggy-packed ACK seq %"UDSEQ"\n", seg->rs_base.us_btl_header->ack_seq);
159159
#endif
160+
OPAL_THREAD_LOCK(&btl_usnic_lock);
160161
opal_btl_usnic_handle_ack(endpoint,
161162
seg->rs_base.us_btl_header->ack_seq);
163+
OPAL_THREAD_UNLOCK(&btl_usnic_lock);
162164
}
163165

164166
/* Do we have room in the endpoint's receiver window?

0 commit comments

Comments
 (0)