Skip to content

Commit 8038fac

Browse files
authored
Merge pull request #6844 from adrianreber/check_for_user_ns
Do not use CMA in user namespaces
2 parents a7da93f + fc68d8a commit 8038fac

File tree

4 files changed

+127
-5
lines changed

4 files changed

+127
-5
lines changed

opal/mca/btl/vader/btl_vader.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,12 @@ union vader_modex_t {
8484
void *segment_base;
8585
} xpmem;
8686
#endif
87-
opal_shmem_ds_t seg_ds;
87+
struct vader_modex_other_t {
88+
ino_t user_ns_id;
89+
int seg_ds_size;
90+
/* seg_ds needs to be the last element */
91+
opal_shmem_ds_t seg_ds;
92+
} other;
8893
};
8994

9095
/**
@@ -270,6 +275,8 @@ int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
270275
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
271276
#endif
272277

278+
ino_t mca_btl_vader_get_user_ns_id(void);
279+
273280
int mca_btl_vader_get_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
274281
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
275282
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,

opal/mca/btl/vader/btl_vader_component.c

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,10 @@
4242
#include "btl_vader_fbox.h"
4343
#include "btl_vader_xpmem.h"
4444

45+
#ifdef HAVE_SYS_STAT_H
46+
#include <sys/stat.h>
47+
#endif
48+
4549
#include <sys/mman.h>
4650
#include <fcntl.h>
4751

@@ -351,6 +355,25 @@ static int mca_btl_vader_component_close(void)
351355
return OPAL_SUCCESS;
352356
}
353357

358+
/*
 * mca_btl_vader_get_user_ns_id() tries to determine the user namespace ID
 * of the calling process by calling stat() on "/proc/self/ns/user".
 *
 * Returns the st_ino value reported for that path, which is used as the
 * ID of the user namespace. If stat() fails, '0' is returned.
 */
ino_t mca_btl_vader_get_user_ns_id(void)
{
    struct stat ns_stat;
    int stat_rc = stat("/proc/self/ns/user", &ns_stat);

    /*
     * A failing stat() most likely means an old kernel that does not
     * support namespaces. In that case simply assume all processes are
     * in the same user namespace and report 0.
     */
    return (stat_rc < 0) ? 0 : ns_stat.st_ino;
}
354377
static int mca_btl_base_vader_modex_send (void)
355378
{
356379
union vader_modex_t modex;
@@ -364,8 +387,16 @@ static int mca_btl_base_vader_modex_send (void)
364387
modex_size = sizeof (modex.xpmem);
365388
} else {
366389
#endif
367-
modex_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds);
368-
memmove (&modex.seg_ds, &mca_btl_vader_component.seg_ds, modex_size);
390+
modex.other.seg_ds_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds);
391+
memmove (&modex.other.seg_ds, &mca_btl_vader_component.seg_ds, modex.other.seg_ds_size);
392+
modex.other.user_ns_id = mca_btl_vader_get_user_ns_id();
393+
/*
394+
* If modex.other.user_ns_id is '0' something did not work out
395+
* during user namespace detection. Assuming there are no
396+
* namespaces available it will return '0' for all processes and
397+
* the check later will see '0' everywhere and not disable CMA.
398+
*/
399+
modex_size = sizeof (modex.other);
369400

370401
#if OPAL_BTL_VADER_HAVE_XPMEM
371402
}

opal/mca/btl/vader/btl_vader_module.c

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
*/
2828

2929
#include "opal_config.h"
30+
#include "opal/util/show_help.h"
3031

3132
#include "btl_vader.h"
3233
#include "btl_vader_endpoint.h"
@@ -79,6 +80,28 @@ mca_btl_vader_t mca_btl_vader = {
7980
}
8081
};
8182

83+
/*
84+
* Exit function copied from btl_usnic_util.c
85+
*
86+
* The following comment tells Coverity that this function does not return.
87+
* See https://scan.coverity.com/tune.
88+
*/
89+
90+
/* coverity[+kill] */
91+
static void vader_btl_exit(mca_btl_vader_t *btl)
92+
{
93+
if (NULL != btl && NULL != btl->error_cb) {
94+
btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
95+
(opal_proc_t*) opal_proc_local_get(),
96+
"The vader BTL is aborting the MPI job (via PML error callback).");
97+
}
98+
99+
/* If the PML error callback returns (or if there wasn't one), just exit. Shrug. */
100+
fprintf(stderr, "*** The Open MPI vader BTL is aborting the MPI job (via exit(3)).\n");
101+
fflush(stderr);
102+
exit(1);
103+
}
104+
82105
static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
83106
{
84107
mca_btl_vader_component_t *component = &mca_btl_vader_component;
@@ -173,6 +196,7 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
173196
static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_proc_t *proc, int remote_rank) {
174197
mca_btl_vader_component_t *component = &mca_btl_vader_component;
175198
union vader_modex_t *modex;
199+
ino_t my_user_ns_id;
176200
size_t msg_size;
177201
int rc;
178202

@@ -197,17 +221,58 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_
197221
} else {
198222
#endif
199223
/* store a copy of the segment information for detach */
200-
ep->segment_data.other.seg_ds = malloc (msg_size);
224+
ep->segment_data.other.seg_ds = malloc (modex->other.seg_ds_size);
201225
if (NULL == ep->segment_data.other.seg_ds) {
202226
return OPAL_ERR_OUT_OF_RESOURCE;
203227
}
204228

205-
memcpy (ep->segment_data.other.seg_ds, &modex->seg_ds, msg_size);
229+
memcpy (ep->segment_data.other.seg_ds, &modex->other.seg_ds, modex->other.seg_ds_size);
206230

207231
ep->segment_base = opal_shmem_segment_attach (ep->segment_data.other.seg_ds);
208232
if (NULL == ep->segment_base) {
209233
return OPAL_ERROR;
210234
}
235+
236+
if (MCA_BTL_VADER_CMA == mca_btl_vader_component.single_copy_mechanism) {
237+
my_user_ns_id = mca_btl_vader_get_user_ns_id();
238+
if (my_user_ns_id != modex->other.user_ns_id) {
239+
mca_base_var_source_t source;
240+
int vari;
241+
rc = mca_base_var_find_by_name("btl_vader_single_copy_mechanism", &vari);
242+
if (OPAL_ERROR == rc) {
243+
return OPAL_ERROR;
244+
}
245+
rc = mca_base_var_get_value(vari, NULL, &source, NULL);
246+
if (OPAL_ERROR == rc) {
247+
return OPAL_ERROR;
248+
}
249+
/*
250+
* CMA is not possible as different user namespaces are in use.
251+
* Currently the kernel does not allow process_vm_{read,write}v()
252+
* for processes running in different user namespaces even if
253+
* all involved user IDs are mapped to the same user ID.
254+
*
255+
* Fall back to MCA_BTL_VADER_EMUL.
256+
*/
257+
if (MCA_BASE_VAR_SOURCE_DEFAULT != source) {
258+
/* If CMA has been explicitly selected we want to error out */
259+
opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-error",
260+
true, opal_process_info.nodename);
261+
vader_btl_exit(&mca_btl_vader);
262+
}
263+
/*
264+
* If CMA has been selected because it is the default or
265+
* some fallback, this falls back even further.
266+
*/
267+
opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-warning",
268+
true, opal_process_info.nodename);
269+
mca_btl_vader_component.single_copy_mechanism = MCA_BTL_VADER_EMUL;
270+
mca_btl_vader.super.btl_get = mca_btl_vader_get_sc_emu;
271+
mca_btl_vader.super.btl_put = mca_btl_vader_put_sc_emu;
272+
mca_btl_vader.super.btl_put_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
273+
mca_btl_vader.super.btl_get_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
274+
}
275+
}
211276
#if OPAL_BTL_VADER_HAVE_XPMEM
212277
}
213278
#endif

opal/mca/btl/vader/help-btl-vader.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,25 @@ WARNING: Linux kernel CMA support was requested via the
121121
btl_vader_single_copy_mechanism MCA variable, but CMA support is
122122
not available due to restrictive ptrace settings.
123123

124+
The vader shared memory BTL will fall back on another single-copy
125+
mechanism if one is available. This may result in lower performance.
126+
127+
Local host: %s
128+
#
129+
[cma-different-user-namespace-error]
130+
ERROR: Linux kernel CMA support was requested via the
131+
btl_vader_single_copy_mechanism MCA variable, but CMA support is
132+
not available due to different user namespaces.
133+
134+
Your MPI job will abort now. Please select another value for
135+
btl_vader_single_copy_mechanism.
136+
137+
Local host: %s
138+
#
139+
[cma-different-user-namespace-warning]
140+
WARNING: The default btl_vader_single_copy_mechanism CMA is
141+
not available due to different user namespaces.
142+
124143
The vader shared memory BTL will fall back on another single-copy
125144
mechanism if one is available. This may result in lower performance.
126145

0 commit comments

Comments
 (0)