Skip to content

Commit be8424b

Browse files
author
Ralph Castain
committed
Provide backward compatible keys so that the non-PMIx components in the opal/pmix framework don't have to adjust as we continue to work on finalizing the PMIx reference scheme. Activate and utilize the new PMIx show_help capability to provide more meaningful error output when the server cannot start.
Add a contrib script to clean up file permissions that were incorrectly modified by things like SMB mounts.
1 parent d12e50b commit be8424b

File tree

7 files changed

+119
-45
lines changed

7 files changed

+119
-45
lines changed

contrib/cleanperms

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!/usr/bin/bash
2+
3+
find . -type f -name "*.c" -perm /u+x -print -exec chmod -x {} \;
4+
find . -type f -name Makefile.am -perm /u+x -print -exec chmod -x {} \;
5+
find . -type f -name "*.h" -perm /u+x -print -exec chmod -x {} \;
6+
find . -type f -name Makefile.include -perm /u+x -print -exec chmod -x {} \;
7+
find . -type f -name Makefile -perm /u+x -print -exec chmod -x {} \;
8+
find . -type f -name "*.m4" -perm /u+x -print -exec chmod -x {} \;
9+
find . -type f -name "*.ac" -perm /u+x -print -exec chmod -x {} \;
10+
find . -type f -name "*.txt" -perm /u+x -print -exec chmod -x {} \;
11+
find . -type f -name "*.l" -perm /u+x -print -exec chmod -x {} \;

opal/mca/pmix/pmix2x/pmix/src/server/Makefile.include

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
# $HEADER$
1212
#
1313

14+
dist_pmixdata_DATA += server/help-pmix-server.txt
15+
1416
headers += \
1517
server/pmix_server_ops.h
1618

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# -*- text -*-
2+
#
3+
# Copyright (c) 2016 Intel, Inc. All rights reserved.
4+
# $COPYRIGHT$
5+
#
6+
# Additional copyrights may follow
7+
#
8+
# $HEADER$
9+
#
10+
#
11+
[rnd-path-too-long]
12+
The PMIx server was unable to set up a rendezvous file due to your
13+
system's limit on the path length of Unix domain sockets.
14+
15+
Temporary directory: %s
16+
Rendezvous filename: %s
17+
18+
Please try to set TMPDIR to something short (like /tmp) or change
19+
your computer's name to something shorter (see uname -n).
20+
[listener-failed-start]
21+
The PMIx server was unable to start its listening thread. This is
22+
usually due to a conflicting stale socket file from a prior failed
23+
job, thus preventing the server from binding to its assigned socket.
24+
25+
Rendezvous filename: %s
26+
27+
Please remove the stale file and try again.
28+
[data-store-failed]
29+
The PMIx server was unable to store the specified key-value:
30+
31+
Key: %s
32+
33+
The precise reason for the failure was provided in the above
34+
"error-log" message. This is probably something that should
35+
be referred to the PMIx developers.

opal/mca/pmix/pmix2x/pmix/src/server/pmix_server.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@
5252
#include "src/util/error.h"
5353
#include "src/util/output.h"
5454
#include "src/util/pmix_environ.h"
55+
#include "src/util/show_help.h"
56+
#include "src/mca/base/base.h"
57+
#include "src/mca/base/pmix_mca_base_var.h"
58+
#include "src/mca/pinstalldirs/base/base.h"
5559
#include "src/runtime/pmix_progress_threads.h"
5660
#include "src/usock/usock.h"
5761
#include "src/sec/pmix_sec.h"
@@ -121,11 +125,25 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module)
121125
char *tdir, *evar;
122126
char * pmix_pid;
123127
pmix_listener_t *listener;
128+
pmix_status_t ret;
124129

125130
/* initialize the output system */
126131
if (!pmix_output_init()) {
132+
fprintf(stderr, "PMIx server was unable to initialize its output system\n");
127133
return PMIX_ERR_INIT;
128134
}
135+
/* initialize install dirs code */
136+
if (PMIX_SUCCESS != (ret = pmix_mca_base_framework_open(&pmix_pinstalldirs_base_framework, 0))) {
137+
fprintf(stderr, "pmix_pinstalldirs_base_open() failed -- process will likely abort (%s:%d, returned %d instead of PMIX_SUCCESS)\n",
138+
__FILE__, __LINE__, ret);
139+
return ret;
140+
}
141+
142+
if (PMIX_SUCCESS != pmix_show_help_init()) {
143+
fprintf(stderr, "PMIx server was unable to initialize its show_help system\n");
144+
return PMIX_ERR_INIT;
145+
}
146+
129147
/* setup the globals */
130148
pmix_globals_init();
131149
memset(&pmix_server_globals, 0, sizeof(pmix_server_globals));
@@ -198,7 +216,9 @@ static pmix_status_t initialize_server_base(pmix_server_module_t *module)
198216
if (0 > asprintf(&pmix_pid, "%s/pmix-%d", tdir, mypid)) {
199217
return PMIX_ERR_NOMEM;
200218
}
219+
201220
if ((strlen(pmix_pid) + 1) > sizeof(listener->address.sun_path)-1) {
221+
pmix_show_help("help-pmix-server.txt", "rnd-path-too-long", true, tdir, pmix_pid);
202222
free(pmix_pid);
203223
return PMIX_ERR_INVALID_LENGTH;
204224
}
@@ -352,6 +372,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
352372
return PMIX_ERR_NOMEM;
353373
}
354374
if ((strlen(pmix_pid) + 1) > sizeof(tl->address.sun_path)-1) {
375+
pmix_show_help("help-pmix-server.txt", "rnd-path-too-long", true, tdir, pmix_pid);
355376
free(pmix_pid);
356377
return PMIX_ERR_INVALID_LENGTH;
357378
}
@@ -380,6 +401,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
380401
return PMIX_ERR_NOMEM;
381402
}
382403
if ((strlen(pmix_pid) + 1) > sizeof(tl->address.sun_path)-1) {
404+
pmix_show_help("help-pmix-server.txt", "rnd-path-too-long", true, tdir, pmix_pid);
383405
free(pmix_pid);
384406
return PMIX_ERR_INVALID_LENGTH;
385407
}
@@ -413,6 +435,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
413435
}
414436
if (need_listener) {
415437
if (PMIX_SUCCESS != pmix_start_listening()) {
438+
pmix_show_help("help-pmix-server.txt", "listener-failed-start", true, tl->address.sun_path);
416439
PMIx_server_finalize();
417440
return PMIX_ERR_INIT;
418441
}
@@ -441,6 +464,7 @@ PMIX_EXPORT pmix_status_t PMIx_server_init(pmix_server_module_t *module,
441464
kv.value = &info[n].value;
442465
if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(&pmix_server_globals.gdata, &kv, 1, PMIX_KVAL))) {
443466
PMIX_ERROR_LOG(rc);
467+
pmix_show_help("help-pmix-server.txt", "data-store-failed", true, kv.key);
444468
/* protect the incoming data */
445469
kv.key = NULL;
446470
kv.value = NULL;

orte/mca/ess/pmi/ess_pmi_module.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -314,12 +314,13 @@ static int rte_init(void)
314314
}
315315
/* retrieve the local peers */
316316
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
317-
ORTE_PROC_MY_NAME, &val, OPAL_STRING);
317+
&wildcard_rank, &val, OPAL_STRING);
318318
if (OPAL_SUCCESS == ret && NULL != val) {
319319
peers = opal_argv_split(val, ',');
320320
free(val);
321321
/* and their cpusets, if available */
322-
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS, ORTE_PROC_MY_NAME, &val, OPAL_STRING);
322+
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS,
323+
&wildcard_rank, &val, OPAL_STRING);
323324
if (OPAL_SUCCESS == ret && NULL != val) {
324325
cpusets = opal_argv_split(val, ':');
325326
free(val);

orte/orted/pmix/pmix_server.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -272,10 +272,7 @@ int pmix_server_init(void)
272272

273273
/* setup the local server */
274274
if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) {
275-
ORTE_ERROR_LOG(rc);
276-
/* memory cleanup will occur when finalize is called */
277-
orte_show_help("help-orterun.txt", "orterun:pmix-failed", true,
278-
orte_process_info.proc_session_dir);
275+
/* pmix will provide a nice show_help output here */
279276
return rc;
280277
}
281278
OPAL_LIST_DESTRUCT(&info);

orte/orted/pmix/pmix_server_register_fns.c

Lines changed: 43 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
5454
{
5555
int rc;
5656
orte_proc_t *pptr;
57-
int i, k, n, nlocalprocs;
57+
int i, k, n;
5858
opal_list_t *info, *pmap;
5959
opal_value_t *kv;
60-
orte_node_t *node, *n2;
60+
orte_node_t *node, *mynode;
6161
opal_vpid_t vpid;
6262
char **list, **procs, **micro, *tmp, *regex, *cpulist, *peerlist;
6363
orte_job_t *dmns;
@@ -164,8 +164,8 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
164164
OPAL_LIST_RELEASE(info);
165165
return ORTE_ERR_NOT_FOUND;
166166
}
167-
node = pptr->node;
168-
if (NULL == node) {
167+
mynode = pptr->node;
168+
if (NULL == mynode) {
169169
/* cannot happen */
170170
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
171171
OPAL_LIST_RELEASE(info);
@@ -175,14 +175,14 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
175175
kv = OBJ_NEW(opal_value_t);
176176
kv->key = strdup(OPAL_PMIX_NODEID);
177177
kv->type = OPAL_UINT32;
178-
kv->data.uint32 = node->index;
178+
kv->data.uint32 = mynode->index;
179179
opal_list_append(info, &kv->super);
180180

181181
/* pass our node size */
182182
kv = OBJ_NEW(opal_value_t);
183183
kv->key = strdup(OPAL_PMIX_NODE_SIZE);
184184
kv->type = OPAL_UINT32;
185-
kv->data.uint32 = node->num_procs;
185+
kv->data.uint32 = mynode->num_procs;
186186
opal_list_append(info, &kv->super);
187187

188188
/* univ size */
@@ -220,43 +220,29 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
220220
kv->data.uint32 = jdata->total_slots_alloc;
221221
opal_list_append(info, &kv->super);
222222

223-
/* identify our local node object within the map,
224-
* if we were included */
225-
node = NULL;
226-
map = (orte_job_map_t*)jdata->map;
227-
for (i=0; i < map->nodes->size; i++) {
228-
if (NULL == (n2 = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
223+
/* register any local clients */
224+
vpid = ORTE_VPID_MAX;
225+
for (i=0; i < mynode->procs->size; i++) {
226+
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(mynode->procs, i))) {
229227
continue;
230228
}
231-
if (n2 == pptr->node) {
232-
node = n2;
233-
break;
234-
}
235-
}
236-
if (NULL != node) {
237-
vpid = ORTE_VPID_MAX;
238-
for (i=0; i < node->procs->size; i++) {
239-
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
240-
continue;
229+
if (pptr->name.jobid == jdata->jobid) {
230+
if (pptr->name.vpid < vpid) {
231+
vpid = pptr->name.vpid;
241232
}
242-
if (pptr->name.jobid == jdata->jobid) {
243-
if (pptr->name.vpid < vpid) {
244-
vpid = pptr->name.vpid;
245-
}
246-
/* go ahead and register this client */
247-
if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
248-
(void*)pptr, NULL, NULL))) {
249-
ORTE_ERROR_LOG(rc);
250-
}
233+
/* go ahead and register this client */
234+
if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
235+
(void*)pptr, NULL, NULL))) {
236+
ORTE_ERROR_LOG(rc);
251237
}
252238
}
253-
/* pass the local ldr */
254-
kv = OBJ_NEW(opal_value_t);
255-
kv->key = strdup(OPAL_PMIX_LOCALLDR);
256-
kv->type = OPAL_VPID;
257-
kv->data.name.vpid = vpid;
258-
opal_list_append(info, &kv->super);
259239
}
240+
/* pass the local ldr */
241+
kv = OBJ_NEW(opal_value_t);
242+
kv->key = strdup(OPAL_PMIX_LOCALLDR);
243+
kv->type = OPAL_VPID;
244+
kv->data.name.vpid = vpid;
245+
opal_list_append(info, &kv->super);
260246

261247
/* for each proc in this job, create an object that
262248
* includes the info describing the proc so the recipient has a complete
@@ -276,13 +262,11 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
276262
cpulist = NULL;
277263
peerlist = NULL;
278264
vpid = ORTE_VPID_MAX;
279-
nlocalprocs = 0;
280265
for (i=0; i < node->procs->size; i++) {
281266
if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
282267
continue;
283268
}
284269
if (pptr->name.jobid == jdata->jobid) {
285-
++nlocalprocs;
286270
opal_argv_append_nosize(&list, ORTE_VPID_PRINT(pptr->name.vpid));
287271
if (pptr->name.vpid < vpid) {
288272
vpid = pptr->name.vpid;
@@ -315,6 +299,26 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
315299
procs = NULL;
316300
}
317301

302+
/* if this is me, then pass the peers and cpusets to myself
303+
* in order to maintain backward compatibility for the non-pmix
304+
* components in OPAL/pmix */
305+
if (node == mynode) {
306+
/* pass the list of peers */
307+
kv = OBJ_NEW(opal_value_t);
308+
kv->key = strdup(OPAL_PMIX_LOCAL_PEERS);
309+
kv->type = OPAL_STRING;
310+
kv->data.string = strdup(peerlist);
311+
opal_list_append(info, &kv->super);
312+
313+
/* pass the list of cpusets */
314+
kv = OBJ_NEW(opal_value_t);
315+
kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS);
316+
kv->type = OPAL_STRING;
317+
kv->data.string = strdup(cpulist);
318+
opal_list_append(info, &kv->super);
319+
320+
}
321+
318322
/* now cycle across each proc on this node, passing all data that
319323
* varies by proc */
320324
for (i=0; i < node->procs->size; i++) {

0 commit comments

Comments
 (0)