Skip to content

Commit a6f6113

Browse files
author
Ralph Castain
authored
Merge pull request #3588 from rhc54/topic/server
Fix ompi-server operations
2 parents c7e6294 + 8c2a064 commit a6f6113

File tree

5 files changed

+150
-125
lines changed

5 files changed

+150
-125
lines changed

orte/mca/rml/base/rml_base_contact.c

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2016 Intel, Inc. All rights reserved.
12+
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
1313
* $COPYRIGHT$
1414
*
1515
* Additional copyrights may follow
@@ -72,6 +72,7 @@ int orte_rml_base_get_contact_info(orte_jobid_t job, opal_buffer_t *data)
7272
int orte_rml_base_update_contact_info(opal_buffer_t* data)
7373
{
7474
orte_std_cntr_t cnt;
75+
orte_process_name_t peer;
7576
orte_vpid_t num_procs;
7677
char *rml_uri;
7778
int rc;
@@ -89,11 +90,18 @@ int orte_rml_base_update_contact_info(opal_buffer_t* data)
8990
if (NULL != rml_uri) {
9091
/* set the contact info into the hash table */
9192
orte_rml.set_contact_info(rml_uri);
93+
/* only count this proc if it belongs to my own job, so that
94+
* num_procs tracks just my job's procs found in the message */
95+
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &peer, NULL))) {
96+
ORTE_ERROR_LOG(rc);
97+
free(rml_uri);
98+
return rc;
99+
}
100+
if (peer.jobid == ORTE_PROC_MY_NAME->jobid) {
101+
++num_procs;
102+
}
92103
free(rml_uri);
93104
}
94-
95-
/* track how many procs were in the message */
96-
++num_procs;
97105
}
98106
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
99107
ORTE_ERROR_LOG(rc);

orte/orted/help-orted.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,10 @@ This is usually caused by a large job that encounters significant
8080
delays across the cluster when starting the application processes.
8181
Your job may terminate as a result of this problem. You may want to
8282
adjust the MCA parameter pmix_server_max_reqs and try again.
83+
#
84+
[noserver]
85+
A publish/lookup server was provided, but we were unable to connect
86+
to it - please check the connection info and ensure the server
87+
is alive:
88+
89+
Connection: %s

orte/orted/pmix/pmix_server.c

Lines changed: 2 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -296,94 +296,6 @@ int pmix_server_init(void)
296296
}
297297
OPAL_LIST_DESTRUCT(&info);
298298

299-
/* if the universal server wasn't specified, then we use
300-
* our own HNP for that purpose */
301-
if (NULL == orte_pmix_server_globals.server_uri) {
302-
orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP;
303-
} else {
304-
char *server;
305-
opal_buffer_t buf;
306-
if (0 == strncmp(orte_pmix_server_globals.server_uri, "file", strlen("file")) ||
307-
0 == strncmp(orte_pmix_server_globals.server_uri, "FILE", strlen("FILE"))) {
308-
char input[1024], *filename;
309-
FILE *fp;
310-
311-
/* it is a file - get the filename */
312-
filename = strchr(orte_pmix_server_globals.server_uri, ':');
313-
if (NULL == filename) {
314-
/* filename is not correctly formatted */
315-
orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true,
316-
orte_basename, orte_pmix_server_globals.server_uri);
317-
return ORTE_ERR_BAD_PARAM;
318-
}
319-
++filename; /* space past the : */
320-
321-
if (0 >= strlen(filename)) {
322-
/* they forgot to give us the name! */
323-
orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true,
324-
orte_basename, orte_pmix_server_globals.server_uri);
325-
return ORTE_ERR_BAD_PARAM;
326-
}
327-
328-
/* open the file and extract the uri */
329-
fp = fopen(filename, "r");
330-
if (NULL == fp) { /* can't find or read file! */
331-
orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true,
332-
orte_basename, orte_pmix_server_globals.server_uri);
333-
return ORTE_ERR_BAD_PARAM;
334-
}
335-
if (NULL == fgets(input, 1024, fp)) {
336-
/* something malformed about file */
337-
fclose(fp);
338-
orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true,
339-
orte_basename, orte_pmix_server_globals.server_uri,
340-
orte_basename);
341-
return ORTE_ERR_BAD_PARAM;
342-
}
343-
fclose(fp);
344-
input[strlen(input)-1] = '\0'; /* remove newline */
345-
server = strdup(input);
346-
} else {
347-
server = strdup(orte_pmix_server_globals.server_uri);
348-
}
349-
/* setup our route to the server */
350-
OBJ_CONSTRUCT(&buf, opal_buffer_t);
351-
opal_dss.pack(&buf, &server, 1, OPAL_STRING);
352-
if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) {
353-
ORTE_ERROR_LOG(rc);
354-
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
355-
return rc;
356-
}
357-
OBJ_DESTRUCT(&buf);
358-
/* parse the URI to get the server's name */
359-
if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(server, &orte_pmix_server_globals.server, NULL))) {
360-
ORTE_ERROR_LOG(rc);
361-
return rc;
362-
}
363-
/* check if we are to wait for the server to start - resolves
364-
* a race condition that can occur when the server is run
365-
* as a background job - e.g., in scripts
366-
*/
367-
if (orte_pmix_server_globals.wait_for_server) {
368-
/* ping the server */
369-
struct timeval timeout;
370-
timeout.tv_sec = orte_pmix_server_globals.timeout;
371-
timeout.tv_usec = 0;
372-
if (ORTE_SUCCESS != (rc = orte_rml.ping(orte_mgmt_conduit, server, &timeout))) {
373-
/* try it one more time */
374-
if (ORTE_SUCCESS != (rc = orte_rml.ping(orte_mgmt_conduit, server, &timeout))) {
375-
/* okay give up */
376-
orte_show_help("help-orterun.txt", "orterun:server-not-found", true,
377-
orte_basename, server,
378-
(long)orte_pmix_server_globals.timeout,
379-
ORTE_ERROR_NAME(rc));
380-
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
381-
return rc;
382-
}
383-
}
384-
}
385-
}
386-
387299
return rc;
388300
}
389301

@@ -716,8 +628,9 @@ OBJ_CLASS_INSTANCE(orte_pmix_server_op_caddy_t,
716628
static void rqcon(pmix_server_req_t *p)
717629
{
718630
p->operation = NULL;
719-
p->target = *ORTE_NAME_INVALID;
631+
p->range = OPAL_PMIX_RANGE_SESSION;
720632
p->proxy = *ORTE_NAME_INVALID;
633+
p->target = *ORTE_NAME_INVALID;
721634
p->timeout = orte_pmix_server_globals.timeout;
722635
p->jdata = NULL;
723636
OBJ_CONSTRUCT(&p->msg, opal_buffer_t);

orte/orted/pmix/pmix_server_internal.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,9 @@
6767
int timeout;
6868
int room_num;
6969
int remote_room_num;
70+
opal_pmix_data_range_t range;
7071
orte_process_name_t proxy;
71-
opal_process_name_t target;
72+
orte_process_name_t target;
7273
orte_job_t *jdata;
7374
opal_buffer_t msg;
7475
opal_pmix_op_cbfunc_t opcbfunc;
@@ -255,6 +256,7 @@ typedef struct {
255256
bool wait_for_server;
256257
orte_process_name_t server;
257258
opal_list_t notifications;
259+
bool pubsub_init;
258260
} pmix_server_globals_t;
259261

260262
extern pmix_server_globals_t orte_pmix_server_globals;

0 commit comments

Comments
 (0)