Skip to content

Commit 79fd359

Browse files
author
Ralph Castain
authored
Merge pull request #3713 from rhc54/topic/ofi
Enable use of OFI fabrics for launch and other collective operations.…
2 parents 1272c08 + 9dad3f7 commit 79fd359

File tree

12 files changed

+539
-574
lines changed

12 files changed

+539
-574
lines changed

opal/mca/pmix/pmix2x/pmix2x_server_south.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ int pmix2x_server_init(opal_pmix_server_module_t *module,
112112

113113
/* convert the list to an array of pmix_info_t */
114114
if (NULL != info) {
115-
sz = opal_list_get_size(info);
115+
sz = opal_list_get_size(info) + 2;
116116
PMIX_INFO_CREATE(pinfo, sz);
117117
n = 0;
118118
OPAL_LIST_FOREACH(kv, info, opal_value_t) {
@@ -121,8 +121,8 @@ int pmix2x_server_init(opal_pmix_server_module_t *module,
121121
++n;
122122
}
123123
} else {
124-
sz = 0;
125-
pinfo = NULL;
124+
sz = 2;
125+
PMIX_INFO_CREATE(pinfo, 2);
126126
}
127127

128128
/* insert ourselves into our list of jobids - it will be the
@@ -133,6 +133,9 @@ int pmix2x_server_init(opal_pmix_server_module_t *module,
133133
opal_list_append(&mca_pmix_pmix2x_component.jobids, &job->super);
134134
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
135135

136+
/* add our nspace and rank to the array going down to the PMIx server */
137+
PMIX_INFO_LOAD(&pinfo[sz-2], PMIX_SERVER_NSPACE, job->nspace, PMIX_STRING);
138+
PMIX_INFO_LOAD(&pinfo[sz-1], PMIX_SERVER_RANK, &OPAL_PROC_MY_NAME.vpid, PMIX_PROC_RANK);
136139
if (PMIX_SUCCESS != (rc = PMIx_server_init(&mymodule, pinfo, sz))) {
137140
PMIX_INFO_FREE(pinfo, sz);
138141
return pmix2x_convert_rc(rc);

opal/mca/pmix/pmix_types.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ BEGIN_C_DECLS
6262
#define OPAL_PMIX_CONNECT_SYSTEM_FIRST "pmix.cnct.sys.first" // (bool) Preferentially look for a system-level PMIx server first
6363
#define OPAL_PMIX_REGISTER_NODATA "pmix.reg.nodata" // (bool) Registration is for nspace only, do not copy job data
6464
#define OPAL_PMIX_SERVER_ENABLE_MONITORING "pmix.srv.monitor" // (bool) Enable PMIx internal monitoring by server
65+
#define OPAL_PMIX_SERVER_NSPACE "pmix.srv.nspace" // (char*) Name of the nspace to use for this server
66+
#define OPAL_PMIX_SERVER_RANK "pmix.srv.rank" // (uint32_t) Rank of this server
6567

6668

6769
/* identification attributes */

orte/mca/ess/base/ess_base_std_orted.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,9 @@ int orte_ess_base_orted_setup(void)
357357
}
358358
/* set the event base */
359359
opal_pmix_base_set_evbase(orte_event_base);
360-
/* setup the PMIx server */
360+
/* setup the PMIx server - we need this here in case the
361+
* communications infrastructure wants to register
362+
* information */
361363
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
362364
/* the server code already barked, so let's be quiet */
363365
ret = ORTE_ERR_SILENT;
@@ -398,6 +400,9 @@ int orte_ess_base_orted_setup(void)
398400
goto error;
399401
}
400402

403+
/* it is now safe to start the pmix server */
404+
pmix_server_start();
405+
401406
if (NULL != orte_process_info.my_hnp_uri) {
402407
/* extract the HNP's name so we can update the routing table */
403408
if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
@@ -444,7 +449,7 @@ int orte_ess_base_orted_setup(void)
444449
/* add our contact info to our proc object */
445450
proc->rml_uri = orte_rml.get_contact_info();
446451

447-
/*
452+
/*
448453
* Group communications
449454
*/
450455
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {

orte/mca/ess/hnp/ess_hnp_module.c

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,31 @@ static int rte_init(void)
313313
}
314314
}
315315

316+
/* setup the PMIx framework - ensure it skips all non-PMIx components, but
317+
* do not override anything we were given */
318+
opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray,isolated", false, &environ);
319+
if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
320+
ORTE_ERROR_LOG(ret);
321+
error = "orte_pmix_base_open";
322+
goto error;
323+
}
324+
if (ORTE_SUCCESS != (ret = opal_pmix_base_select())) {
325+
ORTE_ERROR_LOG(ret);
326+
error = "opal_pmix_base_select";
327+
goto error;
328+
}
329+
/* set the event base */
330+
opal_pmix_base_set_evbase(orte_event_base);
331+
/* setup the PMIx server - we need this here in case the
332+
* communications infrastructure wants to register
333+
* information */
334+
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
335+
/* the server code already barked, so let's be quiet */
336+
ret = ORTE_ERR_SILENT;
337+
error = "pmix_server_init";
338+
goto error;
339+
}
340+
316341
/* Setup the communication infrastructure */
317342
/*
318343
* Routed system
@@ -372,6 +397,9 @@ static int rte_init(void)
372397
}
373398
OPAL_LIST_DESTRUCT(&transports);
374399

400+
/* it is now safe to start the pmix server */
401+
pmix_server_start();
402+
375403
/*
376404
* Group communications
377405
*/
@@ -637,30 +665,6 @@ static int rte_init(void)
637665
free(contact_path);
638666
}
639667

640-
/* setup the PMIx framework - ensure it skips all non-PMIx components, but
641-
* do not override anything we were given */
642-
opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray,isolated", false, &environ);
643-
if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
644-
ORTE_ERROR_LOG(ret);
645-
error = "orte_pmix_base_open";
646-
goto error;
647-
}
648-
if (ORTE_SUCCESS != (ret = opal_pmix_base_select())) {
649-
ORTE_ERROR_LOG(ret);
650-
error = "opal_pmix_base_select";
651-
goto error;
652-
}
653-
/* set the event base */
654-
opal_pmix_base_set_evbase(orte_event_base);
655-
656-
/* setup the PMIx server */
657-
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
658-
/* the server code already barked, so let's be quiet */
659-
ret = ORTE_ERR_SILENT;
660-
error = "pmix_server_init";
661-
goto error;
662-
}
663-
664668
/* setup I/O forwarding system - must come after we init routes */
665669
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) {
666670
ORTE_ERROR_LOG(ret);

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include "opal/class/opal_pointer_array.h"
4242
#include "opal/dss/dss.h"
4343
#include "opal/mca/hwloc/hwloc-internal.h"
44+
#include "opal/mca/pmix/pmix.h"
4445

4546
#include "orte/util/dash_host/dash_host.h"
4647
#include "orte/util/session_dir.h"
@@ -1055,6 +1056,8 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
10551056
int i;
10561057
bool found;
10571058
orte_daemon_cmd_flag_t cmd;
1059+
int32_t flag;
1060+
opal_value_t *kv;
10581061

10591062
/* get the daemon job, if necessary */
10601063
if (NULL == jdatorted) {
@@ -1092,6 +1095,26 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
10921095
/* record that this daemon is alive */
10931096
ORTE_FLAG_SET(daemon, ORTE_PROC_FLAG_ALIVE);
10941097

1098+
/* unpack the flag indicating the number of connection blobs
1099+
* in the report */
1100+
idx = 1;
1101+
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &idx, OPAL_INT32))) {
1102+
ORTE_ERROR_LOG(rc);
1103+
orted_failed_launch = true;
1104+
goto CLEANUP;
1105+
}
1106+
for (i=0; i < flag; i++) {
1107+
idx = 1;
1108+
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &kv, &idx, OPAL_VALUE))) {
1109+
ORTE_ERROR_LOG(rc);
1110+
orted_failed_launch = true;
1111+
goto CLEANUP;
1112+
}
1113+
/* store this value via opal_pmix.store_local, keyed by dname, for later distribution */
1114+
opal_pmix.store_local(&dname, kv);
1115+
OBJ_RELEASE(kv);
1116+
}
1117+
10951118
/* unpack the node name */
10961119
idx = 1;
10971120
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &nodename, &idx, OPAL_STRING))) {

orte/mca/rml/ofi/.opal_ignore

Whitespace-only changes.

orte/mca/rml/ofi/.opal_unignore

Lines changed: 0 additions & 2 deletions
This file was deleted.

0 commit comments

Comments
 (0)