
Commit 10b103a

Author: Ralph Castain

Merge pull request #3524 from rhc54/topic/nodis

Update the distributed mapping system to maintain coherence

2 parents: 7adce62 + 657e701


45 files changed: +1886 additions, -5177 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -415,6 +415,7 @@ orte/test/mpi/memcached-dummy
 orte/test/mpi/coll_test
 orte/test/mpi/badcoll
 orte/test/mpi/iof
+orte/test/mpi/no-disconnect

 orte/test/system/radix
 orte/test/system/sigusr_trap

opal/mca/pmix/base/pmix_base_fns.c

Lines changed: 30 additions & 6 deletions
@@ -2,7 +2,7 @@
 /*
  * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
  *                         reserved.
- * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
+ * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
  * Copyright (c) 2014-2017 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2016      Mellanox Technologies, Inc.
@@ -118,6 +118,12 @@ static void lookup_cbfunc(int status, opal_list_t *data, void *cbdata)
     cd->active = false;
 }

+static void opcbfunc(int status, void *cbdata)
+{
+    struct lookup_caddy_t *cd = (struct lookup_caddy_t*)cbdata;
+    cd->active = false;
+}
+
 int opal_pmix_base_exchange(opal_value_t *indat,
                             opal_pmix_pdata_t *outdat,
                             int timeout)
@@ -141,11 +147,29 @@ int opal_pmix_base_exchange(opal_value_t *indat,
     opal_list_append(&ilist, &info->super);

     /* publish it with "session" scope */
-    rc = opal_pmix.publish(&ilist);
-    OPAL_LIST_DESTRUCT(&ilist);
-    if (OPAL_SUCCESS != rc) {
-        OPAL_ERROR_LOG(rc);
-        return rc;
+    if (NULL == opal_pmix.publish_nb) {
+        rc = opal_pmix.publish(&ilist);
+        OPAL_LIST_DESTRUCT(&ilist);
+        if (OPAL_SUCCESS != rc) {
+            OPAL_ERROR_LOG(rc);
+            return rc;
+        }
+    } else {
+        caddy.active = true;
+        rc = opal_pmix.publish_nb(&ilist, opcbfunc, &caddy);
+        if (OPAL_SUCCESS != rc) {
+            OPAL_ERROR_LOG(rc);
+            OPAL_LIST_DESTRUCT(&ilist);
+            return rc;
+        }
+        while (caddy.active) {
+            usleep(10);
+        }
+        OPAL_LIST_DESTRUCT(&ilist);
+        if (OPAL_SUCCESS != caddy.status) {
+            OPAL_ERROR_LOG(caddy.status);
+            return caddy.status;
+        }
     }

     /* lookup the other side's info - if a non-blocking form
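
The change above gives opal_pmix_base_exchange a blocking shim over the non-blocking publish: it arms an "active" flag on a stack caddy, fires publish_nb with opcbfunc as the completion callback, then polls with short sleeps until the callback clears the flag. Below is a minimal, self-contained sketch of that callback-and-poll pattern; the caddy_t type, do_publish_nb(), and the pthread plumbing are illustrative stand-ins for the OPAL/PMIx machinery, not the real API.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

typedef struct {
    volatile int active;   /* set by the caller, cleared by the callback */
    int status;            /* completion status reported by the callback */
} caddy_t;

/* completion callback - same role as opcbfunc() in the patch */
static void opcbfunc(int status, void *cbdata)
{
    caddy_t *cd = (caddy_t *)cbdata;
    cd->status = status;
    cd->active = 0;
}

/* worker thread standing in for the asynchronous publish */
static void *worker(void *arg)
{
    usleep(1000);            /* pretend the publish takes a while */
    opcbfunc(0, arg);        /* report success to the caddy */
    return NULL;
}

/* hypothetical non-blocking publish: start the work, return at once */
static int do_publish_nb(void *cbdata)
{
    pthread_t tid;
    int rc = pthread_create(&tid, NULL, worker, cbdata);
    if (0 == rc) {
        pthread_detach(tid); /* no join; completion is signaled via the caddy */
    }
    return rc;
}

int main(void)
{
    caddy_t caddy;

    caddy.active = 1;        /* arm the flag before starting the op */
    if (0 != do_publish_nb(&caddy)) {
        fprintf(stderr, "publish_nb failed to start\n");
        return 1;
    }
    while (caddy.active) {   /* poll until the callback fires */
        usleep(10);
    }
    printf("publish completed with status %d\n", caddy.status);
    return 0;
}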

orte/mca/odls/base/odls_base_default_fns.c

Lines changed: 101 additions & 102 deletions
@@ -131,7 +131,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer,
     /* if we couldn't provide the allocation regex on the orted
      * cmd line, then we need to provide all the info here */
     if (!orte_nidmap_communicated) {
-        if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(&nidmap))) {
+        if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, &nidmap))) {
             ORTE_ERROR_LOG(rc);
             return rc;
         }
@@ -246,6 +246,22 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer,
         return rc;
     }

+    if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
+        /* compute and pack the ppn regex */
+        if (ORTE_SUCCESS != (rc = orte_util_nidmap_generate_ppn(jdata, &nidmap))) {
+            ORTE_ERROR_LOG(rc);
+            return rc;
+        }
+        if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &nidmap, 1, OPAL_STRING))) {
+            ORTE_ERROR_LOG(rc);
+            free(nidmap);
+            return rc;
+        }
+        free(nidmap);
+    }
+
+    /* compute and pack the regex of ppn */
+
     return ORTE_SUCCESS;
 }

@@ -262,13 +278,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
     int rc;
     orte_std_cntr_t cnt;
     orte_job_t *jdata=NULL, *daemons;
-    int32_t n, k, m;
+    int32_t n, k;
     opal_buffer_t *bptr;
-    orte_node_t *node;
     orte_proc_t *pptr, *dmn;
     orte_app_context_t *app;
-    bool newmap = false;
     int8_t flag;
+    char *ppn;

     OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
                          "%s odls:constructing child list",
@@ -356,33 +371,73 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
          * the storage */
         jdata->jobid = ORTE_JOBID_INVALID;
         OBJ_RELEASE(jdata);
-        /* get the correct job object */
+        /* get the correct job object - it will be completely filled out */
         if (NULL == (jdata = orte_get_job_data_object(*job))) {
             ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
             rc = ORTE_ERR_NOT_FOUND;
             goto REPORT_ERROR;
         }
     } else {
         opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
-    }

-    /* ensure the map object is present */
-    if (NULL == jdata->map) {
-        jdata->map = OBJ_NEW(orte_job_map_t);
-        newmap = true;
+        /* ensure the map object is present */
+        if (NULL == jdata->map) {
+            jdata->map = OBJ_NEW(orte_job_map_t);
+        }
     }

-    if (orte_no_vm) {
-        /* if we are operating novm, then mpirun will have sent us
-         * the complete array of procs - process it */
-        for (n=0; n < jdata->procs->size; n++) {
-            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) {
-                continue;
+    /* if the job is fully described, then mpirun will have computed
+     * and sent us the complete array of procs in the orte_job_t, so we
+     * don't need to do anything more here */
+    if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
+        if (!ORTE_PROC_IS_HNP) {
+            /* extract the ppn regex */
+            cnt = 1;
+            if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &ppn, &cnt, OPAL_STRING))) {
+                ORTE_ERROR_LOG(rc);
+                goto REPORT_ERROR;
             }
-            if (ORTE_PROC_STATE_UNDEF == pptr->state) {
-                /* not ready for use yet */
-                continue;
+            /* populate the node array of the job map and the proc array of
+             * the job object so we know how many procs are on each node */
+            if (ORTE_SUCCESS != (rc = orte_util_nidmap_parse_ppn(jdata, ppn))) {
+                ORTE_ERROR_LOG(rc);
+                free(ppn);
+                goto REPORT_ERROR;
+            }
+            free(ppn);
+            /* now assign locations to the procs */
+            if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) {
+                ORTE_ERROR_LOG(rc);
+                goto REPORT_ERROR;
             }
+        }
+        /* compute the ranks and add the proc objects
+         * to the jdata->procs array */
+        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
+            ORTE_ERROR_LOG(rc);
+            goto REPORT_ERROR;
+        }
+        /* and finally, compute the local and node ranks */
+        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
+            ORTE_ERROR_LOG(rc);
+            goto REPORT_ERROR;
+        }
+    }
+
+    /* now that the node array in the job map and jdata are completely filled out,
+     * we need to "wireup" the procs to their nodes so other utilities can
+     * locate them */
+    for (n=0; n < jdata->procs->size; n++) {
+        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) {
+            continue;
+        }
+        if (ORTE_PROC_STATE_UNDEF == pptr->state) {
+            /* not ready for use yet */
+            continue;
+        }
+        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
+            /* the parser will have already made the connection, but the fully described
+             * case won't have done it, so connect the proc to its node here */
             opal_output_verbose(5, orte_odls_base_framework.framework_output,
                                 "%s GETTING DAEMON FOR PROC %s WITH PARENT %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -401,100 +456,44 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
             }
             OBJ_RETAIN(dmn->node);
             pptr->node = dmn->node;
-            /* add proc to node - note that num_procs for the
-             * node was already correctly unpacked, so don't
-             * increment it here */
-            OBJ_RETAIN(pptr);
-            opal_pointer_array_add(dmn->node->procs, pptr);
-
-            /* add the node to the map, if not already there */
-            if (!ORTE_FLAG_TEST(dmn->node, ORTE_NODE_FLAG_MAPPED)) {
-                OBJ_RETAIN(dmn->node);
-                ORTE_FLAG_SET(dmn->node, ORTE_NODE_FLAG_MAPPED);
-                opal_pointer_array_add(jdata->map->nodes, dmn->node);
-                if (newmap) {
-                    jdata->map->num_nodes++;
-                }
-            }
-
-            /* see if it belongs to us */
-            if (pptr->parent == ORTE_PROC_MY_NAME->vpid) {
-                /* is this child on our current list of children */
-                if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
-                    /* not on the local list */
-                    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
-                                         "%s[%s:%d] adding proc %s to my local list",
-                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                         __FILE__, __LINE__,
-                                         ORTE_NAME_PRINT(&pptr->name)));
-                    /* keep tabs of the number of local procs */
-                    jdata->num_local_procs++;
-                    /* add this proc to our child list */
-                    OBJ_RETAIN(pptr);
-                    ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL);
-                    opal_pointer_array_add(orte_local_children, pptr);
-                }
-
-                /* if the job is in restart mode, the child must not barrier when launched */
-                if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
-                    orte_set_attribute(&pptr->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
-                }
-                /* mark that this app_context is being used on this node */
-                app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
-                ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE);
-            }
-        }
-    } else {
-        /* create the map - will already have been done for the novm case */
-        if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_job(jdata))) {
-            ORTE_ERROR_LOG(rc);
-            goto REPORT_ERROR;
         }
-        /* find our local procs */
-        for (n=0; n < jdata->map->nodes->size; n++) {
-            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) {
-                continue;
-            }
-            if (node->index != (int)ORTE_PROC_MY_NAME->vpid) {
-                continue;
+        /* see if it belongs to us */
+        if (pptr->parent == ORTE_PROC_MY_NAME->vpid) {
+            /* is this child on our current list of children */
+            if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
+                /* not on the local list */
+                OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
+                                     "%s[%s:%d] adding proc %s to my local list",
+                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                     __FILE__, __LINE__,
+                                     ORTE_NAME_PRINT(&pptr->name)));
+                /* keep tabs of the number of local procs */
+                jdata->num_local_procs++;
+                /* add this proc to our child list */
+                OBJ_RETAIN(pptr);
+                ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL);
+                opal_pointer_array_add(orte_local_children, pptr);
             }
-            for (m=0; m < node->procs->size; m++) {
-                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, m))) {
-                    continue;
-                }
-                if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
-                    /* not on the local list */
-                    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
-                                         "%s[%s:%d] adding proc %s to my local list",
-                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                         __FILE__, __LINE__,
-                                         ORTE_NAME_PRINT(&pptr->name)));
-                    /* keep tabs of the number of local procs */
-                    jdata->num_local_procs++;
-                    /* add this proc to our child list */
-                    OBJ_RETAIN(pptr);
-                    ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL);
-                    opal_pointer_array_add(orte_local_children, pptr);
-                    /* mark that this app_context is being used on this node */
-                    app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
-                    ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE);
-                }
+
+            /* if the job is in restart mode, the child must not barrier when launched */
+            if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
+                orte_set_attribute(&pptr->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
             }
+            /* mark that this app_context is being used on this node */
+            app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
+            ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE);
         }
+    }
+
+    if (!ORTE_PROC_IS_HNP &&
+        !orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
         /* compute and save bindings of local children */
         if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
             ORTE_ERROR_LOG(rc);
             goto REPORT_ERROR;
         }
     }

-    /* reset any node map flags we used so the next job will start clean */
-    for (n=0; n < jdata->map->nodes->size; n++) {
-        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) {
-            ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
-        }
-    }
-
     /* if we wanted to see the map, now is the time to display it */
     if (jdata->map->display_map) {
         orte_rmaps_base_display_map(jdata);
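
The odls changes above split proc layout into a pack/unpack pair: mpirun generates a ppn (procs-per-node) regex with orte_util_nidmap_generate_ppn() and packs it into the launch buffer, and each non-HNP daemon unpacks it and rebuilds its piece of the map with orte_util_nidmap_parse_ppn(). Here is a toy sketch of such a roundtrip, assuming a plain comma-separated count-per-node encoding rather than ORTE's actual compressed regex format; generate_ppn()/parse_ppn() below are illustrative stand-ins, not the ORTE functions.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* encode: {2,2,1} -> "2,2,1" (caller frees the result) */
static char *generate_ppn(const int *ppn, int nnodes)
{
    char *buf = malloc((size_t)nnodes * 12 + 1);
    char *p = buf;
    for (int i = 0; i < nnodes; i++) {
        p += sprintf(p, "%s%d", (0 == i) ? "" : ",", ppn[i]);
    }
    return buf;
}

/* decode: "2,2,1" -> array of counts (caller frees the result) */
static int *parse_ppn(const char *regex, int *nnodes)
{
    char *copy = strdup(regex), *tok, *save;
    int *ppn = NULL, n = 0;
    for (tok = strtok_r(copy, ",", &save); NULL != tok;
         tok = strtok_r(NULL, ",", &save)) {
        ppn = realloc(ppn, (size_t)(n + 1) * sizeof(int));
        ppn[n++] = atoi(tok);
    }
    free(copy);
    *nnodes = n;
    return ppn;
}

int main(void)
{
    int in[] = {2, 2, 1}, nnodes;
    char *regex = generate_ppn(in, 3);      /* mpirun side: pack */
    int *out = parse_ppn(regex, &nnodes);   /* daemon side: unpack */
    for (int i = 0; i < nnodes; i++) {
        printf("node %d hosts %d procs\n", i, out[i]);
    }
    free(regex);
    free(out);
    return 0;
}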

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 2 additions & 2 deletions
@@ -209,7 +209,7 @@ static void files_ready(int status, void *cbdata)
     if (ORTE_SUCCESS != status) {
         ORTE_FORCED_TERMINATE(status);
     } else {
-        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SYSTEM_PREP);
+        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
     }
 }

@@ -1497,7 +1497,7 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv,

     /* convert the nodes with daemons to a regex */
     param = NULL;
-    if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(&param))) {
+    if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, &param))) {
         ORTE_ERROR_LOG(rc);
         return rc;
     }

orte/mca/rmaps/base/Makefile.am

Lines changed: 3 additions & 2 deletions
@@ -12,7 +12,7 @@
 # Copyright (c) 2009      Cisco Systems, Inc. All rights reserved.
 # Copyright (c) 2011      Los Alamos National Security, LLC.
 #                         All rights reserved.
-# Copyright (c) 2015      Intel, Inc. All rights reserved.
+# Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -31,7 +31,8 @@ libmca_rmaps_la_SOURCES += \
         base/rmaps_base_support_fns.c \
         base/rmaps_base_ranking.c \
         base/rmaps_base_print_fns.c \
-        base/rmaps_base_binding.c
+        base/rmaps_base_binding.c \
+        base/rmaps_base_assign_locations.c


 dist_ortedata_DATA = base/help-orte-rmaps-base.txt

orte/mca/rmaps/base/base.h

Lines changed: 2 additions & 1 deletion
@@ -99,7 +99,8 @@ OBJ_CLASS_DECLARATION(orte_rmaps_base_selected_module_t);
 /*
  * Map a job
  */
-ORTE_DECLSPEC int orte_rmaps_base_map_job(orte_job_t *jdata);
+ORTE_DECLSPEC void orte_rmaps_base_map_job(int sd, short args, void *cbdata);
+ORTE_DECLSPEC int orte_rmaps_base_assign_locations(orte_job_t *jdata);

 /**
  * Utility routines to get/set vpid mapping for the job
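
Note the signature change: orte_rmaps_base_map_job() is no longer a directly-called int-returning function but an (int sd, short args, void *cbdata) handler, the shape ORTE's event-driven state machine expects, presumably why files_ready() above now activates ORTE_JOB_STATE_MAP rather than invoking the mapper inline. Below is a rough sketch of that dispatch shape; job_t, state_caddy_t, and activate_job_state() are illustrative stand-ins for ORTE's real state machinery, and a real event loop would defer the callback instead of calling it synchronously.

#include <stdio.h>
#include <stdlib.h>

typedef struct { const char *name; } job_t;
typedef struct { job_t *jdata; } state_caddy_t;
typedef void (*state_cbfunc_t)(int sd, short args, void *cbdata);

/* the mapper as a state handler - mirrors the new map_job signature */
static void map_job(int sd, short args, void *cbdata)
{
    state_caddy_t *caddy = (state_caddy_t*)cbdata;
    (void)sd; (void)args;    /* unused in this sketch */
    printf("mapping job %s\n", caddy->jdata->name);
    free(caddy);             /* the handler owns and releases the caddy */
}

/* stand-in for a macro like ORTE_ACTIVATE_JOB_STATE: package the job
 * in a caddy and "post" the event to the registered handler */
static void activate_job_state(job_t *jdata, state_cbfunc_t cbfunc)
{
    state_caddy_t *caddy = malloc(sizeof(*caddy));
    caddy->jdata = jdata;
    cbfunc(-1, 0, caddy);    /* a real event loop would defer this */
}

int main(void)
{
    job_t job = { "job-1" };
    activate_job_state(&job, map_job);
    return 0;
}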

orte/mca/rmaps/base/help-orte-rmaps-base.txt

Lines changed: 11 additions & 1 deletion
@@ -13,7 +13,7 @@
 # Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
 # Copyright (c) 2011      Los Alamos National Security, LLC.
 #                         All rights reserved.
-# Copyright (c) 2014      Intel, Inc. All rights reserved.
+# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -410,3 +410,13 @@ Either the -host or -hostfile options were given, but the number
 of processes to start was omitted. This combination is not supported.

 Please specify the number of processes to run and try again.
+#
+[failed-assignments]
+The attempt to assign hardware locations to processes on a
+compute node failed:
+
+  Node:   %s
+  Policy: %s
+
+We cannot continue - please check that the policy is in
+accordance with the actual available hardware.
