Skip to content

Commit 9cb18b8

Browse files
author
Ralph Castain
authored
Merge pull request #3280 from rhc54/topic/dvm
Fix the DVM by ensuring that all nodes, even those that didn't participate in a job, clean up the resources they assigned to that job once it terminates
2 parents 308d33d + 74863a0 commit 9cb18b8

File tree

5 files changed

+93
-3
lines changed

5 files changed

+93
-3
lines changed

orte/mca/odls/odls_types.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,10 @@ typedef uint8_t orte_daemon_cmd_flag_t;
8989
/* request full topology string */
9090
#define ORTE_DAEMON_REPORT_TOPOLOGY_CMD (orte_daemon_cmd_flag_t) 33
9191

92+
/* tell DVM daemons to cleanup resources from job */
93+
#define ORTE_DAEMON_DVM_CLEANUP_JOB_CMD (orte_daemon_cmd_flag_t) 34
94+
95+
9296
/*
9397
* Struct written up the pipe from the child to the parent.
9498
*/

orte/mca/state/dvm/state_dvm.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ static void check_complete(int fd, short args, void *cbdata)
410410
* we call the errmgr so that any attempt to restart the job will
411411
* avoid doing so in the exact same place as the current job
412412
*/
413-
if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
413+
if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
414414
map = jdata->map;
415415
for (index = 0; index < map->nodes->size; index++) {
416416
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {

orte/orted/orted_comm.c

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
#include "orte/mca/odls/base/base.h"
7070
#include "orte/mca/plm/plm.h"
7171
#include "orte/mca/plm/base/plm_private.h"
72+
#include "orte/mca/rmaps/rmaps_types.h"
7273
#include "orte/mca/routed/routed.h"
7374
#include "orte/mca/ess/ess.h"
7475
#include "orte/mca/state/state.h"
@@ -122,6 +123,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
122123
opal_pstats_t pstat;
123124
char *rtmod;
124125
char *coprocessors;
126+
orte_job_map_t *map;
125127

126128
/* unpack the command */
127129
n = 1;
@@ -557,6 +559,66 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
557559
}
558560
break;
559561

562+
563+
/**** DVM CLEANUP JOB COMMAND ****/
564+
case ORTE_DAEMON_DVM_CLEANUP_JOB_CMD:
565+
/* unpack the jobid */
566+
n = 1;
567+
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
568+
ORTE_ERROR_LOG(ret);
569+
goto CLEANUP;
570+
}
571+
572+
/* look up job data object */
573+
if (NULL == (jdata = orte_get_job_data_object(job))) {
574+
/* we can safely ignore this request as the job
575+
* was already cleaned up */
576+
goto CLEANUP;
577+
}
578+
579+
/* if we have any local children for this job, then we
580+
* can ignore this request as we would have already
581+
* dealt with it */
582+
if (0 < jdata->num_local_procs) {
583+
goto CLEANUP;
584+
}
585+
586+
/* release all resources (even those on other nodes) that we
587+
* assigned to this job */
588+
if (NULL != jdata->map) {
589+
map = (orte_job_map_t*)jdata->map;
590+
for (n = 0; n < map->nodes->size; n++) {
591+
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
592+
continue;
593+
}
594+
for (i = 0; i < node->procs->size; i++) {
595+
if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
596+
continue;
597+
}
598+
if (proct->name.jobid != jdata->jobid) {
599+
/* skip procs from another job */
600+
continue;
601+
}
602+
node->slots_inuse--;
603+
node->num_procs--;
604+
/* set the entry in the node array to NULL */
605+
opal_pointer_array_set_item(node->procs, i, NULL);
606+
/* release the proc once for the map entry */
607+
OBJ_RELEASE(proct);
608+
}
609+
/* set the node location to NULL */
610+
opal_pointer_array_set_item(map->nodes, n, NULL);
611+
/* maintain accounting */
612+
OBJ_RELEASE(node);
613+
/* flag that the node is no longer in a map */
614+
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
615+
}
616+
OBJ_RELEASE(map);
617+
jdata->map = NULL;
618+
}
619+
break;
620+
621+
560622
/**** REPORT TOPOLOGY COMMAND ****/
561623
case ORTE_DAEMON_REPORT_TOPOLOGY_CMD:
562624
answer = OBJ_NEW(opal_buffer_t);
@@ -1337,6 +1399,9 @@ static char *get_orted_comm_cmd_str(int command)
13371399
case ORTE_DAEMON_GET_MEMPROFILE:
13381400
return strdup("ORTE_DAEMON_GET_MEMPROFILE");
13391401

1402+
case ORTE_DAEMON_DVM_CLEANUP_JOB_CMD:
1403+
return strdup("ORTE_DAEMON_DVM_CLEANUP_JOB_CMD");
1404+
13401405
default:
13411406
return strdup("Unknown Command!");
13421407
}

orte/runtime/orte_quit.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ static void dump_aborted_procs(void)
345345
/* find the job that caused the problem */
346346
n = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&job, &nptr);
347347
while (OPAL_SUCCESS == n) {
348-
if (job->jobid == ORTE_PROC_MY_NAME->jobid) {
348+
if (NULL == job || job->jobid == ORTE_PROC_MY_NAME->jobid) {
349349
goto next;
350350
}
351351
if (ORTE_JOB_STATE_UNDEF != job->state &&

orte/tools/orte-dvm/orte-dvm.c

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
1515
* Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
1616
* reserved.
17-
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
17+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1818
* $COPYRIGHT$
1919
*
2020
* Additional copyrights may follow
@@ -75,6 +75,7 @@
7575
#include "opal/class/opal_pointer_array.h"
7676

7777
#include "orte/mca/errmgr/errmgr.h"
78+
#include "orte/mca/grpcomm/grpcomm.h"
7879
#include "orte/mca/odls/odls.h"
7980
#include "orte/mca/rml/rml.h"
8081
#include "orte/mca/rml/base/rml_contact.h"
@@ -519,6 +520,8 @@ static void notify_requestor(int sd, short args, void *cbdata)
519520
orte_proc_t *pptr;
520521
int ret, id, *idptr;
521522
opal_buffer_t *reply;
523+
orte_daemon_cmd_flag_t command;
524+
orte_grpcomm_signature_t *sig;
522525

523526
/* notify the requestor */
524527
reply = OBJ_NEW(opal_buffer_t);
@@ -557,6 +560,24 @@ static void notify_requestor(int sd, short args, void *cbdata)
557560
ORTE_RML_TAG_NOTIFY_COMPLETE,
558561
send_callback, jdata);
559562

563+
/* now ensure that _all_ daemons know that this job has terminated so even
564+
* those that did not participate in it will know to cleanup the resources
565+
* they assigned to the job. This is necessary now that the mapping function
566+
* has been moved to the backend daemons - otherwise, non-participating daemons
567+
* retain the slot assignments on the participating daemons, and then incorrectly
568+
* map subsequent jobs thinking those nodes are still "busy" */
569+
reply = OBJ_NEW(opal_buffer_t);
570+
command = ORTE_DAEMON_DVM_CLEANUP_JOB_CMD;
571+
opal_dss.pack(reply, &command, 1, ORTE_DAEMON_CMD);
572+
opal_dss.pack(reply, &jdata->jobid, 1, ORTE_JOBID);
573+
sig = OBJ_NEW(orte_grpcomm_signature_t);
574+
sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
575+
sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
576+
sig->signature[0].vpid = ORTE_VPID_WILDCARD;
577+
orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, reply);
578+
OBJ_RELEASE(reply);
579+
OBJ_RELEASE(sig);
580+
560581
/* we cannot cleanup the job object as we might
561582
* hit an error during transmission, so clean it
562583
* up in the send callback */

0 commit comments

Comments
 (0)