
Commit 74863a0

Ralph Castain committed
Fix the DVM by ensuring that all nodes, even those that didn't participate (i.e., didn't have any local children) in a job, clean up all resources associated with that job upon its completion. With the advent of backend distributed mapping, nodes that weren't part of the job would still allocate resources on other nodes - and then start from that point when mapping the next job. This change ensures that all daemons start from the same point each time.
Signed-off-by: Ralph Castain <[email protected]>
1 parent: a605bd4

5 files changed: +93 -3 lines changed

orte/mca/odls/odls_types.h

Lines changed: 4 additions & 0 deletions
@@ -89,6 +89,10 @@ typedef uint8_t orte_daemon_cmd_flag_t;
 /* request full topology string */
 #define ORTE_DAEMON_REPORT_TOPOLOGY_CMD      (orte_daemon_cmd_flag_t) 33
 
+/* tell DVM daemons to cleanup resources from job */
+#define ORTE_DAEMON_DVM_CLEANUP_JOB_CMD      (orte_daemon_cmd_flag_t) 34
+
+
 /*
  * Struct written up the pipe from the child to the parent.
  */
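
For orientation (not part of the diff): daemon commands are one-byte orte_daemon_cmd_flag_t values packed at the head of a buffer with the ORTE_DAEMON_CMD dss type; orte_daemon_recv() unpacks that byte, switches on it, and then unpacks any per-command payload in the same order the sender packed it. A minimal sketch of that round trip, assuming a prepared opal_buffer_t *buffer and an orte_jobid_t jobid in scope, with error checks elided:

    /* sender: the command goes first, then its payload (here, the jobid) */
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_DVM_CLEANUP_JOB_CMD;
    opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD);
    opal_dss.pack(buffer, &jobid, 1, ORTE_JOBID);

    /* receiver: unpack in the identical order */
    int32_t n = 1;
    opal_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD);
    switch (command) {
    case ORTE_DAEMON_DVM_CLEANUP_JOB_CMD:
        n = 1;
        opal_dss.unpack(buffer, &jobid, &n, ORTE_JOBID);
        /* ... release everything recorded for jobid ... */
        break;
    }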

orte/mca/state/dvm/state_dvm.c

Lines changed: 1 addition & 1 deletion
@@ -410,7 +410,7 @@ static void check_complete(int fd, short args, void *cbdata)
      * we call the errmgr so that any attempt to restart the job will
      * avoid doing so in the exact same place as the current job
      */
-    if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
+    if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
         map = jdata->map;
         for (index = 0; index < map->nodes->size; index++) {
             if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
orte/orted/orted_comm.c

Lines changed: 65 additions & 0 deletions
@@ -69,6 +69,7 @@
 #include "orte/mca/odls/base/base.h"
 #include "orte/mca/plm/plm.h"
 #include "orte/mca/plm/base/plm_private.h"
+#include "orte/mca/rmaps/rmaps_types.h"
 #include "orte/mca/routed/routed.h"
 #include "orte/mca/ess/ess.h"
 #include "orte/mca/state/state.h"

@@ -122,6 +123,7 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
     opal_pstats_t pstat;
     char *rtmod;
     char *coprocessors;
+    orte_job_map_t *map;
 
     /* unpack the command */
     n = 1;

@@ -557,6 +559,66 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
         }
         break;
 
+
+        /**** DVM CLEANUP JOB COMMAND ****/
+    case ORTE_DAEMON_DVM_CLEANUP_JOB_CMD:
+        /* unpack the jobid */
+        n = 1;
+        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
+            ORTE_ERROR_LOG(ret);
+            goto CLEANUP;
+        }
+
+        /* look up job data object */
+        if (NULL == (jdata = orte_get_job_data_object(job))) {
+            /* we can safely ignore this request as the job
+             * was already cleaned up */
+            goto CLEANUP;
+        }
+
+        /* if we have any local children for this job, then we
+         * can ignore this request as we would have already
+         * dealt with it */
+        if (0 < jdata->num_local_procs) {
+            goto CLEANUP;
+        }
+
+        /* release all resources (even those on other nodes) that we
+         * assigned to this job */
+        if (NULL != jdata->map) {
+            map = (orte_job_map_t*)jdata->map;
+            for (n = 0; n < map->nodes->size; n++) {
+                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
+                    continue;
+                }
+                for (i = 0; i < node->procs->size; i++) {
+                    if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
+                        continue;
+                    }
+                    if (proct->name.jobid != jdata->jobid) {
+                        /* skip procs from another job */
+                        continue;
+                    }
+                    node->slots_inuse--;
+                    node->num_procs--;
+                    /* set the entry in the node array to NULL */
+                    opal_pointer_array_set_item(node->procs, i, NULL);
+                    /* release the proc once for the map entry */
+                    OBJ_RELEASE(proct);
+                }
+                /* set the node location to NULL */
+                opal_pointer_array_set_item(map->nodes, n, NULL);
+                /* maintain accounting */
+                OBJ_RELEASE(node);
+                /* flag that the node is no longer in a map */
+                ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
+            }
+            OBJ_RELEASE(map);
+            jdata->map = NULL;
+        }
+        break;
+
+
     /**** REPORT TOPOLOGY COMMAND ****/
     case ORTE_DAEMON_REPORT_TOPOLOGY_CMD:
         answer = OBJ_NEW(opal_buffer_t);

@@ -1337,6 +1399,9 @@ static char *get_orted_comm_cmd_str(int command)
     case ORTE_DAEMON_GET_MEMPROFILE:
         return strdup("ORTE_DAEMON_GET_MEMPROFILE");
 
+    case ORTE_DAEMON_DVM_CLEANUP_JOB_CMD:
+        return strdup("ORTE_DAEMON_DVM_CLEANUP_JOB_CMD");
+
     default:
         return strdup("Unknown Command!");
     }
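
The core of the new handler is pure bookkeeping: for each proc the completed job had on a node, return one slot, drop the proc count, and NULL out the pointer-array entries so the map object can be destructed. A self-contained model of that loop, using hypothetical plain-C proc_t/node_t types in place of orte_proc_t/orte_node_t and free() in place of OBJ_RELEASE (illustration only, not ORTE API):

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { int jobid; } proc_t;      /* stand-in for orte_proc_t */

    typedef struct {
        int slots_inuse;
        int num_procs;
        proc_t *procs[8];                      /* sparse, like opal_pointer_array */
    } node_t;                                  /* stand-in for orte_node_t */

    /* Mirror of the handler's inner loop: give back one slot per proc of
     * the completed job and clear the array entry so nothing dangles. */
    static void cleanup_job_on_node(node_t *node, int jobid)
    {
        for (int i = 0; i < 8; i++) {
            proc_t *p = node->procs[i];
            if (NULL == p || p->jobid != jobid) {
                continue;                      /* empty slot, or another job's proc */
            }
            node->slots_inuse--;               /* return the slot */
            node->num_procs--;
            node->procs[i] = NULL;             /* clear the map entry */
            free(p);                           /* plays the role of OBJ_RELEASE(proct) */
        }
    }

    int main(void)
    {
        node_t node = { .slots_inuse = 2, .num_procs = 2 };
        node.procs[0] = malloc(sizeof(proc_t));
        node.procs[0]->jobid = 7;              /* the job being cleaned up */
        node.procs[1] = malloc(sizeof(proc_t));
        node.procs[1]->jobid = 9;              /* a surviving job keeps its slot */

        cleanup_job_on_node(&node, 7);
        printf("slots_inuse=%d num_procs=%d\n",
               node.slots_inuse, node.num_procs);   /* prints: slots_inuse=1 num_procs=1 */

        free(node.procs[1]);
        return 0;
    }

After this runs, the node's accounting reflects only the surviving job, which is exactly the state the commit message demands: every daemon's view of every node starts from the same point before the next job is mapped.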

orte/runtime/orte_quit.c

Lines changed: 1 addition & 1 deletion
@@ -345,7 +345,7 @@ static void dump_aborted_procs(void)
     /* find the job that caused the problem */
     n = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&job, &nptr);
     while (OPAL_SUCCESS == n) {
-        if (job->jobid == ORTE_PROC_MY_NAME->jobid) {
+        if (NULL == job || job->jobid == ORTE_PROC_MY_NAME->jobid) {
             goto next;
         }
         if (ORTE_JOB_STATE_UNDEF != job->state &&

orte/tools/orte-dvm/orte-dvm.c

Lines changed: 22 additions & 1 deletion
@@ -14,7 +14,7 @@
  * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
  * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
  *                         reserved.
- * Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
+ * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow

@@ -75,6 +75,7 @@
 #include "opal/class/opal_pointer_array.h"
 
 #include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/grpcomm/grpcomm.h"
 #include "orte/mca/odls/odls.h"
 #include "orte/mca/rml/rml.h"
 #include "orte/mca/rml/base/rml_contact.h"

@@ -519,6 +520,8 @@ static void notify_requestor(int sd, short args, void *cbdata)
     orte_proc_t *pptr;
     int ret, id, *idptr;
     opal_buffer_t *reply;
+    orte_daemon_cmd_flag_t command;
+    orte_grpcomm_signature_t *sig;
 
     /* notify the requestor */
     reply = OBJ_NEW(opal_buffer_t);

@@ -557,6 +560,24 @@ static void notify_requestor(int sd, short args, void *cbdata)
                            ORTE_RML_TAG_NOTIFY_COMPLETE,
                            send_callback, jdata);
 
+    /* now ensure that _all_ daemons know that this job has terminated so even
+     * those that did not participate in it will know to cleanup the resources
+     * they assigned to the job. This is necessary now that the mapping function
+     * has been moved to the backend daemons - otherwise, non-participating daemons
+     * retain the slot assignments on the participating daemons, and then incorrectly
+     * map subsequent jobs thinking those nodes are still "busy" */
+    reply = OBJ_NEW(opal_buffer_t);
+    command = ORTE_DAEMON_DVM_CLEANUP_JOB_CMD;
+    opal_dss.pack(reply, &command, 1, ORTE_DAEMON_CMD);
+    opal_dss.pack(reply, &jdata->jobid, 1, ORTE_JOBID);
+    sig = OBJ_NEW(orte_grpcomm_signature_t);
+    sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
+    sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
+    sig->signature[0].vpid = ORTE_VPID_WILDCARD;
+    orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, reply);
+    OBJ_RELEASE(reply);
+    OBJ_RELEASE(sig);
+
     /* we cannot cleanup the job object as we might
      * hit an error during transmission, so clean it
      * up in the send callback */
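
A note on the addressing in this last hunk: the signature's jobid is set to ORTE_PROC_MY_NAME->jobid (the DVM's own daemon job, which every orted shares) and its vpid to ORTE_VPID_WILDCARD, so the xcast reaches every daemon in the DVM, whether or not it hosted procs from the completed job. Daemons that did participate simply take the "0 < jdata->num_local_procs" early-out in the orted_comm.c handler above, since they already released their resources when their local children terminated; only the non-participants do any work here.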
