Skip to content

Commit 0db9188

Browse files
author
Ralph Castain
authored
Merge pull request #3018 from naughtont3/tjn-dvmerrmgr-issue2987
debug fix for DVM early quit
2 parents f827b6b + beb5b25 commit 0db9188

File tree

1 file changed

+54
-99
lines changed

1 file changed

+54
-99
lines changed

orte/mca/errmgr/dvm/errmgr_dvm.c

Lines changed: 54 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
* Copyright (c) 2009-2011 The Trustees of Indiana University.
33
* All rights reserved.
44
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
5-
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
5+
* Copyright (c) 2010-2017 Oak Ridge National Labs. All rights reserved.
66
* Copyright (c) 2004-2011 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
1010
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
1111
* All rights reserved.
12-
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
12+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1313
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1414
* $COPYRIGHT$
1515
*
@@ -142,8 +142,6 @@ static void job_errors(int fd, short args, void *cbdata)
142142
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
143143
orte_job_t *jdata;
144144
orte_job_state_t jobstate;
145-
orte_exit_code_t sts;
146-
orte_proc_t *aborted_proc;
147145
opal_buffer_t *answer;
148146
int32_t rc, ret;
149147
int room, *rmptr;
@@ -175,110 +173,67 @@ static void job_errors(int fd, short args, void *cbdata)
175173
ORTE_JOBID_PRINT(jdata->jobid),
176174
orte_job_state_to_str(jobstate)));
177175

178-
if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
179-
ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
180-
ORTE_JOB_STATE_MAP_FAILED == jobstate ||
181-
ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
182-
/* disable routing as we may not have performed the daemon
183-
* wireup - e.g., in a managed environment, all the daemons
184-
* "phone home", but don't actually wireup into the routed
185-
* network until they receive the launch message
186-
*/
187-
orte_routing_is_enabled = false;
188-
jdata->num_terminated = jdata->num_procs;
189-
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
190-
/* if it was a dynamic spawn, then we better tell them this didn't work */
191-
if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
192-
rc = jobstate;
193-
answer = OBJ_NEW(opal_buffer_t);
194-
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
195-
ORTE_ERROR_LOG(ret);
196-
OBJ_RELEASE(caddy);
197-
return;
198-
}
199-
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
200-
ORTE_ERROR_LOG(ret);
201-
OBJ_RELEASE(caddy);
202-
return;
203-
}
204-
/* pack the room number */
205-
rmptr = &room;
206-
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
207-
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
208-
ORTE_ERROR_LOG(ret);
209-
OBJ_RELEASE(caddy);
210-
return;
211-
}
212-
}
213-
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
214-
"%s errmgr:dvm sending dyn error release of job %s to %s",
215-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
216-
ORTE_JOBID_PRINT(jdata->jobid),
217-
ORTE_NAME_PRINT(&jdata->originator)));
218-
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
219-
&jdata->originator, answer,
220-
ORTE_RML_TAG_LAUNCH_RESP,
221-
orte_rml_send_callback, NULL))) {
222-
ORTE_ERROR_LOG(ret);
223-
OBJ_RELEASE(answer);
224-
}
176+
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
177+
/* if the daemon job aborted and we haven't heard from everyone yet,
178+
* then this could well have been caused by a daemon not finding
179+
* a way back to us. In this case, output a message indicating a daemon
180+
* died without reporting. Otherwise, say nothing as we
181+
* likely already output an error message */
182+
if (ORTE_JOB_STATE_ABORTED == jobstate &&
183+
jdata->num_procs != jdata->num_reported) {
184+
orte_routing_is_enabled = false;
185+
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
225186
}
187+
/* there really isn't much else we can do since the problem
188+
* is in the DVM itself, so best just to terminate */
189+
jdata->num_terminated = jdata->num_procs;
190+
/* activate the terminated state so we can exit */
191+
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
226192
OBJ_RELEASE(caddy);
227193
return;
228194
}
229195

230-
if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
231-
ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
232-
/* the job object for this job will have been NULL'd
233-
* in the array if the job was solely local. If it isn't
234-
* NULL, then we need to tell everyone else to die
235-
*/
236-
aborted_proc = NULL;
237-
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
238-
sts = aborted_proc->exit_code;
239-
if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
240-
if (WIFSIGNALED(sts)) { /* died on signal */
241-
#ifdef WCOREDUMP
242-
if (WCOREDUMP(sts)) {
243-
orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
244-
WTERMSIG(sts));
245-
sts = WTERMSIG(sts);
246-
} else {
247-
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
248-
WTERMSIG(sts));
249-
sts = WTERMSIG(sts);
250-
}
251-
#else
252-
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
253-
WTERMSIG(sts));
254-
sts = WTERMSIG(sts);
255-
#endif /* WCOREDUMP */
256-
} else {
257-
orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
258-
WEXITSTATUS(sts));
259-
sts = WEXITSTATUS(sts);
260-
}
261-
}
262-
}
263-
/* if this is the daemon job, then we need to ensure we
264-
* output an error message indicating we couldn't launch the
265-
* daemons */
266-
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
267-
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
196+
/* all other cases involve jobs submitted to the DVM - therefore,
197+
* we only inform the submitter of the problem, but do NOT terminate
198+
* the DVM itself */
199+
200+
rc = jobstate;
201+
answer = OBJ_NEW(opal_buffer_t);
202+
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
203+
ORTE_ERROR_LOG(ret);
204+
OBJ_RELEASE(caddy);
205+
return;
206+
}
207+
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
208+
ORTE_ERROR_LOG(ret);
209+
OBJ_RELEASE(caddy);
210+
return;
211+
}
212+
/* pack the room number */
213+
rmptr = &room;
214+
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
215+
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
216+
ORTE_ERROR_LOG(ret);
217+
OBJ_RELEASE(caddy);
218+
return;
268219
}
269220
}
270-
271-
/* if the daemon job aborted and we haven't heard from everyone yet,
272-
* then this could well have been caused by a daemon not finding
273-
* a way back to us. In this case, output a message indicating a daemon
274-
* died without reporting. Otherwise, say nothing as we
275-
* likely already output an error message */
276-
if (ORTE_JOB_STATE_ABORTED == jobstate &&
277-
jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
278-
jdata->num_procs != jdata->num_reported) {
279-
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
221+
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
222+
"%s errmgr:dvm sending notification of job %s failure to %s",
223+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
224+
ORTE_JOBID_PRINT(jdata->jobid),
225+
ORTE_NAME_PRINT(&jdata->originator)));
226+
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
227+
&jdata->originator, answer,
228+
ORTE_RML_TAG_LAUNCH_RESP,
229+
orte_rml_send_callback, NULL))) {
230+
ORTE_ERROR_LOG(ret);
231+
OBJ_RELEASE(answer);
280232
}
233+
/* ensure we terminate any processes left running in the DVM */
234+
_terminate_job(jdata->jobid);
281235

236+
/* cleanup */
282237
OBJ_RELEASE(caddy);
283238
}
284239

0 commit comments

Comments (0)