Skip to content

Commit 29000b4

Browse files
author
Ralph Castain
committed
Now that we have an "isolated" PLM component, we cannot just let rsh silently decline to run when it cannot find a launch agent - if we do, then we will -always- run on the local node. So if the user specifies a launch agent and we can't find it, then generate a pretty error message, report a fatal error back to the component select, and exit out.
This required modifying the mca_base_select function to actually check the return code on a component query - it was blissfully ignoring it. Also do a little cleanup to avoid bombarding the user with multiple error messages. Thanks to Patrick Begou for reporting the problem. (cherry picked from commit open-mpi/ompi@0140ff0)
1 parent 46d887a commit 29000b4

File tree

8 files changed

+130
-199
lines changed

8 files changed

+130
-199
lines changed

opal/mca/base/mca_base_components_select.c

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* Corporation. All rights reserved.
55
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
66
* reserved.
7+
* Copyright (c) 2015 Intel, Inc. All rights reserved.
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -38,6 +39,7 @@ int mca_base_select(const char *type_name, int output_id,
3839
mca_base_component_t *component = NULL;
3940
mca_base_module_t *module = NULL;
4041
int priority = 0, best_priority = INT32_MIN;
42+
int rc;
4143

4244
*best_module = NULL;
4345
*best_component = NULL;
@@ -70,7 +72,18 @@ int mca_base_select(const char *type_name, int output_id,
7072
"mca:base:select:(%5s) Querying component [%s]",
7173
type_name, component->mca_component_name);
7274

73-
component->mca_query_component(&module, &priority);
75+
rc = component->mca_query_component(&module, &priority);
76+
if (OPAL_ERR_FATAL == rc) {
77+
/* a fatal error was detected by this component - e.g., the
78+
* user specified a required element and the component could
79+
* not find it. In this case, we must not continue as we might
80+
* find some other component that could run, causing us to do
81+
* something the user didn't want */
82+
return rc;
83+
} else if (OPAL_SUCCESS != rc) {
84+
/* silently skip this component */
85+
continue;
86+
}
7487

7588
/*
7689
* If no module was returned, then skip component

orte/mca/ess/hnp/ess_hnp_module.c

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -222,31 +222,26 @@ static int rte_init(void)
222222
* process stats if requested
223223
*/
224224
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
225-
ORTE_ERROR_LOG(ret);
226225
error = "opal_pstat_base_open";
227226
goto error;
228227
}
229228
if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
230-
ORTE_ERROR_LOG(ret);
231229
error = "opal_pstat_base_select";
232230
goto error;
233231
}
234232

235233
/* open and setup the state machine */
236234
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
237-
ORTE_ERROR_LOG(ret);
238235
error = "orte_state_base_open";
239236
goto error;
240237
}
241238
if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
242-
ORTE_ERROR_LOG(ret);
243239
error = "orte_state_base_select";
244240
goto error;
245241
}
246242

247243
/* open the errmgr */
248244
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
249-
ORTE_ERROR_LOG(ret);
250245
error = "orte_errmgr_base_open";
251246
goto error;
252247
}
@@ -257,27 +252,27 @@ static int rte_init(void)
257252
* first and select that component.
258253
*/
259254
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
260-
ORTE_ERROR_LOG(ret);
261255
error = "orte_plm_base_open";
262256
goto error;
263257
}
264258

265259
if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
266-
ORTE_ERROR_LOG(ret);
267260
error = "orte_plm_base_select";
261+
if (ORTE_ERR_FATAL == ret) {
262+
/* we already output a show_help - so keep down the verbiage */
263+
ret = ORTE_ERR_SILENT;
264+
}
268265
goto error;
269266
}
270267
/* if we were spawned by a singleton, our jobid was given to us */
271268
if (NULL != orte_ess_base_jobid) {
272269
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&ORTE_PROC_MY_NAME->jobid, orte_ess_base_jobid))) {
273-
ORTE_ERROR_LOG(ret);
274270
error = "convert_string_to_jobid";
275271
goto error;
276272
}
277273
ORTE_PROC_MY_NAME->vpid = 0;
278274
} else {
279275
if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) {
280-
ORTE_ERROR_LOG(ret);
281276
error = "orte_plm_set_hnp_name";
282277
goto error;
283278
}
@@ -304,7 +299,6 @@ static int rte_init(void)
304299
orte_process_info.tmpdir_base,
305300
orte_process_info.nodename, NULL,
306301
ORTE_PROC_MY_NAME))) {
307-
ORTE_ERROR_LOG(ret);
308302
error = "orte_session_dir define";
309303
goto error;
310304
}
@@ -318,7 +312,6 @@ static int rte_init(void)
318312
orte_process_info.tmpdir_base,
319313
orte_process_info.nodename, NULL,
320314
ORTE_PROC_MY_NAME))) {
321-
ORTE_ERROR_LOG(ret);
322315
error = "orte_session_dir";
323316
goto error;
324317
}
@@ -330,12 +323,10 @@ static int rte_init(void)
330323
* OOB Layer
331324
*/
332325
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
333-
ORTE_ERROR_LOG(ret);
334326
error = "orte_oob_base_open";
335327
goto error;
336328
}
337329
if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
338-
ORTE_ERROR_LOG(ret);
339330
error = "orte_oob_base_select";
340331
goto error;
341332
}
@@ -344,18 +335,15 @@ static int rte_init(void)
344335
* Runtime Messaging Layer
345336
*/
346337
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
347-
ORTE_ERROR_LOG(ret);
348338
error = "orte_rml_base_open";
349339
goto error;
350340
}
351341
if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
352-
ORTE_ERROR_LOG(ret);
353342
error = "orte_rml_base_select";
354343
goto error;
355344
}
356345

357346
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
358-
ORTE_ERROR_LOG(ret);
359347
error = "orte_errmgr_base_select";
360348
goto error;
361349
}

orte/mca/plm/base/plm_base_select.c

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
1313
* All rights reserved.
14+
* Copyright (c) 2015 Intel, Inc. All rights reserved.
1415
* $COPYRIGHT$
1516
*
1617
* Additional copyrights may follow
@@ -32,43 +33,26 @@
3233

3334

3435
/**
35-
* Function for selecting one component from all those that are
36+
* Function for selecting one component from all those that are
3637
* available.
3738
*/
3839

3940
int orte_plm_base_select(void)
4041
{
41-
int exit_status = ORTE_SUCCESS;
42+
int rc;
4243
orte_plm_base_component_t *best_component = NULL;
4344
orte_plm_base_module_t *best_module = NULL;
4445

4546
/*
4647
* Select the best component
4748
*/
48-
if( OPAL_SUCCESS != mca_base_select("plm", orte_plm_base_framework.framework_output,
49-
&orte_plm_base_framework.framework_components,
50-
(mca_base_module_t **) &best_module,
51-
(mca_base_component_t **) &best_component) ) {
52-
/* This will only happen if no component was selected
53-
*
54-
* If we didn't find one, and we are a daemon, then default to retaining the proxy.
55-
* Otherwise, if we didn't find one to select, that is unacceptable.
56-
*/
57-
if (ORTE_PROC_IS_DAEMON) {
58-
/* don't record a selected component or flag selected
59-
* so we finalize correctly - just leave the plm alone
60-
* as it defaults to pointing at the proxy
61-
*/
62-
goto cleanup;
63-
} else {
64-
exit_status = ORTE_ERR_NOT_FOUND;
65-
goto cleanup;
66-
}
49+
if (OPAL_SUCCESS == (rc = mca_base_select("plm", orte_plm_base_framework.framework_output,
50+
&orte_plm_base_framework.framework_components,
51+
(mca_base_module_t **) &best_module,
52+
(mca_base_component_t **) &best_component))) {
53+
/* Save the winner */
54+
orte_plm = *best_module;
6755
}
6856

69-
/* Save the winner */
70-
orte_plm = *best_module;
71-
72-
cleanup:
73-
return exit_status;
57+
return rc;
7458
}

orte/mca/plm/rsh/help-plm-rsh.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
# University of Stuttgart. All rights reserved.
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
13+
# Copyright (c) 2015 Intel, Inc. All rights reserved.
1314
# $COPYRIGHT$
1415
#
1516
# Additional copyrights may follow
@@ -18,6 +19,14 @@
1819
#
1920
# This is the US/English general help file for Open RTE's orterun.
2021
#
22+
[agent-not-found]
23+
The value of the MCA parameter "plm_rsh_agent" was set to a path
24+
that could not be found:
25+
26+
plm_rsh_agent: %s
27+
28+
Please either unset the parameter, or check that the path is correct
29+
#
2130
[no-local-orted]
2231
The rsh PLS component was not able to find the executable "orted" in
2332
your PATH or in the directory where Open MPI/OpenRTE was initially installed,

orte/mca/plm/rsh/plm_rsh.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ struct orte_plm_rsh_component_t {
5959
bool no_tree_spawn;
6060
int num_concurrent;
6161
char *agent;
62+
char *agent_path;
63+
char **agent_argv;
6264
bool assume_same_shell;
6365
bool pass_environ_mca_params;
6466
char *ssh_args;

0 commit comments

Comments
 (0)