Skip to content

Commit b2fe097

Browse files
committed
Merge pull request open-mpi#608 from rhc54/cmr2.0/rsh
Now that we have an "isolated" PLM component, we cannot just let rsh …
2 parents 033c597 + 29000b4 commit b2fe097

File tree

8 files changed

+130
-199
lines changed

8 files changed

+130
-199
lines changed

opal/mca/base/mca_base_components_select.c

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* Corporation. All rights reserved.
55
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
66
* reserved.
7+
* Copyright (c) 2015 Intel, Inc. All rights reserved.
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -38,6 +39,7 @@ int mca_base_select(const char *type_name, int output_id,
3839
mca_base_component_t *component = NULL;
3940
mca_base_module_t *module = NULL;
4041
int priority = 0, best_priority = INT32_MIN;
42+
int rc;
4143

4244
*best_module = NULL;
4345
*best_component = NULL;
@@ -70,7 +72,18 @@ int mca_base_select(const char *type_name, int output_id,
7072
"mca:base:select:(%5s) Querying component [%s]",
7173
type_name, component->mca_component_name);
7274

73-
component->mca_query_component(&module, &priority);
75+
rc = component->mca_query_component(&module, &priority);
76+
if (OPAL_ERR_FATAL == rc) {
77+
/* a fatal error was detected by this component - e.g., the
78+
* user specified a required element and the component could
79+
* not find it. In this case, we must not continue as we might
80+
* find some other component that could run, causing us to do
81+
* something the user didn't want */
82+
return rc;
83+
} else if (OPAL_SUCCESS != rc) {
84+
/* silently skip this component */
85+
continue;
86+
}
7487

7588
/*
7689
* If no module was returned, then skip component

orte/mca/ess/hnp/ess_hnp_module.c

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -222,31 +222,26 @@ static int rte_init(void)
222222
* process stats if requested
223223
*/
224224
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
225-
ORTE_ERROR_LOG(ret);
226225
error = "opal_pstat_base_open";
227226
goto error;
228227
}
229228
if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
230-
ORTE_ERROR_LOG(ret);
231229
error = "opal_pstat_base_select";
232230
goto error;
233231
}
234232

235233
/* open and setup the state machine */
236234
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
237-
ORTE_ERROR_LOG(ret);
238235
error = "orte_state_base_open";
239236
goto error;
240237
}
241238
if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
242-
ORTE_ERROR_LOG(ret);
243239
error = "orte_state_base_select";
244240
goto error;
245241
}
246242

247243
/* open the errmgr */
248244
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
249-
ORTE_ERROR_LOG(ret);
250245
error = "orte_errmgr_base_open";
251246
goto error;
252247
}
@@ -257,27 +252,27 @@ static int rte_init(void)
257252
* first and select that component.
258253
*/
259254
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
260-
ORTE_ERROR_LOG(ret);
261255
error = "orte_plm_base_open";
262256
goto error;
263257
}
264258

265259
if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
266-
ORTE_ERROR_LOG(ret);
267260
error = "orte_plm_base_select";
261+
if (ORTE_ERR_FATAL == ret) {
262+
/* we already output a show_help - so keep down the verbiage */
263+
ret = ORTE_ERR_SILENT;
264+
}
268265
goto error;
269266
}
270267
/* if we were spawned by a singleton, our jobid was given to us */
271268
if (NULL != orte_ess_base_jobid) {
272269
if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&ORTE_PROC_MY_NAME->jobid, orte_ess_base_jobid))) {
273-
ORTE_ERROR_LOG(ret);
274270
error = "convert_string_to_jobid";
275271
goto error;
276272
}
277273
ORTE_PROC_MY_NAME->vpid = 0;
278274
} else {
279275
if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) {
280-
ORTE_ERROR_LOG(ret);
281276
error = "orte_plm_set_hnp_name";
282277
goto error;
283278
}
@@ -304,7 +299,6 @@ static int rte_init(void)
304299
orte_process_info.tmpdir_base,
305300
orte_process_info.nodename, NULL,
306301
ORTE_PROC_MY_NAME))) {
307-
ORTE_ERROR_LOG(ret);
308302
error = "orte_session_dir define";
309303
goto error;
310304
}
@@ -318,7 +312,6 @@ static int rte_init(void)
318312
orte_process_info.tmpdir_base,
319313
orte_process_info.nodename, NULL,
320314
ORTE_PROC_MY_NAME))) {
321-
ORTE_ERROR_LOG(ret);
322315
error = "orte_session_dir";
323316
goto error;
324317
}
@@ -330,12 +323,10 @@ static int rte_init(void)
330323
* OOB Layer
331324
*/
332325
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
333-
ORTE_ERROR_LOG(ret);
334326
error = "orte_oob_base_open";
335327
goto error;
336328
}
337329
if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
338-
ORTE_ERROR_LOG(ret);
339330
error = "orte_oob_base_select";
340331
goto error;
341332
}
@@ -344,18 +335,15 @@ static int rte_init(void)
344335
* Runtime Messaging Layer
345336
*/
346337
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
347-
ORTE_ERROR_LOG(ret);
348338
error = "orte_rml_base_open";
349339
goto error;
350340
}
351341
if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
352-
ORTE_ERROR_LOG(ret);
353342
error = "orte_rml_base_select";
354343
goto error;
355344
}
356345

357346
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
358-
ORTE_ERROR_LOG(ret);
359347
error = "orte_errmgr_base_select";
360348
goto error;
361349
}

orte/mca/plm/base/plm_base_select.c

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
1313
* All rights reserved.
14+
* Copyright (c) 2015 Intel, Inc. All rights reserved.
1415
* $COPYRIGHT$
1516
*
1617
* Additional copyrights may follow
@@ -32,43 +33,26 @@
3233

3334

3435
/**
35-
* Function for selecting one component from all those that are
36+
* Function for selecting one component from all those that are
3637
* available.
3738
*/
3839

3940
int orte_plm_base_select(void)
4041
{
41-
int exit_status = ORTE_SUCCESS;
42+
int rc;
4243
orte_plm_base_component_t *best_component = NULL;
4344
orte_plm_base_module_t *best_module = NULL;
4445

4546
/*
4647
* Select the best component
4748
*/
48-
if( OPAL_SUCCESS != mca_base_select("plm", orte_plm_base_framework.framework_output,
49-
&orte_plm_base_framework.framework_components,
50-
(mca_base_module_t **) &best_module,
51-
(mca_base_component_t **) &best_component) ) {
52-
/* This will only happen if no component was selected
53-
*
54-
* If we didn't find one, and we are a daemon, then default to retaining the proxy.
55-
* Otherwise, if we didn't find one to select, that is unacceptable.
56-
*/
57-
if (ORTE_PROC_IS_DAEMON) {
58-
/* don't record a selected component or flag selected
59-
* so we finalize correctly - just leave the plm alone
60-
* as it defaults to pointing at the proxy
61-
*/
62-
goto cleanup;
63-
} else {
64-
exit_status = ORTE_ERR_NOT_FOUND;
65-
goto cleanup;
66-
}
49+
if (OPAL_SUCCESS == (rc = mca_base_select("plm", orte_plm_base_framework.framework_output,
50+
&orte_plm_base_framework.framework_components,
51+
(mca_base_module_t **) &best_module,
52+
(mca_base_component_t **) &best_component))) {
53+
/* Save the winner */
54+
orte_plm = *best_module;
6755
}
6856

69-
/* Save the winner */
70-
orte_plm = *best_module;
71-
72-
cleanup:
73-
return exit_status;
57+
return rc;
7458
}

orte/mca/plm/rsh/help-plm-rsh.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
# University of Stuttgart. All rights reserved.
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
13+
# Copyright (c) 2015 Intel, Inc. All rights reserved.
1314
# $COPYRIGHT$
1415
#
1516
# Additional copyrights may follow
@@ -18,6 +19,14 @@
1819
#
1920
# This is the US/English general help file for Open RTE's orterun.
2021
#
22+
[agent-not-found]
23+
The value of the MCA parameter "plm_rsh_agent" was set to a path
24+
that could not be found:
25+
26+
plm_rsh_agent: %s
27+
28+
Please either unset the parameter, or check that the path is correct.
29+
#
2130
[no-local-orted]
2231
The rsh PLS component was not able to find the executable "orted" in
2332
your PATH or in the directory where Open MPI/OpenRTE was initially installed,

orte/mca/plm/rsh/plm_rsh.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ struct orte_plm_rsh_component_t {
5959
bool no_tree_spawn;
6060
int num_concurrent;
6161
char *agent;
62+
char *agent_path;
63+
char **agent_argv;
6264
bool assume_same_shell;
6365
bool pass_environ_mca_params;
6466
char *ssh_args;

0 commit comments

Comments
 (0)