diff --git a/orte/mca/plm/rsh/plm_rsh_component.c b/orte/mca/plm/rsh/plm_rsh_component.c index 0ad126abac..50d4999405 100644 --- a/orte/mca/plm/rsh/plm_rsh_component.c +++ b/orte/mca/plm/rsh/plm_rsh_component.c @@ -258,8 +258,10 @@ static int rsh_component_query(mca_base_module_t **module, int *priority) char *tmp; /* Check if we are under Grid Engine parallel environment by looking at several - * environment variables. If so, setup the path and argv[0]. */ - if (NULL == mca_plm_rsh_component.agent) { + * environment variables. If so, setup the path and argv[0]. + * Note that we allow the user to specify the launch agent + * even if they are in a Grid Engine environment */ + if (0 == strcmp(mca_plm_rsh_component.agent, "ssh : rsh")) { if (!mca_plm_rsh_component.disable_qrsh && NULL != getenv("SGE_ROOT") && NULL != getenv("ARC") && NULL != getenv("PE_HOSTFILE") && NULL != getenv("JOB_ID")) { diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index ed8b7f2450..00454aba78 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -164,6 +164,7 @@ static const char *orte_plm_rsh_shell_name[7] = { */ static void set_handler_default(int sig); static orte_plm_rsh_shell_t find_shell(char *shell); +static int launch_agent_setup(const char *agent, char *path); static void ssh_child(int argc, char **argv) __opal_attribute_noreturn__; static int rsh_probe(char *nodename, orte_plm_rsh_shell_t *shell); @@ -177,14 +178,59 @@ static void process_launch_list(int fd, short args, void *cbdata); static int num_in_progress=0; static opal_list_t launch_list; static opal_event_t launch_event; +static char *rsh_agent_path=NULL; +static char **rsh_agent_argv=NULL; /** * Init the module */ static int rsh_init(void) { + char *tmp; int rc; + /* we were selected, so setup the launch agent */ + if (mca_plm_rsh_component.using_qrsh) { + /* perform base setup for qrsh */ + asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC")); + if (ORTE_SUCCESS != (rc = launch_agent_setup("qrsh", tmp))) { + ORTE_ERROR_LOG(rc); + free(tmp); + return rc; + } + free(tmp); + /* automatically add -inherit and grid engine PE related flags */ + opal_argv_append_nosize(&rsh_agent_argv, "-inherit"); + /* Don't use the "-noshell" flag as qrsh would have a problem + * swallowing a long command */ + opal_argv_append_nosize(&rsh_agent_argv, "-nostdin"); + opal_argv_append_nosize(&rsh_agent_argv, "-V"); + if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) { + opal_argv_append_nosize(&rsh_agent_argv, "-verbose"); + tmp = opal_argv_join(rsh_agent_argv, ' '); + opal_output_verbose(1, orte_plm_base_framework.framework_output, + "%s plm:rsh: using \"%s\" for launching\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); + free(tmp); + } + } else if(mca_plm_rsh_component.using_llspawn) { + /* perform base setup for llspawn */ + if (ORTE_SUCCESS != (rc = launch_agent_setup("llspawn", NULL))) { + ORTE_ERROR_LOG(rc); + return rc; + } + opal_output_verbose(1, orte_plm_base_framework.framework_output, + "%s plm:rsh: using \"%s\" for launching\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + rsh_agent_path); + } else { + /* not using qrsh or llspawn - use MCA-specified agent */ + if (ORTE_SUCCESS != (rc = launch_agent_setup(mca_plm_rsh_component.agent, NULL))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + /* point to our launch command */ if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS, launch_daemons, ORTE_SYS_PRI))) { @@ -325,8 +371,8 @@ static int setup_launch(int *argcptr, char ***argvptr, /* * Build argv array */ - argv = opal_argv_copy(mca_plm_rsh_component.agent_argv); - argc = opal_argv_count(mca_plm_rsh_component.agent_argv); + argv = opal_argv_copy(rsh_agent_argv); + argc = opal_argv_count(argv); /* if any ssh args were provided, now is the time to add them */ if (NULL != mca_plm_rsh_component.ssh_args) { char **ssh_argv; @@ -676,7 +722,7 @@ static void ssh_child(int argc, char **argv) * about remote launches here */ exec_argv = argv; - exec_path = mca_plm_rsh_component.agent_path; + exec_path = strdup(rsh_agent_path); /* Don't let ssh slurp all of our stdin! */ fdin = open("/dev/null", O_RDWR); @@ -1331,6 +1377,63 @@ static orte_plm_rsh_shell_t find_shell(char *shell) return ORTE_PLM_RSH_SHELL_UNKNOWN; } +static int launch_agent_setup(const char *agent, char *path) +{ + char *bname; + int i; + + /* if no agent was provided, then report not found */ + if (NULL == mca_plm_rsh_component.agent && NULL == agent) { + return ORTE_ERR_NOT_FOUND; + } + + /* search for the argv */ + OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, + "%s plm:rsh_setup on agent %s path %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == agent) ? mca_plm_rsh_component.agent : agent, + (NULL == path) ? "NULL" : path)); + rsh_agent_argv = orte_plm_rsh_search(agent, path); + + if (0 == opal_argv_count(rsh_agent_argv)) { + /* nothing was found */ + return ORTE_ERR_NOT_FOUND; + } + + /* see if we can find the agent in the path */ + rsh_agent_path = opal_path_findv(rsh_agent_argv[0], X_OK, environ, path); + + if (NULL == rsh_agent_path) { + /* not an error - just report not found */ + opal_argv_free(rsh_agent_argv); + return ORTE_ERR_NOT_FOUND; + } + + bname = opal_basename(rsh_agent_argv[0]); + if (NULL != bname && 0 == strcmp(bname, "ssh")) { + /* if xterm option was given, add '-X', ensuring we don't do it twice */ + if (NULL != orte_xterm) { + opal_argv_append_unique_nosize(&rsh_agent_argv, "-X", false); + } else if (0 >= opal_output_get_verbosity(orte_plm_base_framework.framework_output)) { + /* if debug was not specified, and the user didn't explicitly + * specify X11 forwarding/non-forwarding, add "-x" if it + * isn't already there (check either case) + */ + for (i = 1; NULL != rsh_agent_argv[i]; ++i) { + if (0 == strcasecmp("-x", rsh_agent_argv[i])) { + break; + } + } + if (NULL == rsh_agent_argv[i]) { + opal_argv_append_nosize(&rsh_agent_argv, "-x"); + } + } + } + + /* the caller can append any additional argv's they desire */ + return ORTE_SUCCESS; +} + /** * Check the Shell variable and system type on the specified node */ @@ -1454,15 +1557,10 @@ static int setup_shell(orte_plm_rsh_shell_t *rshell, struct passwd *p; p = getpwuid(getuid()); - if( NULL == p ) { - /* This user is unknown to the system. Therefore, there is no reason we - * spawn whatsoever in his name. Give up with a HUGE error message. - */ - orte_show_help( "help-plm-rsh.txt", "unknown-user", true, (int)getuid() ); - return ORTE_ERR_FATAL; + if( NULL != p ) { + param = p->pw_shell; + local_shell = find_shell(p->pw_shell); } - param = p->pw_shell; - local_shell = find_shell(p->pw_shell); } #endif