Skip to content

Commit 6d6bc9b

Browse files
author
Ralph Castain
committed
Update alps module to new APIs
Signed-off-by: Ralph Castain <[email protected]>
1 parent fb27bd1 commit 6d6bc9b

File tree

4 files changed

+38
-82
lines changed

4 files changed

+38
-82
lines changed

contrib/platform/intel/bend/gadget

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@ enable_orterun_prefix_by_default=yes
22
enable_mpi_thread_multiple=no
33
enable_mem_debug=no
44
enable_mem_profile=no
5-
enable_debug_symbols=yes
5+
enable_debug_symbols=no
66
enable_binaries=yes
77
enable_heterogeneous=no
88
enable_picky=yes
9-
enable_debug=yes
9+
enable_debug=no
1010
enable_shared=yes
1111
enable_static=yes
1212
enable_memchecker=no

contrib/platform/intel/bend/gadget.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,4 @@
6363
mca_base_component_show_load_errors = 1
6464
orte_abort_timeout = 10
6565
hwloc_base_mem_bind_failure_action = silent
66-
66+
btl_ugni_rcache=grdma

contrib/scaling/scaling.pl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@
151151

152152
# determine the number of nodes - doesn't
153153
# matter which starter we use
154-
$cmd = "mpirun --novm --pernode hostname";
154+
$cmd = "mpirun --pernode hostname";
155155
$output = `$cmd`;
156156
@lines = split(/\n/, $output);
157157
$num_nodes = $#lines + 1;

orte/mca/odls/alps/odls_alps_module.c

Lines changed: 34 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
* Copyright (c) 2010 IBM Corporation. All rights reserved.
1616
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
1717
* reserved.
18-
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
18+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1919
*
2020
* $COPYRIGHT$
2121
*
@@ -144,8 +144,8 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child);
144144
static void send_error_show_help(int fd, int exit_status,
145145
const char *file, const char *topic, ...)
146146
__opal_attribute_noreturn__;
147-
static int do_child(orte_app_context_t* context,
148-
orte_proc_t *child,
147+
static int do_child(orte_proc_t *child,
148+
char *app, char **argv,
149149
char **environ_copy,
150150
orte_job_t *jobdat, int write_fd,
151151
orte_iof_base_io_conf_t opts)
@@ -342,8 +342,8 @@ static int close_open_file_descriptors(int write_fd, orte_iof_base_io_conf_t opt
342342
return ORTE_SUCCESS;
343343
}
344344

345-
static int do_child(orte_app_context_t* context,
346-
orte_proc_t *child,
345+
static int do_child( orte_proc_t *child,
346+
char *app, char **argv,
347347
char **environ_copy,
348348
orte_job_t *jobdat, int write_fd,
349349
orte_iof_base_io_conf_t opts)
@@ -375,7 +375,7 @@ static int do_child(orte_app_context_t* context,
375375
send_error_show_help(write_fd, 1,
376376
"help-orte-odls-alps.txt",
377377
"iof setup failed",
378-
orte_process_info.nodename, context->app);
378+
orte_process_info.nodename, app);
379379
/* Does not return */
380380
}
381381

@@ -399,30 +399,18 @@ static int do_child(orte_app_context_t* context,
399399
close(fdnull);
400400
}
401401

402-
/* if the user requested it, set the system resource limits */
403-
if (OPAL_SUCCESS != (rc = opal_util_init_sys_limits(&msg))) {
404-
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
405-
"set limit",
406-
orte_process_info.nodename, context->app,
407-
__FILE__, __LINE__, msg);
408-
}
409-
/* ensure we only do this once */
410-
(void) mca_base_var_env_name("opal_set_max_sys_limits", &param);
411-
opal_unsetenv(param, &environ_copy);
412-
free(param);
413-
414402
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
415403
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
416404
"close fds",
417-
orte_process_info.nodename, context->app,
405+
orte_process_info.nodename, app,
418406
__FILE__, __LINE__);
419407
}
420408

421409

422-
if (context->argv == NULL) {
423-
context->argv = malloc(sizeof(char*)*2);
424-
context->argv[0] = strdup(context->app);
425-
context->argv[1] = NULL;
410+
if (argv == NULL) {
411+
argv = malloc(sizeof(char*)*2);
412+
argv[0] = strdup(app);
413+
argv[1] = NULL;
426414
}
427415

428416
/* Set signal handlers back to the default. Do this close to
@@ -449,25 +437,25 @@ static int do_child(orte_app_context_t* context,
449437

450438
if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
451439
int jout;
452-
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), context->app);
453-
for (jout=0; NULL != context->argv[jout]; jout++) {
454-
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, context->argv[jout]);
440+
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app);
441+
for (jout=0; NULL != argv[jout]; jout++) {
442+
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]);
455443
}
456444
for (jout=0; NULL != environ_copy[jout]; jout++) {
457445
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
458446
}
459447
}
460448

461-
execve(context->app, context->argv, environ_copy);
449+
execve(app, argv, environ_copy);
462450
send_error_show_help(write_fd, 1,
463451
"help-orte-odls-alps.txt", "execve error",
464-
orte_process_info.nodename, context->app, strerror(errno));
452+
orte_process_info.nodename, app, strerror(errno));
465453
/* Does not return */
466454
}
467455

468456

469-
static int do_parent(orte_app_context_t* context,
470-
orte_proc_t *child,
457+
static int do_parent(orte_proc_t *child,
458+
char *app, char **argv,
471459
char **environ_copy,
472460
orte_job_t *jobdat, int read_fd,
473461
orte_iof_base_io_conf_t opts)
@@ -476,19 +464,10 @@ static int do_parent(orte_app_context_t* context,
476464
orte_odls_pipe_err_msg_t msg;
477465
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
478466

479-
if (NULL != child && ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
480-
/* connect endpoints IOF */
481-
rc = orte_iof_base_setup_parent(&child->name, &opts);
482-
if (ORTE_SUCCESS != rc) {
483-
ORTE_ERROR_LOG(rc);
484-
close(read_fd);
485-
486-
if (NULL != child) {
487-
child->state = ORTE_PROC_STATE_UNDEF;
488-
}
489-
return rc;
490-
}
491-
}
467+
close(opts.p_stdin[0]);
468+
close(opts.p_stdout[1]);
469+
close(opts.p_stderr[1]);
470+
close(opts.p_internal[1]);
492471

493472
/* Block reading a message from the pipe */
494473
while (1) {
@@ -525,7 +504,7 @@ static int do_parent(orte_app_context_t* context,
525504
if (OPAL_SUCCESS != rc) {
526505
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
527506
true,
528-
orte_process_info.nodename, context->app,
507+
orte_process_info.nodename, app,
529508
"opal_fd_read", __FILE__, __LINE__);
530509
if (NULL != child) {
531510
child->state = ORTE_PROC_STATE_UNDEF;
@@ -539,7 +518,7 @@ static int do_parent(orte_app_context_t* context,
539518
if (OPAL_SUCCESS != rc) {
540519
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
541520
true,
542-
orte_process_info.nodename, context->app,
521+
orte_process_info.nodename, app,
543522
"opal_fd_read", __FILE__, __LINE__);
544523
if (NULL != child) {
545524
child->state = ORTE_PROC_STATE_UNDEF;
@@ -553,7 +532,7 @@ static int do_parent(orte_app_context_t* context,
553532
if (NULL == str) {
554533
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
555534
true,
556-
orte_process_info.nodename, context->app,
535+
orte_process_info.nodename, app,
557536
"opal_fd_read", __FILE__, __LINE__);
558537
if (NULL != child) {
559538
child->state = ORTE_PROC_STATE_UNDEF;
@@ -602,39 +581,16 @@ static int do_parent(orte_app_context_t* context,
602581
/**
603582
* Fork/exec the specified processes
604583
*/
605-
static int odls_alps_fork_local_proc(orte_app_context_t* context,
606-
orte_proc_t *child,
607-
char **environ_copy,
608-
orte_job_t *jobdat)
584+
static int odls_alps_fork_local_proc(orte_proc_t *child,
585+
char *app,
586+
char **argv,
587+
char **environ_copy,
588+
orte_job_t *jobdat,
589+
orte_iof_base_io_conf_t opts)
609590
{
610-
orte_iof_base_io_conf_t opts;
611591
int rc, p[2];
612592
pid_t pid;
613593

614-
if (NULL != child) {
615-
/* should pull this information from MPIRUN instead of going with
616-
default */
617-
opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
618-
619-
/* do we want to setup stdin? */
620-
if (NULL != child &&
621-
(jobdat->stdin_target == ORTE_VPID_WILDCARD ||
622-
child->name.vpid == jobdat->stdin_target)) {
623-
opts.connect_stdin = true;
624-
} else {
625-
opts.connect_stdin = false;
626-
}
627-
628-
if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&opts))) {
629-
ORTE_ERROR_LOG(rc);
630-
if (NULL != child) {
631-
child->state = ORTE_PROC_STATE_FAILED_TO_START;
632-
child->exit_code = rc;
633-
}
634-
return rc;
635-
}
636-
}
637-
638594
/* A pipe is used to communicate between the parent and child to
639595
indicate whether the exec ultimately succeeded or failed. The
640596
child sets the pipe to be close-on-exec; the child only ever
@@ -668,16 +624,16 @@ static int odls_alps_fork_local_proc(orte_app_context_t* context,
668624
}
669625

670626
if (pid == 0) {
671-
close(p[0]);
627+
close(p[0]);
672628
#if HAVE_SETPGID
673629
setpgid(0, 0);
674630
#endif
675-
do_child(context, child, environ_copy, jobdat, p[1], opts);
631+
do_child(child, app, argv, environ_copy, jobdat, p[1], opts);
676632
/* Does not return */
677633
}
678634

679635
close(p[1]);
680-
return do_parent(context, child, environ_copy, jobdat, p[0], opts);
636+
return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts);
681637
}
682638

683639

0 commit comments

Comments
 (0)