diff --git a/ompi/proc/proc.c b/ompi/proc/proc.c index 961e8c5f9b9..5b712bf25e1 100644 --- a/ompi/proc/proc.c +++ b/ompi/proc/proc.c @@ -116,6 +116,8 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t opal_hash_table_set_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name), proc); + /* by default we consider process to be remote */ + proc->super.proc_flags = OPAL_PROC_NON_LOCAL; *procp = proc; return OMPI_SUCCESS; @@ -133,26 +135,14 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t */ int ompi_proc_complete_init_single (ompi_proc_t *proc) { - uint16_t u16, *u16ptr; int ret; - u16ptr = &u16; - if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) && (OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) { /* nothing else to do */ return OMPI_SUCCESS; } - /* get the locality information - all RTEs are required - * to provide this information at startup */ - OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16); - if (OPAL_SUCCESS != ret) { - proc->super.proc_flags = OPAL_PROC_NON_LOCAL; - } else { - proc->super.proc_flags = u16; - } - /* we can retrieve the hostname at no cost because it * was provided at startup - but make it optional so * we don't chase after it if some system doesn't @@ -287,20 +277,6 @@ int ompi_proc_init(void) } #endif - if (ompi_process_info.num_procs < ompi_add_procs_cutoff) { - /* create proc structures and find self */ - for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) { - if (i == OMPI_PROC_MY_NAME->vpid) { - continue; - } - - ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, i, &proc); - if (OMPI_SUCCESS != ret) { - return ret; - } - } - } - return OMPI_SUCCESS; } @@ -329,11 +305,44 @@ static int ompi_proc_compare_vid (opal_list_item_t **a, opal_list_item_t **b) */ int ompi_proc_complete_init(void) { + opal_process_name_t 
wildcard_rank; ompi_proc_t *proc; int ret, errcode = OMPI_SUCCESS; + char *val; opal_mutex_lock (&ompi_proc_lock); + /* Add all local peers first */ + wildcard_rank.jobid = OMPI_PROC_MY_NAME->jobid; + wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid; + /* retrieve the local peers */ + OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS, + &wildcard_rank, &val, OPAL_STRING); + if (OPAL_SUCCESS == ret && NULL != val) { + char **peers = opal_argv_split(val, ','); + int i; + free(val); + for (i=0; NULL != peers[i]; i++) { + ompi_vpid_t local_rank = strtoul(peers[i], NULL, 10); + uint16_t u16, *u16ptr = &u16; + if (OMPI_PROC_MY_NAME->vpid == local_rank) { + continue; + } + ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, local_rank, &proc); + if (OMPI_SUCCESS != ret) { + return ret; + } + /* get the locality information - all RTEs are required + * to provide this information at startup */ + OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16); + if (OPAL_SUCCESS == ret) { + proc->super.proc_flags = u16; + } + } + opal_argv_free(peers); + } + + /* Complete initialization of node-local procs */ OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) { ret = ompi_proc_complete_init_single (proc); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -341,35 +350,32 @@ int ompi_proc_complete_init(void) break; } } - opal_mutex_unlock (&ompi_proc_lock); - if (ompi_process_info.num_procs >= ompi_add_procs_cutoff) { - char *val = NULL; - opal_process_name_t wildcard_rank; - wildcard_rank.jobid = OMPI_PROC_MY_NAME->jobid; - wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid; - /* retrieve the local peers */ - OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS, - &wildcard_rank, &val, OPAL_STRING); - if (OPAL_SUCCESS == ret && NULL != val) { - char **peers = opal_argv_split(val, ','); - int i; - free(val); - for (i=0; NULL != peers[i]; i++) { - ompi_vpid_t local_rank = strtoul(peers[i], NULL, 10); - opal_process_name_t proc_name = {.vpid = local_rank, 
.jobid = OMPI_PROC_MY_NAME->jobid}; - - if (OMPI_PROC_MY_NAME->vpid == local_rank) { - continue; - } - (void) ompi_proc_for_name (proc_name); - } - opal_argv_free(peers); + /* if cutoff is larger than # of procs - add all processes + * NOTE that local procs will be automatically skipped as they + * are already in the hash table + */ + if (ompi_process_info.num_procs < ompi_add_procs_cutoff) { + /* since ompi_proc_for_name locks internally - + * we need to release the lock here + */ + opal_mutex_unlock (&ompi_proc_lock); + + for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) { + opal_process_name_t proc_name; + proc_name.jobid = OMPI_PROC_MY_NAME->jobid; + proc_name.vpid = i; + (void) ompi_proc_for_name (proc_name); } + + /* re-acquire the lock for the next step - sort */ + opal_mutex_lock (&ompi_proc_lock); } opal_list_sort (&ompi_proc_list, ompi_proc_compare_vid); + opal_mutex_unlock (&ompi_proc_lock); + return errcode; }