Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 55 additions & 49 deletions ompi/proc/proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
opal_hash_table_set_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name),
proc);

/* by default we consider the process to be remote */
proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
*procp = proc;

return OMPI_SUCCESS;
Expand All @@ -133,26 +135,14 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
*/
int ompi_proc_complete_init_single (ompi_proc_t *proc)
{
uint16_t u16, *u16ptr;
int ret;

u16ptr = &u16;

if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) &&
(OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) {
/* nothing else to do */
return OMPI_SUCCESS;
}

/* get the locality information - all RTEs are required
* to provide this information at startup */
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
if (OPAL_SUCCESS != ret) {
proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
} else {
proc->super.proc_flags = u16;
}

/* we can retrieve the hostname at no cost because it
* was provided at startup - but make it optional so
* we don't chase after it if some system doesn't
Expand Down Expand Up @@ -287,20 +277,6 @@ int ompi_proc_init(void)
}
#endif

if (ompi_process_info.num_procs < ompi_add_procs_cutoff) {
/* create proc structures and find self */
for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) {
if (i == OMPI_PROC_MY_NAME->vpid) {
continue;
}

ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, i, &proc);
if (OMPI_SUCCESS != ret) {
return ret;
}
}
}

return OMPI_SUCCESS;
}

Expand Down Expand Up @@ -329,47 +305,77 @@ static int ompi_proc_compare_vid (opal_list_item_t **a, opal_list_item_t **b)
*/
int ompi_proc_complete_init(void)
{
opal_process_name_t wildcard_rank;
ompi_proc_t *proc;
int ret, errcode = OMPI_SUCCESS;
char *val;

opal_mutex_lock (&ompi_proc_lock);

/* Add all local peers first */
wildcard_rank.jobid = OMPI_PROC_MY_NAME->jobid;
wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
/* retrieve the local peers */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
&wildcard_rank, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
char **peers = opal_argv_split(val, ',');
int i;
free(val);
for (i=0; NULL != peers[i]; i++) {
ompi_vpid_t local_rank = strtoul(peers[i], NULL, 10);
uint16_t u16, *u16ptr = &u16;
if (OMPI_PROC_MY_NAME->vpid == local_rank) {
continue;
}
ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, local_rank, &proc);
if (OMPI_SUCCESS != ret) {
return ret;
}
/* get the locality information - all RTEs are required
* to provide this information at startup */
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
if (OPAL_SUCCESS == ret) {
proc->super.proc_flags = u16;
}
}
opal_argv_free(peers);
}

/* Complete initialization of node-local procs */
OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) {
ret = ompi_proc_complete_init_single (proc);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
errcode = ret;
break;
}
}
opal_mutex_unlock (&ompi_proc_lock);

if (ompi_process_info.num_procs >= ompi_add_procs_cutoff) {
char *val = NULL;
opal_process_name_t wildcard_rank;
wildcard_rank.jobid = OMPI_PROC_MY_NAME->jobid;
wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
/* retrieve the local peers */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
&wildcard_rank, &val, OPAL_STRING);
if (OPAL_SUCCESS == ret && NULL != val) {
char **peers = opal_argv_split(val, ',');
int i;
free(val);
for (i=0; NULL != peers[i]; i++) {
ompi_vpid_t local_rank = strtoul(peers[i], NULL, 10);
opal_process_name_t proc_name = {.vpid = local_rank, .jobid = OMPI_PROC_MY_NAME->jobid};

if (OMPI_PROC_MY_NAME->vpid == local_rank) {
continue;
}
(void) ompi_proc_for_name (proc_name);
}
opal_argv_free(peers);
/* if cutoff is larger than # of procs - add all processes
* NOTE that local procs will be automatically skipped as they
* are already in the hash table
*/
if (ompi_process_info.num_procs < ompi_add_procs_cutoff) {
/* since ompi_proc_for_name locks internally,
* we need to release the lock here
*/
opal_mutex_unlock (&ompi_proc_lock);

for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) {
opal_process_name_t proc_name;
proc_name.jobid = OMPI_PROC_MY_NAME->jobid;
proc_name.vpid = i;
(void) ompi_proc_for_name (proc_name);
}

/* re-acquire the lock for the next step - sort */
opal_mutex_lock (&ompi_proc_lock);
}

opal_list_sort (&ompi_proc_list, ompi_proc_compare_vid);

opal_mutex_unlock (&ompi_proc_lock);

return errcode;
}

Expand Down