Skip to content

Commit a068d17

Browse files
committed
ompi: Avoid unnecessary PMIx lookups when adding procs (step 2).
Follow-up for 717f3fe. (cherry-ported from 1f7a3a2) Signed-off-by: Artem Polyakov <[email protected]>
1 parent 5324cff commit a068d17

File tree

1 file changed

+50
-46
lines changed

1 file changed

+50
-46
lines changed

ompi/proc/proc.c

Lines changed: 50 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
115115
opal_hash_table_set_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name),
116116
proc);
117117

118+
/* by default we consider the process to be remote */
119+
proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
118120
*procp = proc;
119121

120122
return OMPI_SUCCESS;
@@ -132,26 +134,14 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
132134
*/
133135
int ompi_proc_complete_init_single (ompi_proc_t *proc)
134136
{
135-
uint16_t u16, *u16ptr;
136137
int ret;
137138

138-
u16ptr = &u16;
139-
140139
if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) &&
141140
(OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) {
142141
/* nothing else to do */
143142
return OMPI_SUCCESS;
144143
}
145144

146-
/* get the locality information - all RTEs are required
147-
* to provide this information at startup */
148-
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
149-
if (OPAL_SUCCESS != ret) {
150-
proc->super.proc_flags = OPAL_PROC_NON_LOCAL;
151-
} else {
152-
proc->super.proc_flags = u16;
153-
}
154-
155145
/* we can retrieve the hostname at no cost because it
156146
* was provided at startup - but make it optional so
157147
* we don't chase after it if some system doesn't
@@ -286,20 +276,6 @@ int ompi_proc_init(void)
286276
}
287277
#endif
288278

289-
if (ompi_process_info.num_procs < ompi_add_procs_cutoff) {
290-
/* create proc structures and find self */
291-
for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) {
292-
if (i == OMPI_PROC_MY_NAME->vpid) {
293-
continue;
294-
}
295-
296-
ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, i, &proc);
297-
if (OMPI_SUCCESS != ret) {
298-
return ret;
299-
}
300-
}
301-
}
302-
303279
return OMPI_SUCCESS;
304280
}
305281

@@ -330,41 +306,69 @@ int ompi_proc_complete_init(void)
330306
{
331307
ompi_proc_t *proc;
332308
int ret, errcode = OMPI_SUCCESS;
309+
char *val;
333310

334311
opal_mutex_lock (&ompi_proc_lock);
335312

313+
/* Add all local peers first */
314+
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
315+
ORTE_PROC_MY_NAME, &val, OPAL_STRING);
316+
if (OPAL_SUCCESS == ret && NULL != val) {
317+
char **peers = opal_argv_split(val, ',');
318+
int i;
319+
free(val);
320+
for (i=0; NULL != peers[i]; i++) {
321+
ompi_vpid_t local_rank = strtoul(peers[i], NULL, 10);
322+
uint16_t u16, *u16ptr = &u16;
323+
if (OMPI_PROC_MY_NAME->vpid == local_rank) {
324+
continue;
325+
}
326+
ret = ompi_proc_allocate (OMPI_PROC_MY_NAME->jobid, local_rank, &proc);
327+
if (OMPI_SUCCESS != ret) {
328+
return ret;
329+
}
330+
/* get the locality information - all RTEs are required
331+
* to provide this information at startup */
332+
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY, &proc->super.proc_name, &u16ptr, OPAL_UINT16);
333+
if (OPAL_SUCCESS == ret) {
334+
proc->super.proc_flags = u16;
335+
}
336+
}
337+
opal_argv_free(peers);
338+
}
339+
340+
/* Complete initialization of node-local procs */
336341
OPAL_LIST_FOREACH(proc, &ompi_proc_list, ompi_proc_t) {
337342
ret = ompi_proc_complete_init_single (proc);
338343
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
339344
errcode = ret;
340345
break;
341346
}
342347
}
343-
opal_mutex_unlock (&ompi_proc_lock);
344348

345-
if (ompi_process_info.num_procs >= ompi_add_procs_cutoff) {
346-
char *val;
347-
/* retrieve the local peers */
348-
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
349-
ORTE_PROC_MY_NAME, &val, OPAL_STRING);
350-
if (OPAL_SUCCESS == ret && NULL != val) {
351-
char **peers = opal_argv_split(val, ',');
352-
int i;
353-
free(val);
354-
for (i=0; NULL != peers[i]; i++) {
355-
ompi_vpid_t local_rank = strtoul(peers[i], NULL, 10);
356-
opal_process_name_t proc_name = {.vpid = local_rank, .jobid = OMPI_PROC_MY_NAME->jobid};
357-
358-
if (OMPI_PROC_MY_NAME->vpid == local_rank) {
359-
continue;
360-
}
361-
(void) ompi_proc_for_name (proc_name);
362-
}
363-
opal_argv_free(peers);
349+
/* if cutoff is larger than # of procs - add all processes
350+
* NOTE that local procs will be automatically skipped as they
351+
* are already in the hash table
352+
*/
353+
if (ompi_process_info.num_procs < ompi_add_procs_cutoff) {
354+
/* since ompi_proc_for_name locks internally -
355+
* we need to release lock here
356+
*/
357+
opal_mutex_unlock (&ompi_proc_lock);
358+
359+
for (ompi_vpid_t i = 0 ; i < ompi_process_info.num_procs ; ++i ) {
360+
opal_process_name_t proc_name;
361+
proc_name.jobid = OMPI_PROC_MY_NAME->jobid;
362+
proc_name.vpid = i;
363+
(void) ompi_proc_for_name (proc_name);
364364
}
365+
366+
/* re-acquire the lock for the next step - sort */
367+
opal_mutex_lock (&ompi_proc_lock);
365368
}
366369

367370
opal_list_sort (&ompi_proc_list, ompi_proc_compare_vid);
371+
opal_mutex_unlock (&ompi_proc_lock);
368372

369373
return errcode;
370374
}

0 commit comments

Comments
 (0)