Skip to content

Commit f3d4bc1

Browse files
committed
ompi/proc: add function to get all allocated procs
This commit adds two new functions: - ompi_proc_get_allocated - Returns all procs in the current job that have already been allocated. This is used in init/finalize to determine which procs to pass to add_procs/del_procs. - ompi_proc_world_size - returns the number of processes in MPI_COMM_WORLD. This may be removed in favor of callers just looking at ompi_process_info. The behavior of ompi_proc_world has been restored to return ompi_proc_t's for all processes in the current job. The use of this function is discouraged. Code that was using ompi_proc_world() has been updated to make use of the new functions to avoid the memory overhead of ompi_comm_world (). Signed-off-by: Nathan Hjelm <[email protected]> (cherry picked from open-mpi/ompi@2c89c7f)
1 parent bc889b4 commit f3d4bc1

File tree

5 files changed

+108
-20
lines changed

5 files changed

+108
-20
lines changed

ompi/mca/mtl/mxm/mtl_mxm.c

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -304,27 +304,30 @@ int ompi_mtl_mxm_module_init(void)
304304
}
305305
#endif
306306

307-
if (NULL == (procs = ompi_proc_world(&totps))) {
308-
MXM_ERROR("Unable to obtain process list");
309-
return OMPI_ERROR;
310-
}
307+
totps = ompi_proc_world_size ();
311308

312309
if (totps < (size_t)ompi_mtl_mxm.mxm_np) {
313310
MXM_VERBOSE(1, "MXM support will be disabled because of total number "
314311
"of processes (%lu) is less than the minimum set by the "
315312
"mtl_mxm_np MCA parameter (%u)", totps, ompi_mtl_mxm.mxm_np);
316-
free(procs);
317313
return OMPI_ERR_NOT_SUPPORTED;
318314
}
319315
MXM_VERBOSE(1, "MXM support enabled");
320316

321317
if (ORTE_NODE_RANK_INVALID == (lr = ompi_process_info.my_node_rank)) {
322318
MXM_ERROR("Unable to obtain local node rank");
323-
free(procs);
324319
return OMPI_ERROR;
325320
}
326321
nlps = ompi_process_info.num_local_peers + 1;
327322

323+
/* local procs are always allocated. if that ever changes this will need to
324+
* be modified. */
325+
procs = ompi_proc_get_allocated (&totps);
326+
if (NULL == procs) {
327+
MXM_ERROR("Unable to obtain process list");
328+
return OMPI_ERROR;
329+
}
330+
328331
for (proc = 0; proc < totps; proc++) {
329332
if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->super.proc_flags)) {
330333
mxlr = max(mxlr, procs[proc]->super.proc_name.vpid);
@@ -595,14 +598,8 @@ int ompi_mtl_mxm_del_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs,
595598
size_t i;
596599

597600
#if MXM_API >= MXM_VERSION(3,1)
598-
if (ompi_mtl_mxm.bulk_disconnect) {
599-
size_t nprocs_world;
600-
ompi_proc_t **procs;
601-
procs = ompi_proc_world(&nprocs_world);
602-
if (nprocs == nprocs_world) {
603-
mxm_ep_powerdown(ompi_mtl_mxm.ep);
604-
}
605-
free(procs);
601+
if (ompi_mtl_mxm.bulk_disconnect && nprocs == ompi_proc_world_size ()) {
602+
mxm_ep_powerdown(ompi_mtl_mxm.ep);
606603
}
607604
#endif
608605

ompi/proc/proc.c

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,12 @@ int ompi_proc_finalize (void)
398398
return OMPI_SUCCESS;
399399
}
400400

401-
ompi_proc_t** ompi_proc_world(size_t *size)
401+
int ompi_proc_world_size (void)
402+
{
403+
return ompi_process_info.num_procs;
404+
}
405+
406+
ompi_proc_t **ompi_proc_get_allocated (size_t *size)
402407
{
403408
ompi_proc_t **procs;
404409
ompi_proc_t *proc;
@@ -455,6 +460,55 @@ ompi_proc_t** ompi_proc_world(size_t *size)
455460
return procs;
456461
}
457462

463+
ompi_proc_t **ompi_proc_world (size_t *size)
464+
{
465+
ompi_proc_t **procs;
466+
ompi_proc_t *proc;
467+
size_t count = 0;
468+
ompi_rte_cmp_bitmask_t mask;
469+
ompi_process_name_t my_name;
470+
471+
/* check bozo case */
472+
if (NULL == ompi_proc_local_proc) {
473+
return NULL;
474+
}
475+
476+
/* First count how many match this jobid (we already know this from our process info) */
477+
count = ompi_process_info.num_procs;
478+
479+
/* allocate an array */
480+
procs = (ompi_proc_t **) malloc (count * sizeof(ompi_proc_t*));
481+
if (NULL == procs) {
482+
return NULL;
483+
}
484+
485+
/* now get/allocate all the procs in this jobid */
486+
for (int i = 0 ; i < count ; ++i) {
487+
opal_process_name_t name = {.jobid = OMPI_CAST_RTE_NAME(&ompi_proc_local_proc->super.proc_name)->jobid,
488+
.vpid = i};
489+
490+
/* DO NOT RETAIN THIS OBJECT - the reference count on this
491+
* object will be adjusted by external callers. The intent
492+
* here is to allow the reference count to drop to zero if
493+
* the app no longer desires to communicate with this proc.
494+
* For example, the proc may call comm_disconnect on all
495+
* communicators involving this proc. In such cases, we want
496+
* the proc object to be removed from the list. By not incrementing
497+
* the reference count here, we allow this to occur.
498+
*
499+
* We don't implement that yet, but we are still safe for now as
500+
* the OBJ_NEW in ompi_proc_init owns the initial reference
501+
* count which cannot be released until ompi_proc_finalize is
502+
* called.
503+
*/
504+
procs[i] = ompi_proc_for_name (name);
505+
}
506+
507+
*size = count;
508+
509+
return procs;
510+
}
511+
458512

459513
ompi_proc_t** ompi_proc_all(size_t* size)
460514
{

ompi/proc/proc.h

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,10 @@ OMPI_DECLSPEC int ompi_proc_finalize(void);
138138
* Returns the list of proc instances associated with this job. Given
139139
* the current association between a job and an MPI_COMM_WORLD, this
140140
* function provides the process instances for the current
141-
* MPI_COMM_WORLD.
141+
* MPI_COMM_WORLD. Use this function only if absolutely needed as it
142+
* will cause ompi_proc_t objects to be allocated for every process in
143+
* the job. If you only need the allocated ompi_proc_t objects call
144+
* ompi_proc_get_allocated() instead.
142145
*
143146
* @note The reference count of each process in the array is
144147
* NOT incremented - the caller is responsible for ensuring the
@@ -152,6 +155,36 @@ OMPI_DECLSPEC int ompi_proc_finalize(void);
152155
*/
153156
OMPI_DECLSPEC ompi_proc_t** ompi_proc_world(size_t* size);
154157

158+
/**
159+
* Returns the number of processes in the associated with this job.
160+
*
161+
* Returns the list of proc instances associated with this job. Given
162+
* the current association between a job and an MPI_COMM_WORLD, this
163+
* function provides the number of processes for the current
164+
* MPI_COMM_WORLD.
165+
*/
166+
167+
OMPI_DECLSPEC int ompi_proc_world_size (void);
168+
169+
/**
170+
* Returns the list of proc instances associated with this job.
171+
*
172+
* Returns the list of proc instances associated with this job that have
173+
* already been allocated. Given the current association between a job
174+
* and an MPI_COMM_WORLD, this function provides the allocated process
175+
* instances for the current MPI_COMM_WORLD.
176+
*
177+
* @note The reference count of each process in the array is
178+
* NOT incremented - the caller is responsible for ensuring the
179+
* correctness of the reference count once they are done with
180+
* the array.
181+
*
182+
* @param[in] size Number of processes in the ompi_proc_t array
183+
*
184+
* @return Array of pointers to allocated proc instances in the current
185+
* MPI_COMM_WORLD, or NULL if there is an internal failure.
186+
*/
187+
OMPI_DECLSPEC ompi_proc_t **ompi_proc_get_allocated (size_t *size);
155188

156189
/**
157190
* Returns the list of all known proc instances.

ompi/runtime/ompi_mpi_finalize.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,8 +284,11 @@ int ompi_mpi_finalize(void)
284284
return ret;
285285
}
286286

287+
/* call del_procs on all allocated procs even though some may not be known
288+
* to the pml layer. the pml layer is expected to be resilient and ignore
289+
* any unknown procs. */
287290
nprocs = 0;
288-
procs = ompi_proc_world(&nprocs);
291+
procs = ompi_proc_get_allocated (&nprocs);
289292
MCA_PML_CALL(del_procs(procs, nprocs));
290293
free(procs);
291294

ompi/runtime/ompi_mpi_init.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -742,9 +742,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
742742
goto error;
743743
}
744744

745-
/* add all ompi_proc_t's to PML */
746-
if (NULL == (procs = ompi_proc_world(&nprocs))) {
747-
error = "ompi_proc_world() failed";
745+
/* add all allocated ompi_proc_t's to PML (below the add_procs limit this
746+
* behaves identically to ompi_proc_world ()) */
747+
if (NULL == (procs = ompi_proc_get_allocated (&nprocs))) {
748+
error = "ompi_proc_get_allocated () failed";
748749
goto error;
749750
}
750751
ret = MCA_PML_CALL(add_procs(procs, nprocs));

0 commit comments

Comments
 (0)