Skip to content

Commit 0cb684d

Browse files
committed
Merge remote-tracking branch 'upstream/main' into oshmem_base_exchange
2 parents 0f301a7 + ec08767 commit 0cb684d

File tree

29 files changed

+855
-126
lines changed

29 files changed

+855
-126
lines changed

.ci/community-jenkins/Jenkinsfile

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,11 @@ milestone(buildNumber)
3535
// back to the PR. The "Details" link at the bottom of the GitHub PR page brings
3636
// you to the Jenkins Build page, so we're adding the link back to the GitHub PR
3737
// page.
38-
currentBuild.description = "This is a build of <a href=\"${CHANGE_URL}\"}\">Open MPI PR #${CHANGE_ID}</a>"
38+
if (env.CHANGE_URL) {
39+
currentBuild.description = "This is a build of <a href=\"${CHANGE_URL}\"}\">Open MPI PR #${CHANGE_ID}</a>"
40+
} else {
41+
currentBuild.description = "Build of ${BRANCH_NAME}"
42+
}
3943

4044
check_stages = prepare_check_stages()
4145
println("Initialized Pipeline")
@@ -53,8 +57,8 @@ println('Tests Completed')
5357
// build stage is a map of different configurations to test.
5458
def prepare_check_stages() {
5559
def configure_options = ["--disable-dlopen", "--disable-oshmem", "--enable-builtin-atomic", "--enable-ipv6"]
56-
def compilers = ["clang10", "gcc5", "gcc6", "gcc7", "gcc8", "gcc9", "gcc10"]
57-
def platforms = ["amazon_linux_2", "amazon_linux_2-arm64", "rhel8", "ubuntu_18.04"]
60+
def compilers = ["clang10", "gcc7", "gcc8", "gcc9", "gcc10"]
61+
def platforms = ["amazon_linux_2", "amazon_linux_2-arm64", "rhel8"]
5862
def check_stages_list = []
5963

6064
// Build everything stage

docs/man-openmpi/man1/mpirun.1.rst

Lines changed: 43 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ To map processes:
262262

263263
* ``--bynode``: Launch processes one per node, cycling by node in a
264264
round-robin fashion. This spreads processes evenly among nodes and
265-
assigns MPI_COMM_WORLD ranks in a round-robin, "by node" manner.
265+
assigns ``MPI_COMM_WORLD`` ranks in a round-robin, "by node" manner.
266266
(deprecated in favor of ``--map-by node``)
267267

268268
* ``--cpu-list <cpus>``: Comma-delimited list of processor IDs to
@@ -272,7 +272,7 @@ To map processes:
272272
.. note:: You can run the hwloc ``lstopo(1)`` command to see a
273273
list of available cores and their logical IDs.
274274

275-
To order processes' ranks in MPI_COMM_WORLD:
275+
To order processes' ranks in ``MPI_COMM_WORLD``:
276276

277277
* ``--rank-by <mode>``: Rank in round-robin fashion according to the
278278
specified mode, defaults to slot. Supported options include
@@ -311,15 +311,15 @@ To manage standard I/O:
311311
specified filename. Any directories in the filename will
312312
automatically be created. Each output file will consist of
313313
``filename.id``, where the ``id`` will be the processes' rank in
314-
MPI_COMM_WORLD, left-filled with zero's for correct ordering in
314+
``MPI_COMM_WORLD``, left-filled with zeros for correct ordering in
315315
listings. A relative path value will be converted to an absolute
316316
path based on the cwd where mpirun is executed. Note that this will
317317
not work on environments where the file system on compute nodes
318318
differs from that where :ref:`mpirun(1) <man1-mpirun>` is
319319
executed.
320320

321-
* ``--stdin <rank>``: The MPI_COMM_WORLD rank of the process that is
322-
to receive stdin. The default is to forward stdin to MPI_COMM_WORLD
321+
* ``--stdin <rank>``: The ``MPI_COMM_WORLD`` rank of the process that is
322+
to receive stdin. The default is to forward stdin to ``MPI_COMM_WORLD``
323323
rank 0, but this option can be used to forward stdin to any
324324
process. It is also acceptable to specify none, indicating that no
325325
processes are to receive stdin.
@@ -329,7 +329,7 @@ To manage standard I/O:
329329

330330
* ``--tag-output``: Tag each line of output to stdout, stderr, and
331331
stddiag with ``[jobid, MCW_rank]<stdxxx>`` indicating the process
332-
jobid and MPI_COMM_WORLD rank of the process that generated the
332+
jobid and ``MPI_COMM_WORLD`` rank of the process that generated the
333333
output, and the channel which generated it.
334334

335335
* ``--timestamp-output``: Timestamp each line of output to stdout,
@@ -342,7 +342,7 @@ To manage standard I/O:
342342
specified file.
343343

344344
* ``--xterm <ranks>``: Display the output from the processes
345-
identified by their MPI_COMM_WORLD ranks in separate xterm
345+
identified by their ``MPI_COMM_WORLD`` ranks in separate xterm
346346
windows. The ranks are specified as a comma-separated list of
347347
ranges, with a ``-1`` indicating all. A separate window will be created
348348
for each specified process.
@@ -422,7 +422,22 @@ Setting MCA parameters:
422422

423423
* ``--mca <key> <value>``: Send arguments to various MCA modules. See
424424
the :ref:`Setting MCA Parameters
425-
<man1-mpirun-setting-mca-parameters>` section for mode details.
425+
<man1-mpirun-setting-mca-parameters>` section for more details.
426+
427+
.. note:: Open MPI will attempt to discern PMIx and PRRTE MCA
428+
parameters passed via ``--mca`` and handle them
429+
appropriately, but it may not always guess correctly. It
430+
is best to use ``--pmixmca`` and ``--prtemca`` when
431+
passing MCA parameters to PMIx and PRRTE, respectively.
432+
433+
* ``--pmixmca <key> <value>``: Send arguments to MCA modules in the
434+
PMIx subsystem. See the :ref:`Setting MCA Parameters
435+
<man1-mpirun-setting-mca-parameters>` section for more details.
436+
437+
* ``--prtemca <key> <value>``: Send arguments to MCA modules in the
438+
PMIx Reference Runtime Environment (PRRTE) subsystem. See the
439+
:ref:`Setting MCA Parameters <man1-mpirun-setting-mca-parameters>`
440+
section for more details.
426441

427442
* ``--tune <tune_file>``: Specify a tune file to set arguments for
428443
various MCA modules and environment variables. See the :ref:`
@@ -532,11 +547,11 @@ generally useful to most Open MPI users:
532547
just before launch.
533548

534549
* ``--launch-agent``: Name of the executable that is to be used to
535-
start processes on the remote nodes. The default is ``PRRTEd``. This
550+
start processes on the remote nodes. The default is ``prted``. This
536551
option can be used to test new daemon concepts, or to pass options
537552
back to the daemons without having mpirun itself see them. For
538-
example, specifying a launch agent of ``PRRTEd -mca odls_base_verbose
539-
5`` allows the developer to ask the ``PRRTEd`` for debugging output
553+
example, specifying a launch agent of ``prted --prtemca odls_base_verbose
554+
5`` allows the developer to ask the ``prted`` for debugging output
540555
without clutter from ``mpirun`` itself.
541556

542557
* ``--report-state-on-timeout``: When paired with the ``--timeout``
@@ -716,7 +731,7 @@ options that describe mapping policies.
716731

717732
Consider the same hostfile as above, again with ``-n 6``. The table
718733
below lists a few ``mpirun`` variations, and shows which
719-
MPI_COMM_WORLD ranks end up on which node:
734+
``MPI_COMM_WORLD`` ranks end up on which node:
720735

721736
.. list-table::
722737
:header-rows: 1
@@ -832,7 +847,7 @@ Open MPI employs a three-phase procedure for assigning process locations
832847
and ranks:
833848

834849
#. **Mapping**: Assigns a default location to each process
835-
#. **Ranking**: Assigns an MPI_COMM_WORLD rank value to each process
850+
#. **Ranking**: Assigns an ``MPI_COMM_WORLD`` rank value to each process
836851
#. **Binding**: Constrains each process to run on specific processors
837852

838853
The mapping step is used to assign a default location to each process
@@ -864,7 +879,7 @@ gives you detailed control over process binding as well. Rankfiles
864879
are discussed :ref:`below <man1-mpirun-rankfiles>`.
865880

866881
The second phase focuses on the ranking of the process within the
867-
job's MPI_COMM_WORLD. Open MPI separates this from the mapping
882+
job's ``MPI_COMM_WORLD``. Open MPI separates this from the mapping
868883
procedure to allow more flexibility in the relative placement of MPI
869884
processes. This is best illustrated by considering the following
870885
cases where we used the ``--np 8 --map-by ppr:2:package --host aa:4,bb:4`` option:
@@ -1013,10 +1028,12 @@ MCA parameters can be set not only on the mpirun command line, but
10131028
alternatively in a system or user ``mca-params.conf`` file or as
10141029
environment variables, as described in the :ref:`Setting MCA
10151030
Parameters <man1-mpirun-setting-mca-parameters>`. These are MCA parameters for
1016-
the PRRTE runtime so the command line argument ``--PRRTEmca`` must be used to
1017-
pass the MCA parameter key/value pair. Alternatively, the MCA parameter key/
1018-
value pair may be specific on the command line by prefixing the key with
1019-
``PRRTE_MCA_``. Some examples include:
1031+
the PRRTE runtime so the command line argument ``--prtemca``
1032+
(yes, ``prte`` with a single ``r``, not two ``r``'s) must be used to
1033+
pass the MCA parameter key/value pair. Alternatively, the MCA parameter
1034+
key/value pair may be specified on the command line by prefixing the key with
1035+
``PRTE_MCA_`` (again, that is not a typo: ``PRTE`` not ``PRRTE``).
1036+
Some examples include:
10201037

10211038
.. list-table::
10221039
:header-rows: 1
@@ -1166,7 +1183,7 @@ Rankfiles are text files that specify detailed information about how
11661183
individual processes should be mapped to nodes, and to which
11671184
processor(s) they should be bound. Each line of a rankfile specifies
11681185
the location of one process (for MPI jobs, the process' "rank" refers
1169-
to its rank in MPI_COMM_WORLD). The general form of each line in the
1186+
to its rank in ``MPI_COMM_WORLD``). The general form of each line in the
11701187
rankfile is:
11711188

11721189
.. code::
@@ -1299,11 +1316,11 @@ Standard I/O
12991316
retained, or removed?
13001317

13011318
Open MPI directs UNIX standard input to ``/dev/null`` on all processes
1302-
except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0
1319+
except the ``MPI_COMM_WORLD`` rank 0 process. The ``MPI_COMM_WORLD`` rank 0
13031320
process inherits standard input from ``mpirun``.
13041321

13051322
.. note:: The node that invoked ``mpirun`` need not be the same as the
1306-
node where the MPI_COMM_WORLD rank 0 process resides. Open
1323+
node where the ``MPI_COMM_WORLD`` rank 0 process resides. Open
13071324
MPI handles the redirection of ``mpirun``'s standard input
13081325
to the rank 0 process.
13091326

@@ -1320,7 +1337,7 @@ example:
13201337
13211338
shell$ mpirun -n 2 my_app < my_input > my_output
13221339
1323-
Note that in this example only the MPI_COMM_WORLD rank 0 process will
1340+
Note that in this example only the ``MPI_COMM_WORLD`` rank 0 process will
13241341
receive the stream from ``my_input`` on stdin. The stdin on all the other
13251342
nodes will be tied to ``/dev/null``. However, the stdout from all nodes
13261343
will be collected into the ``my_output`` file.
@@ -1643,15 +1660,15 @@ that job are designated "secondary" jobs):
16431660

16441661
* If one or more processes in the primary job normally terminate with
16451662
non-zero exit status, ``mpirun`` returns the exit status of the
1646-
process with the lowest MPI_COMM_WORLD rank to have a non-zero
1663+
process with the lowest ``MPI_COMM_WORLD`` rank to have a non-zero
16471664
status.
16481665

16491666
* If all processes in the primary job normally terminate with exit
16501667
status 0, and one or more processes in a secondary job normally
16511668
terminate with non-zero exit status, ``mpirun``:
16521669

16531670
#. Returns the exit status of the process with the lowest
1654-
MPI_COMM_WORLD rank in the lowest jobid to have a non-zero
1671+
``MPI_COMM_WORLD`` rank in the lowest jobid to have a non-zero
16551672
status, and
16561673
#. Outputs a message summarizing the exit status of the primary and
16571674
all secondary jobs.
@@ -1662,7 +1679,7 @@ that job are designated "secondary" jobs):
16621679
summary print statement.
16631680

16641681
By default, the job will abort when any process terminates with
1665-
non-zero status. The MCA parameter ``--PRRTEmca state_base_error_non_zero_exit``
1682+
non-zero status. The MCA parameter ``--prtemca state_base_error_non_zero_exit``
16661683
can be set to "false" (or "0") to cause Open MPI to not abort a job if
16671684
one or more processes return a non-zero status. In that situation the
16681685
Open MPI records and notes that processes exited with non-zero
@@ -1705,7 +1722,7 @@ processes exited before calling :ref:`MPI_FINALIZE(3) <mpi_finalize>`.
17051722
If an internal error occurred in mpirun, the corresponding error code
17061723
is returned. In the event that one or more processes exit before
17071724
calling :ref:`MPI_FINALIZE(3) <mpi_finalize>`, the return value of
1708-
the MPI_COMM_WORLD rank of the process that mpirun first notices died
1725+
the ``MPI_COMM_WORLD`` rank of the process that mpirun first notices died
17091726
before calling :ref:`MPI_FINALIZE(3) <mpi_finalize>` will be
17101727
returned. Note that, in general, this will be the first process that
17111728
died but is not guaranteed to be so.

docs/release-notes/platform.rst

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,10 @@ that a release of Open MPI supports.
2727
* Systems that have been tested are:
2828

2929
* Linux (various flavors/distros), 64 bit (x86, ppc, aarch64),
30-
with gcc (>=4.8.x+), clang (>=3.6.0), Absoft (fortran), Intel,
30+
with gcc/gfortran (>=7.x+), clang (>=10.x), Intel,
3131
and Portland (be sure to also see :ref:`the Compiler Notes
3232
section <compiler-notes-section-label>`)
33-
* macOS (10.14-10.15, 11.x, 12.x), 64 bit (x86_64) with XCode
34-
compilers
33+
* macOS (14.x, 15.x), 64 bit (x86_64) with XCode compilers
3534

3635
* Other systems have been lightly (but not fully) tested:
3736

docs/tuning-apps/coll-tuned.rst

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ Dynamic Decisions and the Rules File
8282

8383
Given that the best choice of algorithm for a given collective depends on a
8484
number of factors only known at run time, and that some of these factors may
85-
vary with in a run, setting an algorithm on the command line often is an
85+
vary within a run, setting an algorithm on the command line often is an
8686
ineffective means of tuning. The rules file provides a means of choosing
8787
an algorithm at run time based on communicator and message size. The rules
8888
file can be specified on the command line, or the other usual ways to set MCA
@@ -104,13 +104,29 @@ Dynamic tuning files are organized in this format:
104104
.. code-block:: sh
105105
:linenos:
106106
107-
1 # Number of collectives
108-
1 # Collective ID
109-
1 # Number of comm sizes
110-
2 # Comm size
111-
2 # Number of message sizes
112-
0 1 0 0 # Message size 0, algorithm 1, topo and segmentation at 0
113-
1024 2 0 0 # Message size 1024, algorithm 2, topo and segmentation at 0
107+
rule-file-version-2
108+
1 # num of collectives
109+
3 # collective ID
110+
1 # number of comm sizes
111+
#=====================
112+
64 # comm size
113+
14 # number of rules
114+
# Bytes alg topo segs reqs
115+
#----------------------
116+
0 0 0 0 0
117+
512000 4 0 0 64
118+
1536000 4 0 0 64
119+
3072000 4 0 0 64
120+
6144000 4 0 0 64
121+
12288000 4 0 0 16
122+
24576000 4 0 0 16
123+
49152000 4 0 0 16
124+
98304000 4 0 0 16
125+
196608000 4 0 0 8
126+
393216000 4 0 0 8
127+
786432000 4 0 0 1
128+
1572864000 4 0 0 1
129+
2621440000 0 0 0 0
114130
115131
The rules file effectively defines, for one or more collectives, a function of
116132
two variables, which given communicator and message size, returns an algorithm
@@ -128,10 +144,20 @@ for details.
128144
One may provide rules for as many collectives, communicator sizes, and message
129145
sizes as desired. Simply repeat the sections as needed and adjust the relevant
130146
count parameters. One must always provide a rule for message size of zero.
131-
Message size rules are expected in ascending order. The last two parameters in
132-
the rule may or may not be used and have different meaning depending on the
133-
collective and algorithm. As of writing not all of the relevant control
134-
parameters can be set by the rules file (See issue #12589).
147+
Message size rules are expected in ascending order. The last parameters in the
148+
message size rule may or may not be used and have different meaning depending
149+
on the collective and algorithm. The first two parameters in the rule following
150+
the algorithm ID, `topo` and `segment size`, are always required. In version 2
151+
of the file format a third parameter, `max requests`, may also be provided. A
152+
release of Open MPI at least v5.0.7 is required for version 2 features.
153+
154+
The file format version specifier, `rule-file-version-N` where N is an integer
155+
greater than or equal to 1, should appear on the first line of the file. If the
156+
version specifier is not present, the file format is assumed to be version 1.
157+
Version 2 or greater is required to use the `max requests` parameter. Open MPI
158+
releases older than v5.0.7 do not support the file format version
159+
identifier. When using older releases of Open MPI do not include a version
160+
specifier and do not use the `max requests` parameter in message size rules.
135161

136162
.. _CollectivesAndAlgorithms:
137163

docs/tuning-apps/networking/cuda.rst

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,12 @@ be found, please also pass ``--with-cuda-libdir``. For example:
5353
``--with-cuda=<path-to-cuda> --with-cuda-libdir=/usr/local/cuda/lib64/stubs``.
5454

5555
Open MPI supports building with CUDA libraries and running on systems
56-
without CUDA libraries or hardware. In order to take advantage of
57-
this functionality, when compiling, you have to specify the CUDA
58-
dependent components to be built as DSOs using the
56+
without CUDA libraries or hardware.
57+
58+
For releases v5.0.2 and newer no special steps are required to get this behavior.
59+
60+
In order to realize this behavior for the v5.0.0 and v5.0.1 releases,
61+
when configuring Open MPI, you have to specify the CUDA dependent components to be built as DSOs using the
5962
``--enable-mca-dso=<comma-delimited-list-of-cuda-components>``
6063
configure option.
6164

ompi/communicator/comm.c

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1784,22 +1784,22 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
17841784
leader_procs[1] = tmp;
17851785
}
17861786

1787-
/* create a unique tag for allocating the leader communicator. we can eliminate this step
1788-
* if we take a CID from the newly allocated block belonging to local_comm. this is
1789-
* a note to make this change at a later time. */
1790-
opal_asprintf (&sub_tag, "%s-OMPIi-LC", tag);
1791-
if (OPAL_UNLIKELY(NULL == sub_tag)) {
1792-
ompi_comm_free (&local_comm);
1787+
leader_group = ompi_group_allocate_plist_w_procs (NULL, leader_procs, 2);
1788+
ompi_set_group_rank (leader_group, my_proc);
1789+
if (OPAL_UNLIKELY(NULL == leader_group)) {
17931790
free(leader_procs);
1791+
ompi_comm_free (&local_comm);
17941792
return OMPI_ERR_OUT_OF_RESOURCE;
17951793
}
17961794

1797-
leader_group = ompi_group_allocate_plist_w_procs (NULL, leader_procs, 2);
1798-
ompi_set_group_rank (leader_group, my_proc);
1799-
if (OPAL_UNLIKELY(NULL == leader_group)) {
1800-
free (sub_tag);
1795+
/* create a unique tag for allocating the leader communicator. we can eliminate this step
1796+
* if we take a CID from the newly allocated block belonging to local_comm. this is
1797+
* a note to make this change at a later time. */
1798+
opal_asprintf (&sub_tag, "%s-OMPIi-LC-%s", tag, OPAL_NAME_PRINT(ompi_group_get_proc_name (leader_group, 0)));
1799+
if (OPAL_UNLIKELY(NULL == sub_tag)) {
18011800
free(leader_procs);
18021801
ompi_comm_free (&local_comm);
1802+
OBJ_RELEASE(leader_group);
18031803
return OMPI_ERR_OUT_OF_RESOURCE;
18041804
}
18051805

@@ -1809,6 +1809,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
18091809
rc = ompi_comm_create_from_group (leader_group, sub_tag, info, errhandler, &leader_comm);
18101810
OBJ_RELEASE(leader_group);
18111811
free (sub_tag);
1812+
sub_tag = NULL;
18121813
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
18131814
free(leader_procs);
18141815
ompi_comm_free (&local_comm);
@@ -1864,7 +1865,16 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
18641865
return rc;
18651866
}
18661867

1867-
rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, NULL, false, OMPI_COMM_CID_GROUP_NEW);
1868+
/*
1869+
* append the pmix CONTEXT_ID obtained when creating the leader comm as discriminator
1870+
*/
1871+
opal_asprintf (&sub_tag, "%s-%ld", tag, data[1]);
1872+
if (OPAL_UNLIKELY(NULL == sub_tag)) {
1873+
return OMPI_ERR_OUT_OF_RESOURCE;
1874+
}
1875+
1876+
rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) sub_tag, NULL, false, OMPI_COMM_CID_GROUP_NEW);
1877+
free (sub_tag);
18681878
if ( OMPI_SUCCESS != rc ) {
18691879
OBJ_RELEASE(newcomp);
18701880
return rc;

ompi/communicator/comm_cid.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
317317
char *tag = NULL;
318318
size_t proc_count = 0, rproc_count = 0, tproc_count = 0, cid_base = 0UL, ninfo;
319319
int rc, leader_rank;
320-
pmix_proc_t *procs;
320+
pmix_proc_t *procs = NULL;
321321
void *grpinfo = NULL, *list = NULL;
322322
pmix_data_array_t darray;
323323

0 commit comments

Comments
 (0)