Skip to content

Commit b2d103c

Browse files
committed
UBCL: Add PML/UBCL and OSC/UBCL
Co-authored-by: Florent GERMAIN <[email protected]> Co-authored-by: Pierre LEMARINIER <[email protected]> Co-authored-by: Antoine CAPRA <[email protected]> Co-authored-by: Emmanuel BRELLE <[email protected]> Co-authored-by: Van Man NGUYEN <[email protected]> Co-authored-by: Julien DUPRAT <[email protected]> Co-authored-by: Tristan CALS <[email protected]> Co-authored-by: Anton DAUMEN <[email protected]> Co-authored-by: Alice CARIBONI <[email protected]> Co-authored-by: François WELLENREITER <[email protected]> Signed-off-by: Van Man NGUYEN <[email protected]>
1 parent 25ce8f9 commit b2d103c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+7769
-0
lines changed

config/ompi_check_ubcl.m4

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# -*- shell-script -*-
2+
#
3+
# Copyright (C) 2015-2017 Mellanox Technologies, Inc.
4+
# All rights reserved.
5+
# Copyright (c) 2015 Research Organization for Information Science
6+
# and Technology (RIST). All rights reserved.
7+
# Copyright (c) 2016 Los Alamos National Security, LLC. All rights
8+
# reserved.
9+
# Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
10+
# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
11+
# Copyright (c) 2024-2025 Bull S.A.S. All rights reserved.
12+
# $COPYRIGHT$
13+
#
14+
# Additional copyrights may follow
15+
#
16+
# $HEADER$
17+
#
18+
19+
# OMPI_CHECK_UBCL(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Check whether UBCL support was requested through --with-ubcl(=DIR).
# When DIR is given, -IDIR/include/ is appended to prefix_CPPFLAGS.
# Runs action-if-found if UBCL support is enabled, otherwise runs
# action-if-not-found.
AC_DEFUN([OMPI_CHECK_UBCL],[
    OPAL_VAR_SCOPE_PUSH([ompi_check_ubcl_dir ompi_check_ubcl_happy])

    m4_ifblank([$1], [m4_fatal([First argument to OMPI_CHECK_UBCL cannot be blank])])

    AC_ARG_WITH([ubcl],
                [AS_HELP_STRING([--with-ubcl(=DIR)],
                                [Build with UBCL support])])

    # UBCL is dlopen'd to avoid direct link to libubcl.so.
    # OAC_CHECK_PACKAGE would add this explicit link, so it cannot be used.
    # OPAL_CHECK_WITHDIR prints an error if the given path is invalid
    OPAL_CHECK_WITHDIR([ubcl], [$with_ubcl], [include/ubcl_api.h])

    # The comparisons use the portable "=" operator; "==" is an
    # extension that not every POSIX shell accepts in test.
    # A bare --with-ubcl leaves with_ubcl set to "yes", which is not a
    # directory: UBCL is enabled but no -I flag is derived from it
    # (the UBCL headers are then expected on the default include path).
    AS_IF([test "$with_ubcl" = "no"],
          [ompi_check_ubcl_happy="no"],

          [test -z "$with_ubcl"],
          [ompi_check_ubcl_happy="no"],

          [test "$with_ubcl" = "yes"],
          [ompi_check_ubcl_happy="yes"],

          [ompi_check_ubcl_happy="yes"
           $1_CPPFLAGS="${$1_CPPFLAGS} -I$with_ubcl/include/"
           AC_MSG_NOTICE([$1_CPPFLAGS is set to: ${$1_CPPFLAGS}])])


    OPAL_SUMMARY_ADD([Transports],[UBCL],[],[$ompi_check_ubcl_happy])

    AS_IF([test "$ompi_check_ubcl_happy" = "yes"],
          [$2],
          [$3])

    OPAL_VAR_SCOPE_POP
])
57+

ompi/mca/common/ubcl/Makefile.am

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# Copyright (c) 2025 Bull SAS. All rights reserved.
2+
# $COPYRIGHT$
3+
#
4+
# Additional copyrights may follow
5+
#
6+
# $HEADER$
7+
#
8+
9+
#AM_CPPFLAGS = $(common_ubcl_CPPFLAGS)

# Sources shared by the installable (DSO) and the noinst (static) variants
common_ubcl_sources = \
    common_ubcl.c \
    common_ubcl.h

lib_LTLIBRARIES =
noinst_LTLIBRARIES =

# Common component naming is forced by MCA_PROCESS_COMPONENT in config/opal_mca.m4
# to lib${<PROJECT>_LIB_NAME}mca_common_ubcl.la, but OMPI_LIB_NAME does not exist.
# The names below therefore rely on no other project having an empty library
# name, or on no other project providing a common ubcl component.
comp_inst = libmca_common_ubcl.la
comp_noinst = libmca_common_ubcl_noinst.la

if MCA_BUILD_ompi_common_ubcl_DSO
lib_LTLIBRARIES += $(comp_inst)
else
noinst_LTLIBRARIES += $(comp_noinst)
endif

libmca_common_ubcl_la_SOURCES = $(common_ubcl_sources)
libmca_common_ubcl_la_CFLAGS = $(common_ubcl_CFLAGS)
libmca_common_ubcl_la_CPPFLAGS = $(common_ubcl_CPPFLAGS)
libmca_common_ubcl_la_LDFLAGS = $(common_ubcl_LDFLAGS)
libmca_common_ubcl_la_LIBADD = $(common_ubcl_LIBS) \
    $(OPAL_TOP_BUILDDIR)/opal/mca/common/ubcl/lib@OPAL_LIB_NAME@mca_common_ubcl.la

# The static variant compiles the same sources, so it needs the same
# compile flags (AM_CPPFLAGS is not set in this file).
libmca_common_ubcl_noinst_la_SOURCES = $(common_ubcl_sources)
libmca_common_ubcl_noinst_la_CFLAGS = $(common_ubcl_CFLAGS)
libmca_common_ubcl_noinst_la_CPPFLAGS = $(common_ubcl_CPPFLAGS)

# Conditionally install the header files

if WANT_INSTALL_HEADERS
ompidir = $(ompiincludedir)/$(subdir)
ompi_HEADERS = common_ubcl.h
endif
45+
46+
47+
# This library is linked against various MCA components.
48+
# There are two cases:
49+
#
50+
# 1. libmca_common_ubcl.la is a shared library. By linking that shared
51+
# library to all components that need it, the OS linker will
52+
# automatically load it into the process as necessary, and there will
53+
# only be one copy (i.e., all the components will share *one* copy of
54+
# the code and data).
55+
#
56+
# 2. libmca_common_ubcl.la is a static library. In this case, it will
57+
# be rolled up into the top-level libmpi.la. It will also be rolled
58+
# into each component, but then the component will also be rolled up
59+
# into the upper-level libmpi.la. Linkers universally know how to
60+
# "figure this out" so that we end up with only one copy of the code
61+
# and data.
62+
#
63+
# As per above, we'll either have an installable or noinst result.
64+
# The installable one should follow the same MCA prefix naming rules
65+
# (i.e., libmca_<type>_<name>.la). The noinst one can be named
66+
# whatever it wants, although libmca_<type>_<name>_noinst.la is
67+
# recommended.
68+
69+
# To simplify components that link to this library, we will *always*
70+
# have an output libtool library named libmca_<type>_<name>.la -- even
71+
# for case 2) described above (i.e., so there's no conditional logic
72+
# necessary in component Makefile.am's that link to this library).
73+
# Hence, if we're creating a noinst version of this library (i.e.,
74+
# case 2), we sym link it to the libmca_<type>_<name>.la name
75+
# (libtool will do the Right Things under the covers). See the
76+
# all-local and clean-local rules, below, for how this is effected.
77+
# These two rules will symlink the "noinst" libtool library filename
# to the installable libtool library filename in the case where we are
# compiling this component statically (case 2, described above).
#
# OMPI_V_LN_SCOMP selects the recipe prefix from the verbosity level V.
# With an empty V it falls back to AM_DEFAULT_VERBOSITY. The variable
# references are parenthesized because make reads $AM_DEFAULT_VERBOSITY
# as $(A) followed by literal text, which never matches a defined name.
V=0
OMPI_V_LN_SCOMP = $(ompi__v_LN_SCOMP_$(V))
ompi__v_LN_SCOMP_ = $(ompi__v_LN_SCOMP_$(AM_DEFAULT_VERBOSITY))
ompi__v_LN_SCOMP_0 = @echo " LN_S " `basename $(comp_inst)`;

# When nothing installable is built (lib_LTLIBRARIES is empty), expose the
# noinst library under the installable name.
all-local:
	$(OMPI_V_LN_SCOMP) if test -z "$(lib_LTLIBRARIES)"; then \
	  rm -f "$(comp_inst)"; \
	  $(LN_S) "$(comp_noinst)" "$(comp_inst)"; \
	fi

# Remove the link created by all-local.
clean-local:
	if test -z "$(lib_LTLIBRARIES)"; then \
	  rm -f "$(comp_inst)"; \
	fi

ompi/mca/common/ubcl/common_ubcl.c

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2+
/*
3+
* Copyright (c) 2025 Bull SAS. All rights reserved.
4+
* $COPYRIGHT$
5+
*
6+
* Additional copyrights may follow
7+
*
8+
* $HEADER$
9+
*/
10+
11+
#include <execinfo.h>
12+
#include <stdint.h>
13+
#include <stdio.h>
14+
15+
#include "ompi/communicator/communicator.h"
16+
#include "ompi/errhandler/errhandler.h"
17+
#include "ompi/include/mpi.h"
18+
#include "ompi/runtime/mpiruntime.h"
19+
#include "ompi/mca/common/ubcl/common_ubcl.h"
20+
#include "ompi/mca/pml/ubcl/pml_ubcl.h"
21+
#include "ompi/mca/pml/pml_constants.h"
22+
#include "opal/mca/common/ubcl/common_ubcl.h"
23+
#include "opal/util/output.h"
24+
25+
/* Default ompi_common_ubcl values */
26+
mca_ompi_common_ubcl_component_t mca_ompi_common_ubcl_component = {
27+
.n_addr = 32,
28+
};
29+
30+
static int mca_common_ubcl_find_rank(const struct ompi_communicator_t *comm, const uint64_t wrank)
31+
{
32+
mca_pml_ubcl_comm_t *pml_comm = comm->c_pml_comm;
33+
34+
if (NULL == comm->c_pml_comm) {
35+
common_ubcl_error("UBCL error: no translation array in comm");
36+
abort();
37+
}
38+
39+
for (uint32_t i = 0; i < pml_comm->size; i++) {
40+
if (pml_comm->array[i] == wrank) {
41+
return i;
42+
}
43+
}
44+
45+
common_ubcl_error("UBCL error irank translation");
46+
47+
return 0;
48+
}
49+
50+
int mca_common_ubcl_get_mpi_rank(const int rank, const struct ompi_communicator_t *comm,
51+
const uint64_t ubcl_rank)
52+
{
53+
if (OMPI_ANY_SOURCE == rank) {
54+
return mca_common_ubcl_find_rank(comm, ubcl_rank);
55+
} else {
56+
return rank;
57+
}
58+
}
59+
60+
void mca_common_ubcl_status_to_ompi(ompi_status_public_t *status,
61+
ubcl_status_t ubcl_status,
62+
struct ompi_communicator_t *comm, int rank)
63+
{
64+
if (MPI_STATUS_IGNORE != status) {
65+
status->_cancelled = 0; //TODO output the information of cancel
66+
status->_ucount = ubcl_status.size;
67+
status->MPI_TAG = (int) ubcl_status.tag;
68+
status->MPI_SOURCE = mca_common_ubcl_get_mpi_rank(rank, comm, ubcl_status.remote);
69+
}
70+
}
71+
72+
/*
 * Translate a UBCL error code into the matching OPAL/MPI error code.
 *
 * Codes without a dedicated mapping are reported as OPAL_ERROR.
 */
int ubcl_error_to_ompi(ubcl_error_t code)
{
    switch (code) {
    case UBCL_SUCCESS:
        return OPAL_SUCCESS;
    case UBCL_ERR_RESOURCE_BUSY:
        return OPAL_ERR_RESOURCE_BUSY;
    case UBCL_ERR_OUT_OF_RESOURCE:
        return OPAL_ERR_OUT_OF_RESOURCE;
    case UBCL_ERR_NOT_IMPLEMENTED:
        return OPAL_ERR_NOT_IMPLEMENTED;
    case UBCL_ERR_NOT_AVAILABLE:
        return OPAL_ERR_NOT_AVAILABLE;
    case UBCL_ERR_TEMP_OUT_OF_RESOURCE:
        return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
    case UBCL_ERR_ARG_INVALID:
        return OPAL_ERR_BAD_PARAM;
    case UBCL_ERR_TOO_LATE:
        return OPAL_ERR_TIMEOUT;
    case UBCL_ERR_TRUNCATE:
        return MPI_ERR_TRUNCATE;
    case UBCL_ERROR:
    default:
        /* Generic failure and every unmapped code */
        return OPAL_ERROR;
    }
}
113+
114+
void _mca_common_ubcl_error(char *filename, int line, int err,
115+
char abort, int verbose, int output,
116+
int is_init, int comp_verbose,
117+
char *comp_name, char *format, ...)
118+
{
119+
int n_addr = 0;
120+
void **stack_buffer = NULL;
121+
char **stack = NULL;
122+
123+
stack_buffer = malloc(sizeof(void *) * mca_ompi_common_ubcl_component.n_addr);
124+
n_addr = backtrace(stack_buffer, mca_ompi_common_ubcl_component.n_addr);
125+
stack = backtrace_symbols(stack_buffer, n_addr);
126+
127+
int char_per_line = 256;
128+
int n_char = char_per_line * n_addr + 1024;
129+
char *msg = malloc(n_char * sizeof(char));
130+
131+
if (NULL == stack || NULL == msg) {
132+
/* Output small error */
133+
opal_output_verbose(verbose, output,
134+
"========\n== ERROR: Not enough memory while outputting error...\n== "
135+
"%s encountered an error (%d) at %s:%d\n========\n",
136+
comp_name, err, filename, line);
137+
} else {
138+
/* Output full error */
139+
int current = 0;
140+
current += snprintf(msg + current, n_char - current,
141+
"========\n== %s encountered an error (%d) at %s:%d\n== %s:\n\t",
142+
comp_name, err, filename, line, abort ? "ERROR" : "WARNING");
143+
va_list arglist;
144+
va_start(arglist, format);
145+
current += vsnprintf(msg + current, n_char - current, format, arglist);
146+
va_end(arglist);
147+
148+
current += snprintf(msg + current, n_char - current, "\n== STACK:\n");
149+
150+
for (int i = 0; i < n_addr; i++) {
151+
size_t min_char = char_per_line < (n_char - current) ? char_per_line : n_char - current;
152+
current += snprintf(msg + current, min_char, "= [%2d] %s\n", i,
153+
stack[i]);
154+
}
155+
156+
if (is_init && output > 0) {
157+
opal_output_verbose(verbose, output,
158+
"%s========", msg);
159+
} else if (abort || comp_verbose >= verbose) {
160+
fprintf(stderr, "%s\n", msg);
161+
fflush(stderr);
162+
}
163+
}
164+
165+
if (abort) {
166+
OMPI_ERRHANDLER_INVOKE(&ompi_mpi_comm_world.comm, err, stack[0]);
167+
ompi_mpi_abort(&ompi_mpi_comm_world.comm, err);
168+
}
169+
170+
free(stack_buffer);
171+
free(stack);
172+
free(msg);
173+
}

ompi/mca/common/ubcl/common_ubcl.h

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2+
/*
3+
* Copyright (c) 2025 Bull SAS. All rights reserved.
4+
* $COPYRIGHT$
5+
*
6+
* Additional copyrights may follow
7+
*
8+
* $HEADER$
9+
*/
10+
11+
#ifndef OMPI_MCA_COMMON_UBCL_H
#define OMPI_MCA_COMMON_UBCL_H

#include <stddef.h>
#include <stdint.h> /* uint64_t used in the prototypes below */

#include "ompi/communicator/communicator.h"
#include "ompi/include/mpi.h"
#include "opal/mca/common/ubcl/common_ubcl.h"

/* Holds common variable used in multiple UBCL components */
struct mca_ompi_common_ubcl_component_s {
    int n_addr; /**< Max number of void * addresses in printed stack*/
};
typedef struct mca_ompi_common_ubcl_component_s mca_ompi_common_ubcl_component_t;
extern mca_ompi_common_ubcl_component_t mca_ompi_common_ubcl_component;

/* Return `rank` unless it is OMPI_ANY_SOURCE, in which case `ubcl_rank` is
 * translated to a rank of `comm` */
int mca_common_ubcl_get_mpi_rank(const int rank, const struct ompi_communicator_t *comm,
                                 const uint64_t ubcl_rank);

/* Fill `status` from `ubcl_status`; does nothing for MPI_STATUS_IGNORE */
void mca_common_ubcl_status_to_ompi(ompi_status_public_t *status,
                                    ubcl_status_t ubcl_status,
                                    struct ompi_communicator_t *comm, int rank);

/* Translate a UBCL error code to an OPAL/MPI error code */
int ubcl_error_to_ompi(ubcl_error_t code);

/* UBCL rank is on 61 bits, ompi jobid is 32bits, vpid must be truncated to 29bits */
#define COMMON_UBCL_VPID_MAX ((1 << 29) - 1) /* We need 3 bits for UBCL rank */
#define PML_UBCL_JOBID_MAX (OPAL_JOBID_MAX)

/* Error and warning output function used by UBCL components.
 * Prints the printf-style `format` message together with a backtrace and,
 * when `abort` is non-zero, aborts the MPI job with `err`. */
void _mca_common_ubcl_error(char *filename, int line, int err, char abort, int verbose,
                            int output, int is_init, int comp_verbose, char *comp_name,
                            char *format, ...);


#endif /* OMPI_MCA_COMMON_UBCL_H */
44+

0 commit comments

Comments
 (0)