From 0ca418388f8894a77dcfe1a1ceb6bdcff56c56e0 Mon Sep 17 00:00:00 2001 From: heyujiao99 Date: Fri, 1 Aug 2025 18:23:28 +0800 Subject: [PATCH] add RVV support for MPI_OP Signed-off-by: heyujiao99 --- ompi/mca/op/riscv64/Makefile.am | 77 ++++ ompi/mca/op/riscv64/configure.m4 | 64 +++ ompi/mca/op/riscv64/op_riscv64.h | 61 +++ ompi/mca/op/riscv64/op_riscv64_component.c | 202 +++++++++ ompi/mca/op/riscv64/op_riscv64_functions.c | 483 +++++++++++++++++++++ 5 files changed, 887 insertions(+) create mode 100644 ompi/mca/op/riscv64/Makefile.am create mode 100644 ompi/mca/op/riscv64/configure.m4 create mode 100644 ompi/mca/op/riscv64/op_riscv64.h create mode 100644 ompi/mca/op/riscv64/op_riscv64_component.c create mode 100644 ompi/mca/op/riscv64/op_riscv64_functions.c diff --git a/ompi/mca/op/riscv64/Makefile.am b/ompi/mca/op/riscv64/Makefile.am new file mode 100644 index 00000000000..5aab8c89890 --- /dev/null +++ b/ompi/mca/op/riscv64/Makefile.am @@ -0,0 +1,77 @@ +# +# Copyright (c) 2019 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2025 Software System Team, SANECHIPS. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This is an riscv64 op component. This Makefile.am is a typical +# riscv64 of how to integrate into Open MPI's Automake-based build +# system. +# +# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent +# for more details on how to make Open MPI components. + +# First, list all .h and .c sources. It is necessary to list all .h +# files so that they will be picked up in the distribution tarball. + +sources = \ + op_riscv64.h \ + op_riscv64_component.c + +# Open MPI components can be compiled two ways: +# +# 1. As a standalone dynamic shared object (DSO), sometimes called a +# dynamically loadable library (DLL). +# +# 2. As a static library that is slurped up into the upper-level +# libmpi library (regardless of whether libmpi is a static or dynamic +# library). This is called a "Libtool convenience library". +# +# The component needs to create an output library in this top-level +# component directory, and named either mca__.la (for DSO +# builds) or libmca__.la (for static builds). The OMPI +# build system will have set the +# MCA_BUILD_ompi___DSO AM_CONDITIONAL to indicate +# which way this component should be built. +specialized_op_libs = +if MCA_BUILD_ompi_op_has_rvv_support +specialized_op_libs += liblocal_ops_rvv.la +liblocal_ops_rvv_la_SOURCES = op_riscv64_functions.c +liblocal_ops_rvv_la_CPPFLAGS = -DGENERATE_RVV_CODE=1 +endif + +component_noinst = $(specialized_op_libs) +if MCA_BUILD_ompi_op_riscv64_DSO +component_install = mca_op_riscv64.la +else +component_install = +component_noinst += libmca_op_riscv64.la +endif + +# Specific information for DSO builds. +# +# The DSO should install itself in $(ompilibdir) (by default, +# $prefix/lib/openmpi). + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_op_riscv64_la_SOURCES = $(sources) +mca_op_riscv64_la_LIBADD = $(specialized_op_libs) +mca_op_riscv64_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la + + +# Specific information for static builds. +# +# Note that we *must* "noinst"; the upper-layer Makefile.am's will +# slurp in the resulting .la library into libmpi. + +noinst_LTLIBRARIES = $(component_noinst) +libmca_op_riscv64_la_SOURCES = $(sources) +libmca_op_riscv64_la_LIBADD = $(specialized_op_libs) +libmca_op_riscv64_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/op/riscv64/configure.m4 b/ompi/mca/op/riscv64/configure.m4 new file mode 100644 index 00000000000..dca045825d3 --- /dev/null +++ b/ompi/mca/op/riscv64/configure.m4 @@ -0,0 +1,64 @@ +# -*- shell-script -*- +# +# Copyright (c) 2019-2020 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2025 Software System Team, SANECHIPS. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_ompi_op_riscv_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_ompi_op_riscv64_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/op/riscv64/Makefile]) + case "${host}" in + riscv64*) + op_riscv64_check="yes";; + *) + op_riscv64_check="no";; + esac + AS_IF([test "$op_riscv64_check" = "yes"], + [AC_LANG_PUSH([C]) + + # + # Check for RVV support + # + AC_CACHE_CHECK([for RVV support], op_cv_rvv_support, + [ + AC_LINK_IFELSE( + [AC_LANG_PROGRAM([[ +#if defined(__riscv) && defined(__riscv_v) && __riscv_xlen == 64 +#include +#else +#error "Not a 64-bit RISC-V target" +#endif + ]], + [[ +#if defined(__riscv) && defined(__riscv_v) && __riscv_xlen == 64 + size_t vl = __riscv_vsetvl_e32m1(4); +#endif + ]])], + [op_cv_rvv_support=yes], + [op_cv_rvv_support=no])]) + AC_LANG_POP +]) + AM_CONDITIONAL([MCA_BUILD_ompi_op_has_rvv_support], + [test "$op_cv_rvv_support" = "yes"]) + + AC_SUBST(MCA_BUILD_ompi_op_has_rvv_support) + + AS_IF([test "$op_cv_rvv_support" = "yes"], + [AC_DEFINE([OMPI_MCA_OP_HAVE_RVV], [1],[RVV supported in the current build])], + [AC_DEFINE([OMPI_MCA_OP_HAVE_RVV], [0],[RVV not supported in the current build])]) + + # If we have at least support for Neon or SVE + AS_IF([test "$op_cv_rvv_support" = "yes"], + [$1], + [$2]) +])dnl diff --git a/ompi/mca/op/riscv64/op_riscv64.h b/ompi/mca/op/riscv64/op_riscv64.h new file mode 100644 index 00000000000..303758ec24a --- /dev/null +++ b/ompi/mca/op/riscv64/op_riscv64.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2025 Software System Team, SANECHIPS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OP_RISCV64_EXPORT_H +#define MCA_OP_RISCV64_EXPORT_H + +#include "ompi_config.h" + +#include "ompi/mca/mca.h" +#include "opal/class/opal_object.h" + +#include "ompi/mca/op/op.h" + +BEGIN_C_DECLS + +/** + * Derive a struct from the base op component struct, allowing us to + * cache some component-specific information on our well-known + * component struct. + */ +typedef struct { + /** The base op component struct */ + ompi_op_base_component_1_0_0_t super; + + /* What follows is riscv64-component-specific cached information. We + tend to use this scheme (caching information on the riscv64 + component itself) instead of lots of individual global + variables for the component. */ + + /** A simple boolean indicating that the hardware is available. */ + bool hardware_available; + + /** A simple boolean indicating whether double precision is + supported. */ + bool double_supported; +} ompi_op_riscv64_component_t; + +/** + * Globally exported variable. Note that it is a *riscv64* component + * (defined above), which has the ompi_op_base_component_t as its + * first member. Hence, the MCA/op framework will find the data that + * it expects in the first memory locations, but then the component + * itself can cache additional information after that that can be used + * by both the component and modules. + */ +OMPI_DECLSPEC extern ompi_op_riscv64_component_t + mca_op_riscv64_component; + +END_C_DECLS + +#endif /* MCA_OP_RISCV64_EXPORT_H */ diff --git a/ompi/mca/op/riscv64/op_riscv64_component.c b/ompi/mca/op/riscv64/op_riscv64_component.c new file mode 100644 index 00000000000..72d722e1861 --- /dev/null +++ b/ompi/mca/op/riscv64/op_riscv64_component.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2025 Software System Team, SANECHIPS. All rights + * reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file + * + * This is the "RISCV64" component source code. + * + */ + +#include "ompi_config.h" + +#include "opal/util/printf.h" + +#include "ompi/constants.h" +#include "ompi/mca/op/riscv64/op_riscv64.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/op.h" +#include "ompi/op/op.h" +#include +#include + +static int mca_op_riscv64_component_open(void); +static int mca_op_riscv64_component_close(void); +static int mca_op_riscv64_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple); +static struct ompi_op_base_module_1_0_0_t * + mca_op_riscv64_component_op_query(struct ompi_op_t *op, int *priority); +static int mca_op_riscv64_component_register(void); + +ompi_op_riscv64_component_t mca_op_riscv64_component = { + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + { + .opc_version = { + OMPI_OP_BASE_VERSION_1_0_0, + + .mca_component_name = "riscv64", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = mca_op_riscv64_component_open, + .mca_close_component = mca_op_riscv64_component_close, + .mca_register_component_params = mca_op_riscv64_component_register, + }, + .opc_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .opc_init_query = mca_op_riscv64_component_init_query, + .opc_op_query = mca_op_riscv64_component_op_query, + }, +}; +MCA_BASE_COMPONENT_INIT(ompi, op, riscv64) + +/* + * Component open + */ +static int mca_op_riscv64_component_open(void) +{ + + /* A first level check to see if RVV extension is even available in this + process. E.g., you may want to do a first-order check to see + if hardware is available. If so, return OMPI_SUCCESS. If not, + return anything other than OMPI_SUCCESS and the component will + silently be ignored. + + Note that if this function returns non-OMPI_SUCCESS, then this + component won't even be shown in ompi_info output (which is + probably not what you want). + */ + + return OMPI_SUCCESS; +} + +/* + * Component close + */ +static int mca_op_riscv64_component_close(void) +{ + + /* If the riscv64 was opened successfully, close it (i.e., release any + resources that may have been allocated on this component). + Note that _component_close() will always be called at the end + of the process, so it may have been after any/all of the other + component functions have been invoked (and possibly even after + modules have been created and/or destroyed). */ + + return OMPI_SUCCESS; +} + +/* + * Register MCA params. + */ +static int mca_op_riscv64_component_register(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + mca_op_riscv64_component.hardware_available = hwcap & COMPAT_HWCAP_ISA_V; /* Check for V extension */ + (void) mca_base_component_var_register(&mca_op_riscv64_component.super.opc_version, + "hardware_available", + "Whether the RVV hardware is available", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_riscv64_component.hardware_available); + + mca_op_riscv64_component.double_supported = hwcap & COMPAT_HWCAP_ISA_D; /* Check for D extension */ + (void) mca_base_component_var_register(&mca_op_riscv64_component.super.opc_version, + "double_supported", + "Whether the double precision data types are supported or not", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_op_riscv64_component.double_supported); + return OMPI_SUCCESS; +} + +/* + * Query whether this component wants to be used in this process. + */ +static int mca_op_riscv64_component_init_query(bool enable_progress_threads, + bool enable_mpi_thread_multiple) +{ + if (mca_op_riscv64_component.hardware_available) { + return OMPI_SUCCESS; + } + return OMPI_ERR_NOT_SUPPORTED; +} + +#if OMPI_MCA_OP_HAVE_RVV +extern ompi_op_base_handler_fn_t +ompi_op_riscv64_functions_rvv[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +extern ompi_op_base_3buff_handler_fn_t +ompi_op_riscv64_3buff_functions_rvv[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX]; +#endif /* OMPI_MCA_OP_HAVE_RVV */ + +/* + * Query whether this component can be used for a specific op + */ +static struct ompi_op_base_module_1_0_0_t * + mca_op_riscv64_component_op_query(struct ompi_op_t *op, int *priority) +{ + ompi_op_base_module_t *module = NULL; + /* Sanity check -- although the framework should never invoke the + _component_op_query() on non-intrinsic MPI_Op's, we'll put a + check here just to be sure. */ + if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) { + return NULL; + } + + switch (op->o_f_to_c_index) { + case OMPI_OP_BASE_FORTRAN_MAX: + case OMPI_OP_BASE_FORTRAN_MIN: + case OMPI_OP_BASE_FORTRAN_SUM: + case OMPI_OP_BASE_FORTRAN_PROD: + case OMPI_OP_BASE_FORTRAN_BOR: + case OMPI_OP_BASE_FORTRAN_BAND: + case OMPI_OP_BASE_FORTRAN_BXOR: + module = OBJ_NEW(ompi_op_base_module_t); + for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { + module->opm_fns[i] = NULL; + module->opm_3buff_fns[i] = NULL; +#if OMPI_MCA_OP_HAVE_RVV + if( mca_op_riscv64_component.hardware_available ) { + module->opm_fns[i] = ompi_op_riscv64_functions_rvv[op->o_f_to_c_index][i]; + module->opm_3buff_fns[i] = ompi_op_riscv64_3buff_functions_rvv[op->o_f_to_c_index][i]; + } +#endif /* OMPI_MCA_OP_HAVE_RVV */ + } + break; + case OMPI_OP_BASE_FORTRAN_LAND: + case OMPI_OP_BASE_FORTRAN_LOR: + case OMPI_OP_BASE_FORTRAN_LXOR: + case OMPI_OP_BASE_FORTRAN_MAXLOC: + case OMPI_OP_BASE_FORTRAN_MINLOC: + case OMPI_OP_BASE_FORTRAN_REPLACE: + default: + break; + } + /* If we got a module from above, we'll return it. Otherwise, + we'll return NULL, indicating that this component does not want + to be considered for selection for this MPI_Op. Note that the + functions each returned a *riscv64* component pointer + (vs. a *base* component pointer -- where an *riscv64* component + is a base component plus some other module-specific cached + information), so we have to cast it to the right pointer type + before returning. */ + if (NULL != module) { + *priority = 50; + } + return (ompi_op_base_module_1_0_0_t *) module; +} diff --git a/ompi/mca/op/riscv64/op_riscv64_functions.c b/ompi/mca/op/riscv64/op_riscv64_functions.c new file mode 100644 index 00000000000..b98a0ba5c83 --- /dev/null +++ b/ompi/mca/op/riscv64/op_riscv64_functions.c @@ -0,0 +1,483 @@ +/* + * Copyright (c) 2019 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2025 Software System Team, SANECHIPS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#include "opal/util/output.h" + +#include "ompi/op/op.h" +#include "ompi/mca/op/op.h" +#include "ompi/mca/op/base/base.h" +#include "ompi/mca/op/riscv64/op_riscv64.h" + +#if GENERATE_RVV_CODE +# include +# define APPEND _rvv +# define LMUL m4 +#endif + +/* + * Concatenate preprocessor tokens A and B without expanding macro definitions + * (however, if invoked from a macro, macro arguments are expanded). + */ +#define OP_CONCAT_NX(A, B) A ## B + +/* + * Concatenate preprocessor tokens A and B after macro-expanding them. + */ +#define OP_CONCAT(A, B) OP_CONCAT_NX(A, B) + +/* + * Since all the functions in this file are essentially identical, we + * use a macro to substitute in names and types. The core operation + * in all functions that use this macro is the same. + * + * This macro is for (out op in). + * + */ + +typedef float float32_t; +typedef double float64_t; + +#define RVV_FUNC(A) OP_CONCAT(__riscv_, OP_CONCAT(A, LMUL)) +#define RVV_TYPE(A) OP_CONCAT(OP_CONCAT(A, LMUL), _t) + +#if GENERATE_RVV_CODE +#define OP_RISCV64_FUNC(name, type_abbr, type_size, type, op) \ + static void OP_CONCAT(ompi_op_riscv64_2buff_##name##_##type##type_size##_t, APPEND) \ + (const void *_in, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ + { \ + size_t vl; \ + const int cnt = *count; \ + type##type_size##_t *in = (type##type_size##_t *) _in, \ + *out = (type##type_size##_t *) _out; \ + for (size_t i = 0; i < cnt; i += vl) { \ + vl = RVV_FUNC(vsetvl_e##type_size)(cnt - i); \ + RVV_TYPE(v##type##type_size) vsrc = RVV_FUNC(vle##type_size##_v_##type_abbr)(&in[i], vl); \ + RVV_TYPE(v##type##type_size) vdst = RVV_FUNC(vle##type_size##_v_##type_abbr)(&out[i], vl); \ + vdst = RVV_FUNC(op##_vv_##type_abbr)(vdst, vsrc, vl); \ + RVV_FUNC(vse##type_size##_v_##type_abbr)(&out[i], vdst, vl); \ + } \ + } +#endif + +/************************************************************************* + * Max + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) > (b) ? (a) : (b)) + OP_RISCV64_FUNC(max, i8, 8, int, vmax) + OP_RISCV64_FUNC(max, u8, 8, uint, vmaxu) + OP_RISCV64_FUNC(max, i16, 16, int, vmax) + OP_RISCV64_FUNC(max, u16, 16, uint, vmaxu) + OP_RISCV64_FUNC(max, i32, 32, int, vmax) + OP_RISCV64_FUNC(max, u32, 32, uint, vmaxu) + OP_RISCV64_FUNC(max, i64, 64, int, vmax) + OP_RISCV64_FUNC(max, u64, 64, uint, vmaxu) + OP_RISCV64_FUNC(max, f32, 32, float, vfmax) + OP_RISCV64_FUNC(max, f64, 64, float, vfmax) + +/************************************************************************* + * Min + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) + OP_RISCV64_FUNC(min, i8, 8, int, vmin) + OP_RISCV64_FUNC(min, u8, 8, uint, vminu) + OP_RISCV64_FUNC(min, i16, 16, int, vmin) + OP_RISCV64_FUNC(min, u16, 16, uint, vminu) + OP_RISCV64_FUNC(min, i32, 32, int, vmin) + OP_RISCV64_FUNC(min, u32, 32, uint, vminu) + OP_RISCV64_FUNC(min, i64, 64, int, vmin) + OP_RISCV64_FUNC(min, u64, 64, uint, vminu) + OP_RISCV64_FUNC(min, f32, 32, float, vfmin) + OP_RISCV64_FUNC(min, f64, 64, float, vfmin) + +/************************************************************************* +* Sum +************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) + (b)) + OP_RISCV64_FUNC(sum, i8, 8, int, vadd) + OP_RISCV64_FUNC(sum, u8, 8, uint, vadd) + OP_RISCV64_FUNC(sum, i16, 16, int, vadd) + OP_RISCV64_FUNC(sum, u16, 16, uint, vadd) + OP_RISCV64_FUNC(sum, i32, 32, int, vadd) + OP_RISCV64_FUNC(sum, u32, 32, uint, vadd) + OP_RISCV64_FUNC(sum, i64, 64, int, vadd) + OP_RISCV64_FUNC(sum, u64, 64, uint, vadd) + OP_RISCV64_FUNC(sum, f32, 32, float, vfadd) + OP_RISCV64_FUNC(sum, f64, 64, float, vfadd) + +/************************************************************************* + * Product + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) * (b)) + OP_RISCV64_FUNC(prod, i8, 8, int, vmul) + OP_RISCV64_FUNC(prod, u8, 8, uint, vmul) + OP_RISCV64_FUNC(prod, i16, 16, int, vmul) + OP_RISCV64_FUNC(prod, u16, 16, uint, vmul) + OP_RISCV64_FUNC(prod, i32, 32, int, vmul) + OP_RISCV64_FUNC(prod, u32, 32, uint, vmul) + OP_RISCV64_FUNC(prod, i64, 64, int, vmul) + OP_RISCV64_FUNC(prod, u64, 64, uint, vmul) + OP_RISCV64_FUNC(prod, f32, 32, float, vfmul) + OP_RISCV64_FUNC(prod, f64, 64, float, vfmul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) & (b)) + OP_RISCV64_FUNC(band, i8, 8, int, vand) + OP_RISCV64_FUNC(band, u8, 8, uint, vand) + OP_RISCV64_FUNC(band, i16, 16, int, vand) + OP_RISCV64_FUNC(band, u16, 16, uint, vand) + OP_RISCV64_FUNC(band, i32, 32, int, vand) + OP_RISCV64_FUNC(band, u32, 32, uint, vand) + OP_RISCV64_FUNC(band, i64, 64, int, vand) + OP_RISCV64_FUNC(band, u64, 64, uint, vand) + + /************************************************************************* + * Bitwise OR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) | (b)) + OP_RISCV64_FUNC(bor, i8, 8, int, vor) + OP_RISCV64_FUNC(bor, u8, 8, uint, vor) + OP_RISCV64_FUNC(bor, i16, 16, int, vor) + OP_RISCV64_FUNC(bor, u16, 16, uint, vor) + OP_RISCV64_FUNC(bor, i32, 32, int, vor) + OP_RISCV64_FUNC(bor, u32, 32, uint, vor) + OP_RISCV64_FUNC(bor, i64, 64, int, vor) + OP_RISCV64_FUNC(bor, u64, 64, uint, vor) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) ^ (b)) + OP_RISCV64_FUNC(bxor, i8, 8, int, vxor) + OP_RISCV64_FUNC(bxor, u8, 8, uint, vxor) + OP_RISCV64_FUNC(bxor, i16, 16, int, vxor) + OP_RISCV64_FUNC(bxor, u16, 16, uint, vxor) + OP_RISCV64_FUNC(bxor, i32, 32, int, vxor) + OP_RISCV64_FUNC(bxor, u32, 32, uint, vxor) + OP_RISCV64_FUNC(bxor, i64, 64, int, vxor) + OP_RISCV64_FUNC(bxor, u64, 64, uint, vxor) + +/* +* This is a three buffer (2 input and 1 output) version of the reduction +* routines. +*/ +#if GENERATE_RVV_CODE +#define OP_RISCV64_FUNC_3BUFF(name, type_abbr, type_size, type, op) \ + static void OP_CONCAT(ompi_op_riscv64_3buff_##name##_##type##type_size##_t, APPEND) \ + (const void *_in1, const void *_in2, void *_out, int *count, \ + struct ompi_datatype_t **dtype, \ + struct ompi_op_base_module_1_0_0_t *module) \ + { \ + size_t vl; \ + type##type_size##_t *in1 = (type##type_size##_t *) _in1, \ + *in2 = (type##type_size##_t *) _in2, \ + *out = (type##type_size##_t *) _out; \ + for (size_t i = 0, cnt = *count; i < cnt; i += vl) { \ + vl = RVV_FUNC(vsetvl_e##type_size)(cnt - i); \ + RVV_TYPE(v##type##type_size) vsrc1 = RVV_FUNC(vle##type_size##_v_##type_abbr)(&in1[i], vl); \ + RVV_TYPE(v##type##type_size) vsrc2 = RVV_FUNC(vle##type_size##_v_##type_abbr)(&in2[i], vl); \ + RVV_TYPE(v##type##type_size) vdst = RVV_FUNC(op##_vv_##type_abbr)(vsrc1, vsrc2, vl); \ + RVV_FUNC(vse##type_size##_v_##type_abbr)(&out[i], vdst, vl); \ + } \ + } +#endif + +/************************************************************************* + * Max + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) > (b) ? (a) : (b)) + OP_RISCV64_FUNC_3BUFF(max, i8, 8, int, vmax) + OP_RISCV64_FUNC_3BUFF(max, u8, 8, uint, vmaxu) + OP_RISCV64_FUNC_3BUFF(max, i16, 16, int, vmax) + OP_RISCV64_FUNC_3BUFF(max, u16, 16, uint, vmaxu) + OP_RISCV64_FUNC_3BUFF(max, i32, 32, int, vmax) + OP_RISCV64_FUNC_3BUFF(max, u32, 32, uint, vmaxu) + OP_RISCV64_FUNC_3BUFF(max, i64, 64, int, vmax) + OP_RISCV64_FUNC_3BUFF(max, u64, 64, uint, vmaxu) + OP_RISCV64_FUNC_3BUFF(max, f32, 32, float, vfmax) + OP_RISCV64_FUNC_3BUFF(max, f64, 64, float, vfmax) + +/************************************************************************* +* Min +*************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) < (b) ? (a) : (b)) + OP_RISCV64_FUNC_3BUFF(min, i8, 8, int, vmin) + OP_RISCV64_FUNC_3BUFF(min, u8, 8, uint, vminu) + OP_RISCV64_FUNC_3BUFF(min, i16, 16, int, vmin) + OP_RISCV64_FUNC_3BUFF(min, u16, 16, uint, vminu) + OP_RISCV64_FUNC_3BUFF(min, i32, 32, int, vmin) + OP_RISCV64_FUNC_3BUFF(min, u32, 32, uint, vminu) + OP_RISCV64_FUNC_3BUFF(min, i64, 64, int, vmin) + OP_RISCV64_FUNC_3BUFF(min, u64, 64, uint, vminu) + OP_RISCV64_FUNC_3BUFF(min, f32, 32, float, vfmin) + OP_RISCV64_FUNC_3BUFF(min, f64, 64, float, vfmin) + + /************************************************************************* + * Sum + ************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) + (b)) + OP_RISCV64_FUNC_3BUFF(sum, i8, 8, int, vadd) + OP_RISCV64_FUNC_3BUFF(sum, u8, 8, uint, vadd) + OP_RISCV64_FUNC_3BUFF(sum, i16, 16, int, vadd) + OP_RISCV64_FUNC_3BUFF(sum, u16, 16, uint, vadd) + OP_RISCV64_FUNC_3BUFF(sum, i32, 32, int, vadd) + OP_RISCV64_FUNC_3BUFF(sum, u32, 32, uint, vadd) + OP_RISCV64_FUNC_3BUFF(sum, i64, 64, int, vadd) + OP_RISCV64_FUNC_3BUFF(sum, u64, 64, uint, vadd) + OP_RISCV64_FUNC_3BUFF(sum, f32, 32, float, vfadd) + OP_RISCV64_FUNC_3BUFF(sum, f64, 64, float, vfadd) + +/************************************************************************* + * Product + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) * (b)) + OP_RISCV64_FUNC_3BUFF(prod, i8, 8, int, vmul) + OP_RISCV64_FUNC_3BUFF(prod, u8, 8, uint, vmul) + OP_RISCV64_FUNC_3BUFF(prod, i16, 16, int, vmul) + OP_RISCV64_FUNC_3BUFF(prod, u16, 16, uint, vmul) + OP_RISCV64_FUNC_3BUFF(prod, i32, 32, int, vmul) + OP_RISCV64_FUNC_3BUFF(prod, u32, 32, uint, vmul) + OP_RISCV64_FUNC_3BUFF(prod, i64, 64, int, vmul) + OP_RISCV64_FUNC_3BUFF(prod, u64, 64, uint, vmul) + OP_RISCV64_FUNC_3BUFF(prod, f32, 32, float, vfmul) + OP_RISCV64_FUNC_3BUFF(prod, f64, 64, float, vfmul) + +/************************************************************************* + * Bitwise AND + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) & (b)) + OP_RISCV64_FUNC_3BUFF(band, i8, 8, int, vand) + OP_RISCV64_FUNC_3BUFF(band, u8, 8, uint, vand) + OP_RISCV64_FUNC_3BUFF(band, i16, 16, int, vand) + OP_RISCV64_FUNC_3BUFF(band, u16, 16, uint, vand) + OP_RISCV64_FUNC_3BUFF(band, i32, 32, int, vand) + OP_RISCV64_FUNC_3BUFF(band, u32, 32, uint, vand) + OP_RISCV64_FUNC_3BUFF(band, i64, 64, int, vand) + OP_RISCV64_FUNC_3BUFF(band, u64, 64, uint, vand) + + /************************************************************************* + * Bitwise OR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) | (b)) + OP_RISCV64_FUNC_3BUFF(bor, i8, 8, int, vor) + OP_RISCV64_FUNC_3BUFF(bor, u8, 8, uint, vor) + OP_RISCV64_FUNC_3BUFF(bor, i16, 16, int, vor) + OP_RISCV64_FUNC_3BUFF(bor, u16, 16, uint, vor) + OP_RISCV64_FUNC_3BUFF(bor, i32, 32, int, vor) + OP_RISCV64_FUNC_3BUFF(bor, u32, 32, uint, vor) + OP_RISCV64_FUNC_3BUFF(bor, i64, 64, int, vor) + OP_RISCV64_FUNC_3BUFF(bor, u64, 64, uint, vor) + +/************************************************************************* + * Bitwise XOR + *************************************************************************/ +#undef current_func +#define current_func(a, b) ((a) ^ (b)) + OP_RISCV64_FUNC_3BUFF(bxor, i8, 8, int, vxor) + OP_RISCV64_FUNC_3BUFF(bxor, u8, 8, uint, vxor) + OP_RISCV64_FUNC_3BUFF(bxor, i16, 16, int, vxor) + OP_RISCV64_FUNC_3BUFF(bxor, u16, 16, uint, vxor) + OP_RISCV64_FUNC_3BUFF(bxor, i32, 32, int, vxor) + OP_RISCV64_FUNC_3BUFF(bxor, u32, 32, uint, vxor) + OP_RISCV64_FUNC_3BUFF(bxor, i64, 64, int, vxor) + OP_RISCV64_FUNC_3BUFF(bxor, u64, 64, uint, vxor) + + /** C integer ***********************************************************/ +#define C_INTEGER_BASE(name, ftype) \ + [OMPI_OP_BASE_TYPE_INT8_T] = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_int8_t, APPEND), \ + [OMPI_OP_BASE_TYPE_UINT8_T] = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_uint8_t, APPEND), \ + [OMPI_OP_BASE_TYPE_INT16_T] = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_int16_t, APPEND), \ + [OMPI_OP_BASE_TYPE_UINT16_T] = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_uint16_t, APPEND), \ + [OMPI_OP_BASE_TYPE_INT32_T] = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_int32_t, APPEND), \ + [OMPI_OP_BASE_TYPE_UINT32_T] = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_uint32_t, APPEND) +#define C_INTEGER_EX(name, ftype) \ + [OMPI_OP_BASE_TYPE_INT64_T] = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_int64_t, APPEND), \ + [OMPI_OP_BASE_TYPE_UINT64_T] = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_uint64_t, APPEND) + + /** Floating point, including all the Fortran reals *********************/ +#define FLOAT(name, ftype) OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_float32_t, APPEND) +#define DOUBLE(name, ftype) OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_float64_t, APPEND) + +#define FLOATING_POINT(name, ftype) \ + [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype), \ + [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype) + +/* + * MPI_OP_NULL + * All types + */ +#define FLAGS_NO_FLOAT \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE) +#define FLAGS \ + (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \ + OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE) + + ompi_op_base_handler_fn_t OP_CONCAT(ompi_op_riscv64_functions, APPEND) [OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER_BASE(max, 2buff), + C_INTEGER_EX(max, 2buff), + FLOATING_POINT(max, 2buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER_BASE(min, 2buff), + C_INTEGER_EX(min, 2buff), + FLOATING_POINT(min, 2buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER_BASE(sum, 2buff), + C_INTEGER_EX(sum, 2buff), + FLOATING_POINT(sum, 2buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER_BASE(prod, 2buff), + C_INTEGER_EX(prod, 2buff), + FLOATING_POINT(prod, 2buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] = { + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER_BASE(band, 2buff), + C_INTEGER_EX(band, 2buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER_BASE(bor, 2buff), + C_INTEGER_EX(bor, 2buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER_BASE(bxor, 2buff), + C_INTEGER_EX(bxor, 2buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* (MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE) */ + NULL, + }, + +}; + + ompi_op_base_3buff_handler_fn_t OP_CONCAT(ompi_op_riscv64_3buff_functions, APPEND)[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = +{ + /* Corresponds to MPI_OP_NULL */ + [OMPI_OP_BASE_FORTRAN_NULL] = { + /* Leaving this empty puts in NULL for all entries */ + NULL, + }, + /* Corresponds to MPI_MAX */ + [OMPI_OP_BASE_FORTRAN_MAX] = { + C_INTEGER_BASE(max, 3buff), + C_INTEGER_EX(max, 3buff), + FLOATING_POINT(max, 3buff), + }, + /* Corresponds to MPI_MIN */ + [OMPI_OP_BASE_FORTRAN_MIN] = { + C_INTEGER_BASE(min, 3buff), + C_INTEGER_EX(min, 3buff), + FLOATING_POINT(min, 3buff), + }, + /* Corresponds to MPI_SUM */ + [OMPI_OP_BASE_FORTRAN_SUM] = { + C_INTEGER_BASE(sum, 3buff), + C_INTEGER_EX(sum, 3buff), + FLOATING_POINT(sum, 3buff), + }, + /* Corresponds to MPI_PROD */ + [OMPI_OP_BASE_FORTRAN_PROD] = { + C_INTEGER_BASE(prod, 3buff), + C_INTEGER_EX(prod, 3buff), + FLOATING_POINT(prod, 3buff), + }, + /* Corresponds to MPI_LAND */ + [OMPI_OP_BASE_FORTRAN_LAND] ={ + NULL, + }, + /* Corresponds to MPI_BAND */ + [OMPI_OP_BASE_FORTRAN_BAND] = { + C_INTEGER_BASE(band, 3buff), + C_INTEGER_EX(band, 3buff), + }, + /* Corresponds to MPI_LOR */ + [OMPI_OP_BASE_FORTRAN_LOR] = { + NULL, + }, + /* Corresponds to MPI_BOR */ + [OMPI_OP_BASE_FORTRAN_BOR] = { + C_INTEGER_BASE(bor, 3buff), + C_INTEGER_EX(bor, 3buff), + }, + /* Corresponds to MPI_LXOR */ + [OMPI_OP_BASE_FORTRAN_LXOR] = { + NULL, + }, + /* Corresponds to MPI_BXOR */ + [OMPI_OP_BASE_FORTRAN_BXOR] = { + C_INTEGER_BASE(bxor, 3buff), + C_INTEGER_EX(bxor, 3buff), + }, + /* Corresponds to MPI_REPLACE */ + [OMPI_OP_BASE_FORTRAN_REPLACE] = { + /* MPI_ACCUMULATE is handled differently than the other + reductions, so just zero out its function + implementations here to ensure that users don't invoke + MPI_REPLACE with any reduction operations other than + ACCUMULATE */ + NULL, + }, +};