From 0ca418388f8894a77dcfe1a1ceb6bdcff56c56e0 Mon Sep 17 00:00:00 2001
From: heyujiao99 <he.yujiao@sanechips.com.cn>
Date: Fri, 1 Aug 2025 18:23:28 +0800
Subject: [PATCH] add RVV support for MPI_OP

Signed-off-by: heyujiao99 <he.yujiao@sanechips.com.cn>
---
 ompi/mca/op/riscv64/Makefile.am            |  77 ++++
 ompi/mca/op/riscv64/configure.m4           |  64 +++
 ompi/mca/op/riscv64/op_riscv64.h           |  61 +++
 ompi/mca/op/riscv64/op_riscv64_component.c | 202 +++++++++
 ompi/mca/op/riscv64/op_riscv64_functions.c | 483 +++++++++++++++++++++
 5 files changed, 887 insertions(+)
 create mode 100644 ompi/mca/op/riscv64/Makefile.am
 create mode 100644 ompi/mca/op/riscv64/configure.m4
 create mode 100644 ompi/mca/op/riscv64/op_riscv64.h
 create mode 100644 ompi/mca/op/riscv64/op_riscv64_component.c
 create mode 100644 ompi/mca/op/riscv64/op_riscv64_functions.c
diff --git a/ompi/mca/op/riscv64/Makefile.am b/ompi/mca/op/riscv64/Makefile.am
new file mode 100644
index 00000000000..5aab8c89890
--- /dev/null
+++ b/ompi/mca/op/riscv64/Makefile.am
@@ -0,0 +1,77 @@
+#
+# Copyright (c) 2019      The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2025      Software System Team, SANECHIPS.  All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# This is an riscv64 op component.  This Makefile.am is a typical
+# riscv64 of how to integrate into Open MPI's Automake-based build
+# system.
+#
+# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent
+# for more details on how to make Open MPI components.
+
+# First, list all .h and .c sources.  It is necessary to list all .h
+# files so that they will be picked up in the distribution tarball.
+
+sources = \
+    op_riscv64.h \
+    op_riscv64_component.c
+
+# Open MPI components can be compiled two ways:
+#
+# 1. As a standalone dynamic shared object (DSO), sometimes called a
+# dynamically loadable library (DLL).
+#
+# 2. As a static library that is slurped up into the upper-level
+# libmpi library (regardless of whether libmpi is a static or dynamic
+# library).  This is called a "Libtool convenience library".
+#
+# The component needs to create an output library in this top-level
+# component directory, and named either mca_<type>_<name>.la (for DSO
+# builds) or libmca_<type>_<name>.la (for static builds).  The OMPI
+# build system will have set the
+# MCA_BUILD_ompi_<framework>_<component>_DSO AM_CONDITIONAL to indicate
+# which way this component should be built.
+specialized_op_libs =
+if MCA_BUILD_ompi_op_has_rvv_support
+specialized_op_libs += liblocal_ops_rvv.la
+liblocal_ops_rvv_la_SOURCES = op_riscv64_functions.c
+liblocal_ops_rvv_la_CPPFLAGS = -DGENERATE_RVV_CODE=1
+endif
+
+component_noinst  = $(specialized_op_libs)
+if MCA_BUILD_ompi_op_riscv64_DSO
+component_install = mca_op_riscv64.la
+else
+component_install =
+component_noinst  += libmca_op_riscv64.la
+endif
+
+# Specific information for DSO builds.
+#
+# The DSO should install itself in $(ompilibdir) (by default,
+# $prefix/lib/openmpi).
+
+mcacomponentdir = $(ompilibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_op_riscv64_la_SOURCES = $(sources)
+mca_op_riscv64_la_LIBADD = $(specialized_op_libs)
+mca_op_riscv64_la_LDFLAGS = -module -avoid-version $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
+
+
+# Specific information for static builds.
+#
+# Note that we *must* "noinst"; the upper-layer Makefile.am's will
+# slurp in the resulting .la library into libmpi.
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_op_riscv64_la_SOURCES = $(sources)
+libmca_op_riscv64_la_LIBADD = $(specialized_op_libs)
+libmca_op_riscv64_la_LDFLAGS = -module -avoid-version
diff --git a/ompi/mca/op/riscv64/configure.m4 b/ompi/mca/op/riscv64/configure.m4
new file mode 100644
index 00000000000..dca045825d3
--- /dev/null
+++ b/ompi/mca/op/riscv64/configure.m4
@@ -0,0 +1,64 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2019-2020 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2025      Software System Team, SANECHIPS.  All rights reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_ompi_op_riscv_CONFIG([action-if-can-compile],
+#                        [action-if-cant-compile])
+# ------------------------------------------------
+AC_DEFUN([MCA_ompi_op_riscv64_CONFIG],[
+    AC_CONFIG_FILES([ompi/mca/op/riscv64/Makefile])
+    case "${host}" in
+        riscv64*)
+            op_riscv64_check="yes";;
+        *)
+            op_riscv64_check="no";;
+    esac
+    AS_IF([test "$op_riscv64_check" = "yes"],
+          [AC_LANG_PUSH([C])
+
+           #
+           # Check for RVV support
+           #
+           AC_CACHE_CHECK([for RVV support], op_cv_rvv_support, 
+                [
+                  AC_LINK_IFELSE(
+                      [AC_LANG_PROGRAM([[
+#if defined(__riscv) && defined(__riscv_v) && __riscv_xlen == 64
+#include <riscv_vector.h>
+#else
+#error "Not a 64-bit RISC-V target"
+#endif
+                                       ]],
+                                       [[
+#if defined(__riscv) && defined(__riscv_v) && __riscv_xlen == 64
+    size_t vl = __riscv_vsetvl_e32m1(4);
+#endif
+                                       ]])],
+                      [op_cv_rvv_support=yes],
+                      [op_cv_rvv_support=no])])
+           AC_LANG_POP
+])
+    AM_CONDITIONAL([MCA_BUILD_ompi_op_has_rvv_support],
+                   [test "$op_cv_rvv_support" = "yes"])
+
+    AC_SUBST(MCA_BUILD_ompi_op_has_rvv_support)
+
+    AS_IF([test "$op_cv_rvv_support" = "yes"],
+          [AC_DEFINE([OMPI_MCA_OP_HAVE_RVV], [1],[RVV supported in the current build])],
+          [AC_DEFINE([OMPI_MCA_OP_HAVE_RVV], [0],[RVV not supported in the current build])])
+
+    # If we have at least support for Neon or SVE
+    AS_IF([test "$op_cv_rvv_support" = "yes"],
+          [$1],
+          [$2])
+])dnl
diff --git a/ompi/mca/op/riscv64/op_riscv64.h b/ompi/mca/op/riscv64/op_riscv64.h
new file mode 100644
index 00000000000..303758ec24a
--- /dev/null
+++ b/ompi/mca/op/riscv64/op_riscv64.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2019      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2025      Software System Team, SANECHIPS.  All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_OP_RISCV64_EXPORT_H
+#define MCA_OP_RISCV64_EXPORT_H
+
+#include "ompi_config.h"
+
+#include "ompi/mca/mca.h"
+#include "opal/class/opal_object.h"
+
+#include "ompi/mca/op/op.h"
+
+BEGIN_C_DECLS
+
+/**
+ * Derive a struct from the base op component struct, allowing us to
+ * cache some component-specific information on our well-known
+ * component struct.
+ */
+typedef struct {
+    /** The base op component struct */
+    ompi_op_base_component_1_0_0_t super;
+
+    /* What follows is riscv64-component-specific cached information.  We
+       tend to use this scheme (caching information on the riscv64
+       component itself) instead of lots of individual global
+       variables for the component.  */
+
+    /** A simple boolean indicating that the hardware is available. */
+    bool hardware_available;
+
+    /** A simple boolean indicating whether double precision is
+        supported. */
+    bool double_supported;
+} ompi_op_riscv64_component_t;
+
+/**
+ * Globally exported variable.  Note that it is a *riscv64* component
+ * (defined above), which has the ompi_op_base_component_t as its
+ * first member.  Hence, the MCA/op framework will find the data that
+ * it expects in the first memory locations, but then the component
+ * itself can cache additional information after that that can be used
+ * by both the component and modules.
+ */
+OMPI_DECLSPEC extern ompi_op_riscv64_component_t
+    mca_op_riscv64_component;
+
+END_C_DECLS
+
+#endif /* MCA_OP_RISCV64_EXPORT_H */
diff --git a/ompi/mca/op/riscv64/op_riscv64_component.c b/ompi/mca/op/riscv64/op_riscv64_component.c
new file mode 100644
index 00000000000..72d722e1861
--- /dev/null
+++ b/ompi/mca/op/riscv64/op_riscv64_component.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2019      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2025      Software System Team, SANECHIPS.  All rights
+ *                         reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/** @file
+ *
+ * This is the "RISCV64" component source code.
+ *
+ */
+
+#include "ompi_config.h"
+
+#include "opal/util/printf.h"
+
+#include "ompi/constants.h"
+#include "ompi/mca/op/riscv64/op_riscv64.h"
+#include "ompi/mca/op/base/base.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/op/op.h"
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+
+static int mca_op_riscv64_component_open(void);
+static int mca_op_riscv64_component_close(void);
+static int mca_op_riscv64_component_init_query(bool enable_progress_threads,
+                                     bool enable_mpi_thread_multiple);
+static struct ompi_op_base_module_1_0_0_t *
+    mca_op_riscv64_component_op_query(struct ompi_op_t *op, int *priority);
+static int mca_op_riscv64_component_register(void);
+
+ompi_op_riscv64_component_t mca_op_riscv64_component = {
+    /* First, the mca_base_component_t struct containing meta
+       information about the component itself */
+    {
+        .opc_version = {
+            OMPI_OP_BASE_VERSION_1_0_0,
+
+            .mca_component_name = "riscv64",
+            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
+                                  OMPI_RELEASE_VERSION),
+            .mca_open_component = mca_op_riscv64_component_open,
+            .mca_close_component = mca_op_riscv64_component_close,
+            .mca_register_component_params = mca_op_riscv64_component_register,
+        },
+        .opc_data = {
+            /* The component is checkpoint ready */
+            MCA_BASE_METADATA_PARAM_CHECKPOINT
+        },
+
+        .opc_init_query = mca_op_riscv64_component_init_query,
+        .opc_op_query = mca_op_riscv64_component_op_query,
+    },
+};
+MCA_BASE_COMPONENT_INIT(ompi, op, riscv64)
+
+/*
+ * Component open
+ */
+static int mca_op_riscv64_component_open(void)
+{
+
+    /* A first level check to see if RVV extension is even available in this
+       process.  E.g., you may want to do a first-order check to see
+       if hardware is available.  If so, return OMPI_SUCCESS.  If not,
+       return anything other than OMPI_SUCCESS and the component will
+       silently be ignored.
+
+       Note that if this function returns non-OMPI_SUCCESS, then this
+       component won't even be shown in ompi_info output (which is
+       probably not what you want).
+    */
+
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Component close
+ */
+static int mca_op_riscv64_component_close(void)
+{
+
+    /* If the riscv64 was opened successfully, close it (i.e., release any
+       resources that may have been allocated on this component).
+       Note that _component_close() will always be called at the end
+       of the process, so it may have been after any/all of the other
+       component functions have been invoked (and possibly even after
+       modules have been created and/or destroyed). */
+
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Register MCA params.
+ */
+static int mca_op_riscv64_component_register(void)
+{
+    unsigned long hwcap = getauxval(AT_HWCAP);
+    mca_op_riscv64_component.hardware_available = hwcap & COMPAT_HWCAP_ISA_V;  /* Check for V extension */
+    (void) mca_base_component_var_register(&mca_op_riscv64_component.super.opc_version,
+                                           "hardware_available",
+                                           "Whether the RVV hardware is available",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_op_riscv64_component.hardware_available);
+
+    mca_op_riscv64_component.double_supported = hwcap & COMPAT_HWCAP_ISA_D;  /* Check for D extension */
+    (void) mca_base_component_var_register(&mca_op_riscv64_component.super.opc_version,
+                                           "double_supported",
+                                           "Whether the double precision data types are supported or not",
+                                           MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &mca_op_riscv64_component.double_supported);
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Query whether this component wants to be used in this process.
+ */
+static int mca_op_riscv64_component_init_query(bool enable_progress_threads,
+                                               bool enable_mpi_thread_multiple)
+{
+    if (mca_op_riscv64_component.hardware_available) {
+        return OMPI_SUCCESS;
+    }
+    return OMPI_ERR_NOT_SUPPORTED;
+}
+
+#if OMPI_MCA_OP_HAVE_RVV
+extern ompi_op_base_handler_fn_t
+ompi_op_riscv64_functions_rvv[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+extern ompi_op_base_3buff_handler_fn_t
+ompi_op_riscv64_3buff_functions_rvv[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+#endif  /* OMPI_MCA_OP_HAVE_RVV */
+
+/*
+ * Query whether this component can be used for a specific op
+ */
+static struct ompi_op_base_module_1_0_0_t *
+    mca_op_riscv64_component_op_query(struct ompi_op_t *op, int *priority)
+{
+     ompi_op_base_module_t *module = NULL;
+    /* Sanity check -- although the framework should never invoke the
+       _component_op_query() on non-intrinsic MPI_Op's, we'll put a
+       check here just to be sure. */
+    if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) {
+        return NULL;
+    }
+
+    switch (op->o_f_to_c_index) {
+    case OMPI_OP_BASE_FORTRAN_MAX:
+    case OMPI_OP_BASE_FORTRAN_MIN:
+    case OMPI_OP_BASE_FORTRAN_SUM:
+    case OMPI_OP_BASE_FORTRAN_PROD:
+    case OMPI_OP_BASE_FORTRAN_BOR:
+    case OMPI_OP_BASE_FORTRAN_BAND:
+    case OMPI_OP_BASE_FORTRAN_BXOR:
+        module = OBJ_NEW(ompi_op_base_module_t);
+        for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
+            module->opm_fns[i] = NULL;
+            module->opm_3buff_fns[i] = NULL;
+#if OMPI_MCA_OP_HAVE_RVV
+            if( mca_op_riscv64_component.hardware_available ) {
+                module->opm_fns[i] = ompi_op_riscv64_functions_rvv[op->o_f_to_c_index][i];
+                module->opm_3buff_fns[i] = ompi_op_riscv64_3buff_functions_rvv[op->o_f_to_c_index][i];
+            }
+#endif  /* OMPI_MCA_OP_HAVE_RVV */
+        }
+        break;
+    case OMPI_OP_BASE_FORTRAN_LAND:
+    case OMPI_OP_BASE_FORTRAN_LOR:
+    case OMPI_OP_BASE_FORTRAN_LXOR:
+    case OMPI_OP_BASE_FORTRAN_MAXLOC:
+    case OMPI_OP_BASE_FORTRAN_MINLOC:
+    case OMPI_OP_BASE_FORTRAN_REPLACE:
+    default:
+        break;
+    }
+    /* If we got a module from above, we'll return it.  Otherwise,
+       we'll return NULL, indicating that this component does not want
+       to be considered for selection for this MPI_Op.  Note that the
+       functions each returned a *riscv64* component pointer
+       (vs. a *base* component pointer -- where an *riscv64* component
+       is a base component plus some other module-specific cached
+       information), so we have to cast it to the right pointer type
+       before returning. */
+    if (NULL != module) {
+        *priority = 50;
+    }
+    return (ompi_op_base_module_1_0_0_t *) module;
+}
diff --git a/ompi/mca/op/riscv64/op_riscv64_functions.c b/ompi/mca/op/riscv64/op_riscv64_functions.c
new file mode 100644
index 00000000000..b98a0ba5c83
--- /dev/null
+++ b/ompi/mca/op/riscv64/op_riscv64_functions.c
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2019      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2025      Software System Team, SANECHIPS.  All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#include "opal/util/output.h"
+
+#include "ompi/op/op.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/base/base.h"
+#include "ompi/mca/op/riscv64/op_riscv64.h"
+
+#if GENERATE_RVV_CODE
+#    include <riscv_vector.h> 
+#    define APPEND _rvv
+#    define LMUL m4
+#endif
+
+/*
+ * Concatenate preprocessor tokens A and B without expanding macro definitions
+ * (however, if invoked from a macro, macro arguments are expanded).
+ */
+#define OP_CONCAT_NX(A, B) A ## B
+
+/*
+ * Concatenate preprocessor tokens A and B after macro-expanding them.
+ */
+#define OP_CONCAT(A, B) OP_CONCAT_NX(A, B)
+
+/*
+ * Since all the functions in this file are essentially identical, we
+ * use a macro to substitute in names and types.  The core operation
+ * in all functions that use this macro is the same.
+ *
+ * This macro is for (out op in).
+ *
+ */
+
+typedef float float32_t;
+typedef double float64_t;
+
+#define RVV_FUNC(A)  OP_CONCAT(__riscv_, OP_CONCAT(A, LMUL))
+#define RVV_TYPE(A)  OP_CONCAT(OP_CONCAT(A, LMUL), _t)
+
+#if GENERATE_RVV_CODE
+#define OP_RISCV64_FUNC(name, type_abbr, type_size, type, op)                                           \
+    static void OP_CONCAT(ompi_op_riscv64_2buff_##name##_##type##type_size##_t, APPEND)                 \
+        (const void *_in, void *_out, int *count,                                                       \
+        struct ompi_datatype_t **dtype,                                                                 \
+        struct ompi_op_base_module_1_0_0_t *module)                                                     \
+    {                                                                                                   \
+        size_t vl;                                                                                      \
+        const int cnt = *count;                                                                         \
+        type##type_size##_t *in = (type##type_size##_t *) _in,                                          \
+                            *out = (type##type_size##_t *) _out;                                        \
+        for (size_t i = 0; i < cnt; i += vl) {                                                          \
+            vl = RVV_FUNC(vsetvl_e##type_size)(cnt - i);                                                \
+            RVV_TYPE(v##type##type_size) vsrc = RVV_FUNC(vle##type_size##_v_##type_abbr)(&in[i], vl);   \
+            RVV_TYPE(v##type##type_size) vdst = RVV_FUNC(vle##type_size##_v_##type_abbr)(&out[i], vl);  \
+            vdst = RVV_FUNC(op##_vv_##type_abbr)(vdst, vsrc, vl);                                       \
+            RVV_FUNC(vse##type_size##_v_##type_abbr)(&out[i], vdst, vl);                                \
+        }                                                                                               \
+    }
+#endif
+
+/*************************************************************************
+ * Max
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) > (b) ? (a) : (b))
+    OP_RISCV64_FUNC(max, i8, 8, int, vmax)
+    OP_RISCV64_FUNC(max, u8, 8, uint, vmaxu)
+    OP_RISCV64_FUNC(max, i16, 16, int, vmax)
+    OP_RISCV64_FUNC(max, u16, 16, uint, vmaxu)
+    OP_RISCV64_FUNC(max, i32, 32, int, vmax)
+    OP_RISCV64_FUNC(max, u32, 32, uint, vmaxu)
+    OP_RISCV64_FUNC(max, i64, 64, int, vmax)
+    OP_RISCV64_FUNC(max, u64, 64, uint, vmaxu)
+    OP_RISCV64_FUNC(max, f32, 32, float, vfmax)
+    OP_RISCV64_FUNC(max, f64, 64, float, vfmax)
+
+/*************************************************************************
+ * Min
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) < (b) ? (a) : (b))
+    OP_RISCV64_FUNC(min, i8, 8, int, vmin)
+    OP_RISCV64_FUNC(min, u8, 8, uint, vminu)
+    OP_RISCV64_FUNC(min, i16, 16, int, vmin)
+    OP_RISCV64_FUNC(min, u16, 16, uint, vminu)
+    OP_RISCV64_FUNC(min, i32, 32, int, vmin)
+    OP_RISCV64_FUNC(min, u32, 32, uint, vminu)
+    OP_RISCV64_FUNC(min, i64, 64, int, vmin)
+    OP_RISCV64_FUNC(min, u64, 64, uint, vminu)
+    OP_RISCV64_FUNC(min, f32, 32, float, vfmin)
+    OP_RISCV64_FUNC(min, f64, 64, float, vfmin)
+
+/*************************************************************************
+* Sum
+************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) + (b))
+    OP_RISCV64_FUNC(sum, i8, 8, int, vadd)
+    OP_RISCV64_FUNC(sum, u8, 8, uint, vadd)
+    OP_RISCV64_FUNC(sum, i16, 16, int, vadd)
+    OP_RISCV64_FUNC(sum, u16, 16, uint, vadd)
+    OP_RISCV64_FUNC(sum, i32, 32, int, vadd)
+    OP_RISCV64_FUNC(sum, u32, 32, uint, vadd)
+    OP_RISCV64_FUNC(sum, i64, 64, int, vadd)
+    OP_RISCV64_FUNC(sum, u64, 64, uint, vadd)
+    OP_RISCV64_FUNC(sum, f32, 32, float, vfadd)
+    OP_RISCV64_FUNC(sum, f64, 64, float, vfadd)
+
+/*************************************************************************
+ * Product
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) * (b))
+    OP_RISCV64_FUNC(prod, i8, 8, int, vmul)
+    OP_RISCV64_FUNC(prod, u8, 8, uint, vmul)
+    OP_RISCV64_FUNC(prod, i16, 16, int, vmul)
+    OP_RISCV64_FUNC(prod, u16, 16, uint, vmul)
+    OP_RISCV64_FUNC(prod, i32, 32, int, vmul)
+    OP_RISCV64_FUNC(prod, u32, 32, uint, vmul)
+    OP_RISCV64_FUNC(prod, i64, 64, int, vmul)
+    OP_RISCV64_FUNC(prod, u64, 64, uint, vmul)
+    OP_RISCV64_FUNC(prod, f32, 32, float, vfmul)
+    OP_RISCV64_FUNC(prod, f64, 64, float, vfmul)
+
+/*************************************************************************
+ * Bitwise AND
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) & (b))
+    OP_RISCV64_FUNC(band, i8, 8, int, vand)
+    OP_RISCV64_FUNC(band, u8, 8, uint, vand)
+    OP_RISCV64_FUNC(band, i16, 16, int, vand)
+    OP_RISCV64_FUNC(band, u16, 16, uint, vand)
+    OP_RISCV64_FUNC(band, i32, 32, int, vand)
+    OP_RISCV64_FUNC(band, u32, 32, uint, vand)
+    OP_RISCV64_FUNC(band, i64, 64, int, vand)
+    OP_RISCV64_FUNC(band, u64, 64, uint, vand)
+
+ /*************************************************************************
+ * Bitwise OR
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) | (b))
+    OP_RISCV64_FUNC(bor, i8, 8, int, vor)
+    OP_RISCV64_FUNC(bor, u8, 8, uint, vor)
+    OP_RISCV64_FUNC(bor, i16, 16, int, vor)
+    OP_RISCV64_FUNC(bor, u16, 16, uint, vor)
+    OP_RISCV64_FUNC(bor, i32, 32, int, vor)
+    OP_RISCV64_FUNC(bor, u32, 32, uint, vor)
+    OP_RISCV64_FUNC(bor, i64, 64, int, vor)
+    OP_RISCV64_FUNC(bor, u64, 64, uint, vor)
+
+/*************************************************************************
+ * Bitwise XOR
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) ^ (b))
+    OP_RISCV64_FUNC(bxor, i8, 8, int, vxor)
+    OP_RISCV64_FUNC(bxor, u8, 8, uint, vxor)
+    OP_RISCV64_FUNC(bxor, i16, 16, int, vxor)
+    OP_RISCV64_FUNC(bxor, u16, 16, uint, vxor)
+    OP_RISCV64_FUNC(bxor, i32, 32, int, vxor)
+    OP_RISCV64_FUNC(bxor, u32, 32, uint, vxor)
+    OP_RISCV64_FUNC(bxor, i64, 64, int, vxor)
+    OP_RISCV64_FUNC(bxor, u64, 64, uint, vxor)
+
+/*
+*  This is a three buffer (2 input and 1 output) version of the reduction
+*  routines.
+*/
+#if GENERATE_RVV_CODE
+#define OP_RISCV64_FUNC_3BUFF(name, type_abbr, type_size, type, op)                                     \
+    static void OP_CONCAT(ompi_op_riscv64_3buff_##name##_##type##type_size##_t, APPEND)                 \
+        (const void *_in1, const void *_in2, void *_out, int *count,                                    \
+        struct ompi_datatype_t **dtype,                                                                 \
+        struct ompi_op_base_module_1_0_0_t *module)                                                     \
+    {                                                                                                   \
+        size_t vl;                                                                                      \
+        type##type_size##_t *in1 = (type##type_size##_t *) _in1,                                        \
+                            *in2 = (type##type_size##_t *) _in2,                                        \
+                            *out = (type##type_size##_t *) _out;                                        \
+        for (size_t i = 0, cnt = *count; i < cnt; i += vl) {                                            \
+            vl = RVV_FUNC(vsetvl_e##type_size)(cnt - i);                                                \
+            RVV_TYPE(v##type##type_size) vsrc1 = RVV_FUNC(vle##type_size##_v_##type_abbr)(&in1[i], vl); \
+            RVV_TYPE(v##type##type_size) vsrc2 = RVV_FUNC(vle##type_size##_v_##type_abbr)(&in2[i], vl); \
+            RVV_TYPE(v##type##type_size) vdst = RVV_FUNC(op##_vv_##type_abbr)(vsrc1, vsrc2, vl);        \
+            RVV_FUNC(vse##type_size##_v_##type_abbr)(&out[i], vdst, vl);                                \
+        }                                                                                               \
+    }
+#endif
+
+/*************************************************************************
+ * Max
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) > (b) ? (a) : (b))
+    OP_RISCV64_FUNC_3BUFF(max, i8, 8, int, vmax)
+    OP_RISCV64_FUNC_3BUFF(max, u8, 8, uint, vmaxu)
+    OP_RISCV64_FUNC_3BUFF(max, i16, 16, int, vmax)
+    OP_RISCV64_FUNC_3BUFF(max, u16, 16, uint, vmaxu)
+    OP_RISCV64_FUNC_3BUFF(max, i32, 32, int, vmax)
+    OP_RISCV64_FUNC_3BUFF(max, u32, 32, uint, vmaxu)
+    OP_RISCV64_FUNC_3BUFF(max, i64, 64, int, vmax)
+    OP_RISCV64_FUNC_3BUFF(max, u64, 64, uint, vmaxu)
+    OP_RISCV64_FUNC_3BUFF(max, f32, 32, float, vfmax)
+    OP_RISCV64_FUNC_3BUFF(max, f64, 64, float, vfmax)
+
+/*************************************************************************
+* Min
+*************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) < (b) ? (a) : (b))
+    OP_RISCV64_FUNC_3BUFF(min, i8, 8, int, vmin)
+    OP_RISCV64_FUNC_3BUFF(min, u8, 8, uint, vminu)
+    OP_RISCV64_FUNC_3BUFF(min, i16, 16, int, vmin)
+    OP_RISCV64_FUNC_3BUFF(min, u16, 16, uint, vminu)
+    OP_RISCV64_FUNC_3BUFF(min, i32, 32, int, vmin)
+    OP_RISCV64_FUNC_3BUFF(min, u32, 32, uint, vminu)
+    OP_RISCV64_FUNC_3BUFF(min, i64, 64, int, vmin)
+    OP_RISCV64_FUNC_3BUFF(min, u64, 64, uint, vminu)
+    OP_RISCV64_FUNC_3BUFF(min, f32, 32, float, vfmin)
+    OP_RISCV64_FUNC_3BUFF(min, f64, 64, float, vfmin)
+
+ /*************************************************************************
+ * Sum
+ ************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) + (b))
+    OP_RISCV64_FUNC_3BUFF(sum, i8, 8, int, vadd)
+    OP_RISCV64_FUNC_3BUFF(sum, u8, 8, uint, vadd)
+    OP_RISCV64_FUNC_3BUFF(sum, i16, 16, int, vadd)
+    OP_RISCV64_FUNC_3BUFF(sum, u16, 16, uint, vadd)
+    OP_RISCV64_FUNC_3BUFF(sum, i32, 32, int, vadd)
+    OP_RISCV64_FUNC_3BUFF(sum, u32, 32, uint, vadd)
+    OP_RISCV64_FUNC_3BUFF(sum, i64, 64, int, vadd)
+    OP_RISCV64_FUNC_3BUFF(sum, u64, 64, uint, vadd)
+    OP_RISCV64_FUNC_3BUFF(sum, f32, 32, float, vfadd)
+    OP_RISCV64_FUNC_3BUFF(sum, f64, 64, float, vfadd)
+
+/*************************************************************************
+ * Product
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) * (b))
+    OP_RISCV64_FUNC_3BUFF(prod, i8, 8, int, vmul)
+    OP_RISCV64_FUNC_3BUFF(prod, u8, 8, uint, vmul)
+    OP_RISCV64_FUNC_3BUFF(prod, i16, 16, int, vmul)
+    OP_RISCV64_FUNC_3BUFF(prod, u16, 16, uint, vmul)
+    OP_RISCV64_FUNC_3BUFF(prod, i32, 32, int, vmul)
+    OP_RISCV64_FUNC_3BUFF(prod, u32, 32, uint, vmul)
+    OP_RISCV64_FUNC_3BUFF(prod, i64, 64, int, vmul)
+    OP_RISCV64_FUNC_3BUFF(prod, u64, 64, uint, vmul)
+    OP_RISCV64_FUNC_3BUFF(prod, f32, 32, float, vfmul)
+    OP_RISCV64_FUNC_3BUFF(prod, f64, 64, float, vfmul)
+
+/*************************************************************************
+ * Bitwise AND
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) & (b))
+    OP_RISCV64_FUNC_3BUFF(band, i8, 8, int, vand)
+    OP_RISCV64_FUNC_3BUFF(band, u8, 8, uint, vand)
+    OP_RISCV64_FUNC_3BUFF(band, i16, 16, int, vand)
+    OP_RISCV64_FUNC_3BUFF(band, u16, 16, uint, vand)
+    OP_RISCV64_FUNC_3BUFF(band, i32, 32, int, vand)
+    OP_RISCV64_FUNC_3BUFF(band, u32, 32, uint, vand)
+    OP_RISCV64_FUNC_3BUFF(band, i64, 64, int, vand)
+    OP_RISCV64_FUNC_3BUFF(band, u64, 64, uint, vand)
+
+ /*************************************************************************
+ * Bitwise OR
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) | (b))
+    OP_RISCV64_FUNC_3BUFF(bor, i8, 8, int, vor)
+    OP_RISCV64_FUNC_3BUFF(bor, u8, 8, uint, vor)
+    OP_RISCV64_FUNC_3BUFF(bor, i16, 16, int, vor)
+    OP_RISCV64_FUNC_3BUFF(bor, u16, 16, uint, vor)
+    OP_RISCV64_FUNC_3BUFF(bor, i32, 32, int, vor)
+    OP_RISCV64_FUNC_3BUFF(bor, u32, 32, uint, vor)
+    OP_RISCV64_FUNC_3BUFF(bor, i64, 64, int, vor)
+    OP_RISCV64_FUNC_3BUFF(bor, u64, 64, uint, vor)
+
+/*************************************************************************
+ * Bitwise XOR
+ *************************************************************************/
+#undef current_func
+#define current_func(a, b) ((a) ^ (b))
+    OP_RISCV64_FUNC_3BUFF(bxor, i8, 8, int, vxor)
+    OP_RISCV64_FUNC_3BUFF(bxor, u8, 8, uint, vxor)
+    OP_RISCV64_FUNC_3BUFF(bxor, i16, 16, int, vxor)
+    OP_RISCV64_FUNC_3BUFF(bxor, u16, 16, uint, vxor)
+    OP_RISCV64_FUNC_3BUFF(bxor, i32, 32, int, vxor)
+    OP_RISCV64_FUNC_3BUFF(bxor, u32, 32, uint, vxor)
+    OP_RISCV64_FUNC_3BUFF(bxor, i64, 64, int, vxor)
+    OP_RISCV64_FUNC_3BUFF(bxor, u64, 64, uint, vxor)
+
+    /** C integer ***********************************************************/
+#define C_INTEGER_BASE(name, ftype)                                         \
+    [OMPI_OP_BASE_TYPE_INT8_T]   = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_int8_t, APPEND), \
+    [OMPI_OP_BASE_TYPE_UINT8_T]  = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_uint8_t, APPEND), \
+    [OMPI_OP_BASE_TYPE_INT16_T]  = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_int16_t, APPEND), \
+    [OMPI_OP_BASE_TYPE_UINT16_T] = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_uint16_t, APPEND), \
+    [OMPI_OP_BASE_TYPE_INT32_T]  = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_int32_t, APPEND), \
+    [OMPI_OP_BASE_TYPE_UINT32_T] = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_uint32_t, APPEND)
+#define C_INTEGER_EX(name, ftype)                                         \
+    [OMPI_OP_BASE_TYPE_INT64_T]  = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_int64_t, APPEND), \
+    [OMPI_OP_BASE_TYPE_UINT64_T] = OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_uint64_t, APPEND)
+
+    /** Floating point, including all the Fortran reals *********************/
+#define FLOAT(name, ftype)  OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_float32_t, APPEND)
+#define DOUBLE(name, ftype) OP_CONCAT(ompi_op_riscv64_##ftype##_##name##_float64_t, APPEND)
+
+#define FLOATING_POINT(name, ftype)                                    \
+    [OMPI_OP_BASE_TYPE_FLOAT] = FLOAT(name, ftype),                    \
+    [OMPI_OP_BASE_TYPE_DOUBLE] = DOUBLE(name, ftype)
+
+/*
+ * MPI_OP_NULL
+ * All types
+ */
+#define FLAGS_NO_FLOAT \
+        (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | OMPI_OP_FLAGS_COMMUTE)
+#define FLAGS \
+        (OMPI_OP_FLAGS_INTRINSIC | OMPI_OP_FLAGS_ASSOC | \
+         OMPI_OP_FLAGS_FLOAT_ASSOC | OMPI_OP_FLAGS_COMMUTE)
+
+    ompi_op_base_handler_fn_t OP_CONCAT(ompi_op_riscv64_functions, APPEND) [OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] =
+{
+    /* Corresponds to MPI_OP_NULL */
+    [OMPI_OP_BASE_FORTRAN_NULL] = {
+        /* Leaving this empty puts in NULL for all entries */
+        NULL,
+    },
+    /* Corresponds to MPI_MAX */
+    [OMPI_OP_BASE_FORTRAN_MAX] = {
+        C_INTEGER_BASE(max, 2buff),
+        C_INTEGER_EX(max, 2buff),
+        FLOATING_POINT(max, 2buff),
+    },
+    /* Corresponds to MPI_MIN */
+    [OMPI_OP_BASE_FORTRAN_MIN] = {
+        C_INTEGER_BASE(min, 2buff),
+        C_INTEGER_EX(min, 2buff),
+        FLOATING_POINT(min, 2buff),
+    },
+    /* Corresponds to MPI_SUM */
+    [OMPI_OP_BASE_FORTRAN_SUM] = {
+        C_INTEGER_BASE(sum, 2buff),
+        C_INTEGER_EX(sum, 2buff),
+        FLOATING_POINT(sum, 2buff),
+    },
+    /* Corresponds to MPI_PROD */
+    [OMPI_OP_BASE_FORTRAN_PROD] = {
+        C_INTEGER_BASE(prod, 2buff),
+        C_INTEGER_EX(prod, 2buff),
+        FLOATING_POINT(prod, 2buff),
+    },
+    /* Corresponds to MPI_LAND */
+    [OMPI_OP_BASE_FORTRAN_LAND] = {
+        NULL,
+    },
+    /* Corresponds to MPI_BAND */
+    [OMPI_OP_BASE_FORTRAN_BAND] = {
+        C_INTEGER_BASE(band, 2buff),
+        C_INTEGER_EX(band, 2buff),
+    },
+    /* Corresponds to MPI_LOR */
+    [OMPI_OP_BASE_FORTRAN_LOR] = {
+        NULL,
+    },
+    /* Corresponds to MPI_BOR */
+    [OMPI_OP_BASE_FORTRAN_BOR] = {
+        C_INTEGER_BASE(bor, 2buff),
+        C_INTEGER_EX(bor, 2buff),
+    },
+    /* Corresponds to MPI_LXOR */
+    [OMPI_OP_BASE_FORTRAN_LXOR] = {
+        NULL,
+    },
+    /* Corresponds to MPI_BXOR */
+    [OMPI_OP_BASE_FORTRAN_BXOR] = {
+        C_INTEGER_BASE(bxor, 2buff),
+        C_INTEGER_EX(bxor, 2buff),
+    },
+    /* Corresponds to MPI_REPLACE */
+    [OMPI_OP_BASE_FORTRAN_REPLACE] = {
+        /* (MPI_ACCUMULATE is handled differently than the other
+           reductions, so just zero out its function
+           implementations here to ensure that users don't invoke
+           MPI_REPLACE with any reduction operations other than
+           ACCUMULATE) */
+        NULL,
+    },
+
+};
+
+    ompi_op_base_3buff_handler_fn_t OP_CONCAT(ompi_op_riscv64_3buff_functions, APPEND)[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] =
+{
+    /* Corresponds to MPI_OP_NULL */
+    [OMPI_OP_BASE_FORTRAN_NULL] = {
+        /* Leaving this empty puts in NULL for all entries */
+        NULL,
+    },
+    /* Corresponds to MPI_MAX */
+    [OMPI_OP_BASE_FORTRAN_MAX] = {
+        C_INTEGER_BASE(max, 3buff),
+        C_INTEGER_EX(max, 3buff),
+        FLOATING_POINT(max, 3buff),
+    },
+    /* Corresponds to MPI_MIN */
+    [OMPI_OP_BASE_FORTRAN_MIN] = {
+        C_INTEGER_BASE(min, 3buff),
+        C_INTEGER_EX(min, 3buff),
+        FLOATING_POINT(min, 3buff),
+    },
+    /* Corresponds to MPI_SUM */
+    [OMPI_OP_BASE_FORTRAN_SUM] = {
+        C_INTEGER_BASE(sum, 3buff),
+        C_INTEGER_EX(sum, 3buff),
+        FLOATING_POINT(sum, 3buff),
+    },
+    /* Corresponds to MPI_PROD */
+    [OMPI_OP_BASE_FORTRAN_PROD] = {
+        C_INTEGER_BASE(prod, 3buff),
+        C_INTEGER_EX(prod, 3buff),
+        FLOATING_POINT(prod, 3buff),
+    },
+    /* Corresponds to MPI_LAND */
+    [OMPI_OP_BASE_FORTRAN_LAND] ={
+        NULL,
+    },
+    /* Corresponds to MPI_BAND */
+    [OMPI_OP_BASE_FORTRAN_BAND] = {
+        C_INTEGER_BASE(band, 3buff),
+        C_INTEGER_EX(band, 3buff),
+    },
+    /* Corresponds to MPI_LOR */
+    [OMPI_OP_BASE_FORTRAN_LOR] = {
+        NULL,
+    },
+    /* Corresponds to MPI_BOR */
+    [OMPI_OP_BASE_FORTRAN_BOR] = {
+        C_INTEGER_BASE(bor, 3buff),
+        C_INTEGER_EX(bor, 3buff),
+    },
+    /* Corresponds to MPI_LXOR */
+    [OMPI_OP_BASE_FORTRAN_LXOR] = {
+        NULL,
+    },
+    /* Corresponds to MPI_BXOR */
+    [OMPI_OP_BASE_FORTRAN_BXOR] = {
+        C_INTEGER_BASE(bxor, 3buff),
+        C_INTEGER_EX(bxor, 3buff),
+    },
+    /* Corresponds to MPI_REPLACE */
+    [OMPI_OP_BASE_FORTRAN_REPLACE] = {
+        /* MPI_ACCUMULATE is handled differently than the other
+           reductions, so just zero out its function
+           implementations here to ensure that users don't invoke
+           MPI_REPLACE with any reduction operations other than
+           ACCUMULATE */
+        NULL,
+    },
+};