Skip to content

Commit 19f1322

Browse files
committed
摩尔添加:softmax(含bf16)
1 parent f6198d6 commit 19f1322

File tree

5 files changed

+196
-1
lines changed

5 files changed

+196
-1
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#ifndef __SOFTMAX_MOORE_H__
#define __SOFTMAX_MOORE_H__

#include "../softmax.h"

// Declares the Moore (MUSA) backend softmax Descriptor via the shared
// DESCRIPTOR macro from ../softmax.h; the implementation lives in
// softmax_moore.mu.
DESCRIPTOR(moore)

#endif
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#include "../../../devices/moore/moore_common.h"
2+
#include "softmax_moore.h"
3+
4+
#include <cub/block/block_reduce.cuh>
5+
#include "../../../devices/moore/moore_kernel_common.h"
6+
7+
#include "../../../reduce/cuda/reduce.cuh"
8+
9+
#include "softmax_moore_kernel.h"
10+
11+
// Thin kernel entry point: INFINIOP_MOORE_KERNEL presumably expands to the
// device launch qualifiers (e.g. `__global__ void`) — confirm against
// moore_kernel_common.h. All work is delegated to softmaxKernel.
//
//   y / x     : output / input tensors (same shape, device memory)
//   othersize : number of softmax slices (product of all dims except the axis)
//   dimsize   : length of the softmax axis
//   stride    : element stride between consecutive entries along the axis
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MOORE_KERNEL softmax_kernel(
    Tdata *y, const Tdata *x,
    size_t othersize, size_t dimsize, ptrdiff_t stride) {
    softmaxKernel<BLOCK_SIZE, Tdata, Tcompute>(y, x, othersize, dimsize, stride);
}
17+
18+
namespace op::softmax::moore {

// Backend-private state: keeps the Moore device handle internals alive for
// the lifetime of the descriptor.
struct Descriptor::Opaque {
    std::shared_ptr<device::moore::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

// Validates the tensor pair via SoftmaxInfo and builds a descriptor bound to
// the Moore handle. Workspace size is 0 — this softmax needs no scratch memory.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    int axis) {
    auto info = SoftmaxInfo::create(y_desc, x_desc, axis);
    CHECK_RESULT(info);
    auto *moore_handle = reinterpret_cast<device::moore::Handle *>(handle);
    *desc_ptr = new Descriptor(
        new Opaque{moore_handle->internal()},
        info.take(), 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// Dispatches on the runtime dtype and launches one thread block per softmax
// slice (grid.x == othersize). F16/BF16 accumulate in float.
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype,
                            size_t othersize, size_t dimsize, ptrdiff_t stride,
                            musaStream_t stream) {
    const dim3 grid(static_cast<uint32_t>(othersize), 1, 1);
    switch (dtype) {
    case INFINI_DTYPE_F16:
        softmax_kernel<BLOCK_SIZE, half, float>
            <<<grid, BLOCK_SIZE, 0, stream>>>(
                reinterpret_cast<half *>(y),
                reinterpret_cast<const half *>(x),
                othersize, dimsize, stride);
        break;
    case INFINI_DTYPE_BF16:
        softmax_kernel<BLOCK_SIZE, __mt_bfloat16, float>
            <<<grid, BLOCK_SIZE, 0, stream>>>(
                reinterpret_cast<__mt_bfloat16 *>(y),
                reinterpret_cast<const __mt_bfloat16 *>(x),
                othersize, dimsize, stride);
        break;
    case INFINI_DTYPE_F32:
        softmax_kernel<BLOCK_SIZE, float, float>
            <<<grid, BLOCK_SIZE, 0, stream>>>(
                reinterpret_cast<float *>(y),
                reinterpret_cast<const float *>(x),
                othersize, dimsize, stride);
        break;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}

// Asynchronously enqueues the softmax on `stream_`. The block size is chosen
// from the device's max-threads-per-block capability; workspace is unused.
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                     void *y,
                                     const void *x,
                                     void *stream_) const {
    auto stream = reinterpret_cast<musaStream_t>(stream_);
    const auto max_threads = _opaque->internal->maxThreadsPerBlock();
    if (max_threads == MOORE_BLOCK_SIZE_1024) {
        CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_1024>(
            y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
    } else if (max_threads == MOORE_BLOCK_SIZE_512) {
        CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_512>(
            y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::softmax::moore
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#ifndef __SOFTMAX_KERNEL_CUH__
#define __SOFTMAX_KERNEL_CUH__

// Numerically stable softmax along one axis, one thread block per slice.
//
// Launch contract (see the launcher in softmax_moore.mu):
//   - gridDim.x covers all slices; blockDim.x == BLOCK_SIZE.
//   - BLOCK_SIZE must be a power of two (the tree reductions below halve it).
//   - The exp/divide paths go through float (`expf`, float casts), so
//     Tcompute = float is the intended accumulation type.
//
// Three passes over the slice: (1) block-wide max, (2) y = exp(x - max) plus
// block-wide sum, (3) normalize y by the sum.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ void softmaxKernel(
    Tdata *y_, const Tdata *x_,
    size_t othersize, // = outer_size * inner_size (number of softmax slices)
    size_t dimsize,   // = axis_size (length of the softmax axis)
    ptrdiff_t stride  // = inner_size (element stride along the softmax axis)
) {
    size_t other_idx = blockIdx.x;
    // Guard for grids larger than the slice count; whole block exits together,
    // so no barrier below is reached divergently.
    if (other_idx >= othersize) return;

    // -----------------------------------
    // Locate the base of this block's softmax slice: decompose the flat slice
    // index into (outer, inner) coordinates around the softmax axis.
    // -----------------------------------
    size_t inner_idx = other_idx % stride;
    size_t outer_idx = other_idx / stride;

    const Tdata *x = x_ + outer_idx * dimsize * stride + inner_idx;
    Tdata *y = y_ + outer_idx * dimsize * stride + inner_idx;

    // ---------------------------
    // 1. block max (subtracted later so exp() cannot overflow)
    // ---------------------------
    __shared__ Tcompute s_reduce[BLOCK_SIZE];
    __shared__ Tcompute s_max;

    Tcompute local_max = -INFINITY;

    // Block-stride loop: thread t handles elements t, t+BLOCK_SIZE, ...
    for (size_t i = threadIdx.x; i < dimsize; i += BLOCK_SIZE) {
        Tcompute v = static_cast<Tcompute>(x[i * stride]);
        local_max = v > local_max ? v : local_max;
    }

    // Threads with no elements (dimsize < BLOCK_SIZE) contribute -INFINITY,
    // the identity for max, so the reduction stays correct.
    s_reduce[threadIdx.x] = local_max;
    __syncthreads();

    // Tree reduction over shared memory; needs BLOCK_SIZE to be a power of two.
    for (unsigned int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            s_reduce[threadIdx.x] =
                max(s_reduce[threadIdx.x], s_reduce[threadIdx.x + s]);
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) s_max = s_reduce[0];
    // Publishes s_max and also makes it safe to reuse s_reduce for the sum.
    __syncthreads();

    // ---------------------------
    // 2. exp & sum: store exp(x - max) into y, accumulate per-thread partials
    // ---------------------------
    Tcompute local_sum = 0;

    for (size_t i = threadIdx.x; i < dimsize; i += BLOCK_SIZE) {
        Tcompute v =
            expf(static_cast<float>(x[i * stride]) - static_cast<float>(s_max));
        y[i * stride] = static_cast<Tdata>(v);
        local_sum += v;
    }

    s_reduce[threadIdx.x] = local_sum;
    __syncthreads();

    // Same power-of-two tree reduction, this time summing the partials.
    for (unsigned int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            s_reduce[threadIdx.x] += s_reduce[threadIdx.x + s];
        }
        __syncthreads();
    }

    // Every thread reads the reduced total (valid after the final barrier).
    Tcompute sum = s_reduce[0];
    __syncthreads();

    // ---------------------------
    // 3. normalize: y /= sum (re-reads the exp values stored in pass 2)
    // ---------------------------
    for (size_t i = threadIdx.x; i < dimsize; i += BLOCK_SIZE) {
        y[i * stride] =
            static_cast<Tdata>(
                static_cast<float>(y[i * stride]) / static_cast<float>(sum));
    }
}

#endif // __SOFTMAX_KERNEL_CUH__

src/infiniop/ops/softmax/operator.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
66
#include "nvidia/softmax_nvidia.cuh"
77
#endif
8+
#ifdef ENABLE_MOORE_API
9+
#include "moore/softmax_moore.h"
10+
#endif
811

912
__C infiniStatus_t infiniopCreateSoftmaxDescriptor(
1013
infiniopHandle_t handle,
@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateSoftmaxDescriptor(
3336
#endif
3437
#ifdef ENABLE_HYGON_API
3538
CREATE(INFINI_DEVICE_HYGON, nvidia);
39+
#endif
40+
#ifdef ENABLE_MOORE_API
41+
CREATE(INFINI_DEVICE_MOORE, moore)
3642
#endif
3743
}
3844
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -57,6 +63,9 @@ __C infiniStatus_t infiniopGetSoftmaxWorkspaceSize(infiniopSoftmaxDescriptor_t d
5763
#endif
5864
#ifdef ENABLE_HYGON_API
5965
GET(INFINI_DEVICE_HYGON, nvidia);
66+
#endif
67+
#ifdef ENABLE_MOORE_API
68+
GET(INFINI_DEVICE_MOORE, moore)
6069
#endif
6170
}
6271
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -86,6 +95,9 @@ __C infiniStatus_t infiniopSoftmax(
8695
#endif
8796
#ifdef ENABLE_HYGON_API
8897
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
98+
#endif
99+
#ifdef ENABLE_MOORE_API
100+
CALCULATE(INFINI_DEVICE_MOORE, moore)
89101
#endif
90102
}
91103
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -110,6 +122,9 @@ __C infiniStatus_t infiniopDestroySoftmaxDescriptor(infiniopSoftmaxDescriptor_t
110122
#endif
111123
#ifdef ENABLE_HYGON_API
112124
DESTROY(INFINI_DEVICE_HYGON, nvidia);
125+
#endif
126+
#ifdef ENABLE_MOORE_API
127+
DESTROY(INFINI_DEVICE_MOORE, moore)
113128
#endif
114129
}
115130
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

test/infiniop/softmax.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,19 @@
3434
((1, 16, 512, 512), 1),
3535
((1, 16, 512, 512), 2),
3636
((1, 16, 512, 512), 3),
37+
((1, 32, 4096, 4096), 3), # GPT-3 / LLaMA attention
38+
((2, 16, 2048, 2048), 3),
39+
((4, 8, 1024, 1024), 3),
3740
]
3841

3942
# Data types used for testing
40-
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
43+
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
4144

4245
# Tolerance map for different data types
4346
_TOLERANCE_MAP = {
4447
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
4548
InfiniDtype.F32: {"atol": 3e-5, "rtol": 1e-5},
49+
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
4650
}
4751

4852

0 commit comments

Comments
 (0)