From 11782aa33afeaf5bd2867da3dde42cf66bf0e87e Mon Sep 17 00:00:00 2001
From: lengmuzhaxi <2690497440@qq.com>
Date: Sat, 10 Jan 2026 20:43:42 +0800
Subject: [PATCH] Implement the softplus, broadcast_to, linear, huber_loss and
 softsign operators

---
 include/infinicore/ops/broadcast_to.hpp | 18 ++
 include/infinicore/ops/huber_loss.hpp | 21 ++
 include/infinicore/ops/linear.hpp | 1 +
 include/infinicore/ops/softplus.hpp | 20 ++
 include/infinicore/ops/softsign.hpp | 16 +
 include/infiniop.h | 5 +-
 include/infiniop/ops/broadcast_to.h | 24 ++
 include/infiniop/ops/huber_loss.h | 28 ++
 include/infiniop/ops/softplus.h | 48 +--
 include/infiniop/ops/softsign.h | 25 ++
 python/infinicore/__init__.py | 2 +
 python/infinicore/nn/functional/__init__.py | 6 +
 python/infinicore/nn/functional/huber_loss.py | 50 ++++
 python/infinicore/nn/functional/linear.py | 3 +
 python/infinicore/nn/functional/softplus.py | 13 +
 python/infinicore/nn/functional/softsign.py | 11 +
 python/infinicore/ops/broadcast_to.py | 11 +
 .../ops/broadcast_to/broadcast_to.cc | 29 ++
 .../ops/broadcast_to/broadcast_to_infiniop.cc | 63 ++++
 src/infinicore/ops/huber_loss/huber_loss.cc | 35 +++
 .../ops/huber_loss/huber_loss_infiniop.cc | 68 +++++
 src/infinicore/ops/softplus/softplus.cc | 34 +++
 .../ops/softplus/softplus_infiniop.cc | 49 ++++
 src/infinicore/ops/softsign/softsign.cc | 27 ++
 .../ops/softsign/softsign_infiniop.cc | 50 ++++
 src/infinicore/pybind11/ops.hpp | 13 +-
 src/infinicore/pybind11/ops/broadcast_to.hpp | 26 ++
 src/infinicore/pybind11/ops/huber_loss.hpp | 44 +++
 src/infinicore/pybind11/ops/softplus.hpp | 30 ++
 src/infinicore/pybind11/ops/softsign.hpp | 24 ++
 src/infiniop/ops/broadcast_to/broadcast_to.h | 48 +++
 .../ops/broadcast_to/cpu/broadcast_to_cpu.cc | 116 ++++++++
 .../ops/broadcast_to/cpu/broadcast_to_cpu.h | 8 +
 src/infiniop/ops/broadcast_to/cuda/kernel.cuh | 53 ++++
 src/infiniop/ops/broadcast_to/info.h | 97 +++++++
 .../broadcast_to/metax/broadcast_to_metax.h | 8 +
 .../metax/broadcast_to_metax.maca | 177 ++++++++++++
 .../broadcast_to/moore/broadcast_to_moore.h | 8 +
 .../broadcast_to/moore/broadcast_to_moore.mu | 122 ++++++++
 .../moore/broadcast_to_moore_kernel.h | 57 ++++
 .../nvidia/broadcast_to_nvidia.cu | 114 ++++++++
 .../nvidia/broadcast_to_nvidia.cuh | 8 +
 src/infiniop/ops/broadcast_to/operator.cc | 176 +++++++++++
 .../ops/huber_loss/cpu/huber_loss_cpu.cc | 140 +++++++++
 .../ops/huber_loss/cpu/huber_loss_cpu.h | 8 +
 src/infiniop/ops/huber_loss/cuda/kernel.cuh | 106 +++++++
 src/infiniop/ops/huber_loss/huber_loss.h | 49 ++++
 src/infiniop/ops/huber_loss/info.h | 79 +++++
 .../ops/huber_loss/metax/huber_loss_metax.h | 8 +
 .../huber_loss/metax/huber_loss_metax.maca | 273 ++++++++++++++++++
 .../ops/huber_loss/moore/huber_loss_moore.h | 8 +
 .../ops/huber_loss/moore/huber_loss_moore.mu | 153 ++++++++++
 .../moore/huber_loss_moore_kernel.h | 141 +++++++++
 .../huber_loss/nvidia/huber_loss_nvidia.cu | 119 ++++++++
 .../huber_loss/nvidia/huber_loss_nvidia.cuh | 8 +
 src/infiniop/ops/huber_loss/operator.cc | 180 ++++++++++++
 src/infiniop/ops/softplus/cpu/softplus_cpu.cc | 121 ++++++--
 src/infiniop/ops/softplus/cpu/softplus_cpu.h | 21 +-
 src/infiniop/ops/softplus/cuda/kernel.cuh | 60 +++-
 src/infiniop/ops/softplus/info.h | 106 +++++++
 .../ops/softplus/metax/softplus_metax.h | 4 +-
 .../ops/softplus/metax/softplus_metax.maca | 204 +++++++++++--
 .../ops/softplus/moore/softplus_moore.h | 8 +
 .../ops/softplus/moore/softplus_moore.mu | 181 ++++++++++++
 .../softplus/moore/softplus_moore_kernel.h | 64 ++++
 .../ops/softplus/nvidia/softplus_nvidia.cu | 176 +++++++++++--
 .../ops/softplus/nvidia/softplus_nvidia.cuh | 10 +-
 src/infiniop/ops/softplus/operator.cc | 41 ++-
 src/infiniop/ops/softplus/softplus.h | 49 ++++
 src/infiniop/ops/softsign/cpu/softsign_cpu.cc | 55 ++++
 src/infiniop/ops/softsign/cpu/softsign_cpu.h | 21 ++
 src/infiniop/ops/softsign/cuda/kernel.cuh | 42 +++
 .../ops/softsign/metax/softsign_metax.h | 8 +
 .../ops/softsign/metax/softsign_metax.maca | 95 ++++++
 .../ops/softsign/moore/softsign_moore.h | 8 +
 .../ops/softsign/moore/softsign_moore.mu | 69 +++++
 .../softsign/moore/softsign_moore_kernel.h | 141 +++++++++
 .../ops/softsign/nvidia/softsign_nvidia.cu | 56 ++++
 .../ops/softsign/nvidia/softsign_nvidia.cuh | 8 +
 src/infiniop/ops/softsign/operator.cc | 201 +++++++++++
 test/infinicore/ops/broadcast_to.py | 5 +-
 test/infinicore/ops/huber_loss.py | 5 +-
 test/infinicore/ops/softplus.py | 5 +-
 test/infinicore/ops/softsign.py | 5 +-
 84 files changed, 4681 insertions(+), 146 deletions(-)
 create mode 100644 include/infinicore/ops/broadcast_to.hpp
 create mode 100644 include/infinicore/ops/huber_loss.hpp
 create mode 100644 include/infinicore/ops/softplus.hpp
 create mode 100644 include/infinicore/ops/softsign.hpp
 create mode 100644 include/infiniop/ops/broadcast_to.h
 create mode 100644 include/infiniop/ops/huber_loss.h
 create mode 100644 include/infiniop/ops/softsign.h
 create mode 100644 python/infinicore/nn/functional/huber_loss.py
 create mode 100644 python/infinicore/nn/functional/softplus.py
 create mode 100644 python/infinicore/nn/functional/softsign.py
 create mode 100644 python/infinicore/ops/broadcast_to.py
 create mode 100644 src/infinicore/ops/broadcast_to/broadcast_to.cc
 create mode 100644 src/infinicore/ops/broadcast_to/broadcast_to_infiniop.cc
 create mode 100644 src/infinicore/ops/huber_loss/huber_loss.cc
 create mode 100644 src/infinicore/ops/huber_loss/huber_loss_infiniop.cc
 create mode 100644 src/infinicore/ops/softplus/softplus.cc
 create mode 100644 src/infinicore/ops/softplus/softplus_infiniop.cc
 create mode 100644 src/infinicore/ops/softsign/softsign.cc
 create mode 100644 src/infinicore/ops/softsign/softsign_infiniop.cc
 create mode 100644 src/infinicore/pybind11/ops/broadcast_to.hpp
 create mode 100644 src/infinicore/pybind11/ops/huber_loss.hpp
 create mode 100644 src/infinicore/pybind11/ops/softplus.hpp
 create mode 100644 src/infinicore/pybind11/ops/softsign.hpp
 create mode 100644 src/infiniop/ops/broadcast_to/broadcast_to.h
 create mode 100644 src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.cc
 create mode 100644 src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.h
 create mode 100644 src/infiniop/ops/broadcast_to/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/broadcast_to/info.h
 create mode 100644 src/infiniop/ops/broadcast_to/metax/broadcast_to_metax.h
 create mode 100644 src/infiniop/ops/broadcast_to/metax/broadcast_to_metax.maca
 create mode 100644 src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.h
 create mode 100644 src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.mu
 create mode 100644 src/infiniop/ops/broadcast_to/moore/broadcast_to_moore_kernel.h
 create mode 100644 src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cu
 create mode 100644 src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cuh
 create mode 100644 src/infiniop/ops/broadcast_to/operator.cc
 create mode 100644 src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.cc
 create mode 100644 src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.h
 create mode 100644 src/infiniop/ops/huber_loss/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/huber_loss/huber_loss.h
 create mode 100644 src/infiniop/ops/huber_loss/info.h
 create mode 100644 src/infiniop/ops/huber_loss/metax/huber_loss_metax.h
 create mode 100644 src/infiniop/ops/huber_loss/metax/huber_loss_metax.maca
 create mode 100644 src/infiniop/ops/huber_loss/moore/huber_loss_moore.h
 create mode 100644 src/infiniop/ops/huber_loss/moore/huber_loss_moore.mu
 create mode 100644 src/infiniop/ops/huber_loss/moore/huber_loss_moore_kernel.h
 create mode 100644 src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cu
 create mode 100644 src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cuh
 create mode 100644 src/infiniop/ops/huber_loss/operator.cc
 create mode 100644 src/infiniop/ops/softplus/info.h
 create mode 100644 src/infiniop/ops/softplus/moore/softplus_moore.h
 create mode 100644 src/infiniop/ops/softplus/moore/softplus_moore.mu
 create mode 100644 src/infiniop/ops/softplus/moore/softplus_moore_kernel.h
 create mode 100644 src/infiniop/ops/softplus/softplus.h
 create mode 100644 src/infiniop/ops/softsign/cpu/softsign_cpu.cc
 create mode 100644 src/infiniop/ops/softsign/cpu/softsign_cpu.h
 create mode 100644 src/infiniop/ops/softsign/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/softsign/metax/softsign_metax.h
 create mode 100644 src/infiniop/ops/softsign/metax/softsign_metax.maca
 create mode 100644 src/infiniop/ops/softsign/moore/softsign_moore.h
 create mode 100644 src/infiniop/ops/softsign/moore/softsign_moore.mu
 create mode 100644 src/infiniop/ops/softsign/moore/softsign_moore_kernel.h
 create mode 100644 src/infiniop/ops/softsign/nvidia/softsign_nvidia.cu
 create mode 100644 src/infiniop/ops/softsign/nvidia/softsign_nvidia.cuh
 create mode 100644 src/infiniop/ops/softsign/operator.cc

diff --git a/include/infinicore/ops/broadcast_to.hpp b/include/infinicore/ops/broadcast_to.hpp
new file mode 100644
index 000000000..a1667043c
--- /dev/null
+++ b/include/infinicore/ops/broadcast_to.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <vector>
+
+namespace infinicore::op {
+class BroadcastTo {
+public:
+    // Schema: Output(y), Input(x)
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor y, Tensor x);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+Tensor broadcast_to(Tensor x, const std::vector<size_t> &shape);
+void broadcast_to_(Tensor y, Tensor x);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/huber_loss.hpp b/include/infinicore/ops/huber_loss.hpp
new file mode 100644
index 000000000..6dac4db35
--- /dev/null
+++ b/include/infinicore/ops/huber_loss.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class HuberLoss {
+public:
+    // Schema: output, input, target, delta, reduction
+    using schema = void (*)(Tensor, Tensor, Tensor, float, int64_t);
+
+    static void execute(Tensor output, Tensor input, Tensor target, float delta, int64_t reduction);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+// delta defaults to 1.0f; reduction defaults to 1 (MEAN)
+Tensor huber_loss(Tensor input, Tensor target, float delta = 1.0f, int64_t reduction = 1);
+void huber_loss_(Tensor output, Tensor input, Tensor target, float delta, int64_t reduction);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/linear.hpp b/include/infinicore/ops/linear.hpp
index d69842be3..56145f51f 100644
--- a/include/infinicore/ops/linear.hpp
+++ b/include/infinicore/ops/linear.hpp
@@ -3,6 +3,7 @@
 #include "common/op.hpp"
 #include <optional>
+
 namespace infinicore::op {
 
 Tensor linear(Tensor input, Tensor weight, std::optional<Tensor> bias);
diff --git a/include/infinicore/ops/softplus.hpp b/include/infinicore/ops/softplus.hpp
new file mode 100644
index 000000000..1e08b32dc
--- /dev/null
+++ b/include/infinicore/ops/softplus.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Softplus {
+public:
+    // The schema carries the extra float beta and float threshold parameters
+    using schema = void (*)(Tensor, Tensor, float, float);
+    static void execute(Tensor y, Tensor x, float beta, float threshold);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+// defaults: beta = 1.0, threshold = 20.0
+Tensor softplus(Tensor x, float beta = 1.0f, float threshold = 20.0f);
+
+void softplus_(Tensor y, Tensor x, float beta = 1.0f, float threshold = 20.0f);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/softsign.hpp b/include/infinicore/ops/softsign.hpp
new file mode 100644
index 000000000..eaaf1eb13
--- /dev/null
+++ b/include/infinicore/ops/softsign.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Softsign {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor y, Tensor x);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+// Functional interface that returns a new Tensor
+Tensor softsign(Tensor x);
+void softsign_(Tensor y, Tensor x);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infiniop.h b/include/infiniop.h
index c0a09fcb4..e9d850143 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -27,13 +27,16 @@
 #include "infiniop/ops/sigmoid.h"
 #include "infiniop/ops/silu.h"
 #include "infiniop/ops/softmax.h"
-#include "infiniop/ops/softplus.h"
 #include "infiniop/ops/sub.h"
 #include "infiniop/ops/swiglu.h"
 #include "infiniop/ops/tanh.h"
 #include "infiniop/ops/topkrouter.h"
 #include "infiniop/ops/topksoftmax.h"
 #include "infiniop/ops/zeros.h"
+#include "infiniop/ops/broadcast_to.h"
+#include "infiniop/ops/softplus.h"
+#include "infiniop/ops/softsign.h"
+#include "infiniop/ops/huber_loss.h"
 #include "infiniop/tensor_descriptor.h"
 
 #endif // __INFINIOP_API_H__
diff --git a/include/infiniop/ops/broadcast_to.h b/include/infiniop/ops/broadcast_to.h
new file mode 100644
index 000000000..66612d7cc
--- /dev/null
+++ b/include/infiniop/ops/broadcast_to.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_BROADCAST_TO_API_H__
+#define __INFINIOP_BROADCAST_TO_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopBroadcastToDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateBroadcastToDescriptor(infiniopHandle_t handle,
+                                                                infiniopBroadcastToDescriptor_t *desc_ptr,
+                                                                infiniopTensorDescriptor_t y,
+                                                                infiniopTensorDescriptor_t x);
+
+__C __export infiniStatus_t infiniopGetBroadcastToWorkspaceSize(infiniopBroadcastToDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopBroadcastTo(infiniopBroadcastToDescriptor_t desc,
+                                                void *workspace,
+                                                size_t workspace_size,
+                                                void *y,
+                                                const void *x,
+                                                void *stream);
+
+__C __export infiniStatus_t infiniopDestroyBroadcastToDescriptor(infiniopBroadcastToDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/huber_loss.h b/include/infiniop/ops/huber_loss.h
new file mode 100644
index 000000000..bb6a22ec6
--- /dev/null
+++ b/include/infiniop/ops/huber_loss.h
@@ -0,0 +1,28 @@
+#ifndef __INFINIOP_HUBER_LOSS_API_H__
+#define __INFINIOP_HUBER_LOSS_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopHuberLossDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateHuberLossDescriptor(infiniopHandle_t handle,
+                                                              infiniopHuberLossDescriptor_t *desc_ptr,
+                                                              infiniopTensorDescriptor_t output,
+                                                              infiniopTensorDescriptor_t input,
+                                                              infiniopTensorDescriptor_t target,
+                                                              float delta,
+                                                              int reduction);
+
+__C __export infiniStatus_t infiniopGetHuberLossWorkspaceSize(infiniopHuberLossDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopHuberLoss(infiniopHuberLossDescriptor_t desc,
+                                              void *workspace,
+                                              size_t workspace_size,
+                                              void *output,
+                                              const void *input,
+                                              const void *target,
+                                              void *stream);
+
+__C __export infiniStatus_t infiniopDestroyHuberLossDescriptor(infiniopHuberLossDescriptor_t desc);
+
+#endif // __INFINIOP_HUBER_LOSS_API_H__
\ No newline at end of file
diff --git a/include/infiniop/ops/softplus.h b/include/infiniop/ops/softplus.h
index 408452ddd..5863e6a6c 100644
--- a/include/infiniop/ops/softplus.h
+++ b/include/infiniop/ops/softplus.h
@@ -1,24 +1,38 @@
-#ifndef __INFINIOP_SOFTPLUS_API_H__
-#define __INFINIOP_SOFTPLUS_API_H__
+#ifndef __INFINIOP_OPS_SOFTPLUS_H__
+#define __INFINIOP_OPS_SOFTPLUS_H__
+#include "../tensor_descriptor.h"
 
-#include "../operator_descriptor.h"
-
-typedef struct InfiniopDescriptor *infiniopSoftplusDescriptor_t;
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-__C __export infiniStatus_t infiniopCreateSoftplusDescriptor(infiniopHandle_t handle,
-                                                             infiniopSoftplusDescriptor_t *desc_ptr,
-                                                             infiniopTensorDescriptor_t y,
-                                                             infiniopTensorDescriptor_t x);
+typedef struct InfiniopSoftplusDescriptor *infiniopSoftplusDescriptor_t;
+__C __export infiniStatus_t infiniopCreateSoftplusDescriptor(
+    infiniopHandle_t handle,
+    infiniopSoftplusDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    float beta,
+    float threshold);
 
-__C __export infiniStatus_t infiniopGetSoftplusWorkspaceSize(infiniopSoftplusDescriptor_t desc, size_t *size);
+__C __export infiniStatus_t infiniopGetSoftplusWorkspaceSize(
+    infiniopSoftplusDescriptor_t desc,
+    size_t *size);
 
-__C __export infiniStatus_t infiniopSoftplus(infiniopSoftplusDescriptor_t desc,
-                                             void *workspace,
-                                             size_t workspace_size,
-                                             void *y,
-                                             const void *x,
-                                             void *stream);
+__C __export infiniStatus_t infiniopSoftplus(
+    infiniopSoftplusDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream);
 
-__C __export infiniStatus_t infiniopDestroySoftplusDescriptor(infiniopSoftplusDescriptor_t desc);
+__C __export infiniStatus_t infiniopDestroySoftplusDescriptor(
+    infiniopSoftplusDescriptor_t desc);
+#ifdef __cplusplus
+}
 #endif
+
+#endif // __INFINIOP_OPS_SOFTPLUS_H__
\ No newline at end of file
diff --git a/include/infiniop/ops/softsign.h b/include/infiniop/ops/softsign.h
new file mode 100644
index 000000000..6239fbbc6
--- /dev/null
+++ b/include/infiniop/ops/softsign.h
@@ -0,0 +1,25 @@
+
+#ifndef __INFINIOP_SOFTSIGN_API_H__
+#define __INFINIOP_SOFTSIGN_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopSoftsignDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateSoftsignDescriptor(infiniopHandle_t handle,
+                                                             infiniopSoftsignDescriptor_t *desc_ptr,
+                                                             infiniopTensorDescriptor_t y,
+                                                             infiniopTensorDescriptor_t x);
+
+__C __export infiniStatus_t infiniopGetSoftsignWorkspaceSize(infiniopSoftsignDescriptor_t desc, size_t *size);
+__C __export infiniStatus_t infiniopSoftsign(infiniopSoftsignDescriptor_t desc,
+                                             void *workspace,
+                                             size_t workspace_size,
+                                             void *y,
+                                             const void *x,
+                                             void *stream);
+
+__C __export infiniStatus_t infiniopDestroySoftsignDescriptor(infiniopSoftsignDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index b7288f3ac..fa92854ab 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -51,6 +51,7 @@
 from infinicore.ops.rearrange import rearrange
 from infinicore.ops.squeeze import squeeze
 from infinicore.ops.unsqueeze import unsqueeze
+from infinicore.ops.broadcast_to import broadcast_to
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -125,6 +126,7 @@
     "paged_attention",
     "paged_attention_prefill",
     "ones",
+    "broadcast_to",
     "strided_empty",
     "strided_from_blob",
     "zeros",
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..c28c8a8aa 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -5,6 +5,9 @@
 from .rms_norm import rms_norm
 from .rope import RopeAlgo, rope
 from .silu import silu
+from .softplus import softplus
+from .softsign import softsign
+from .huber_loss import huber_loss
 from .swiglu import swiglu
 
 __all__ = [
@@ -17,4 +20,7 @@
     "embedding",
     "rope",
     "RopeAlgo",
+    "softplus",
+    "softsign",
+    "huber_loss",
 ]
diff --git a/python/infinicore/nn/functional/huber_loss.py b/python/infinicore/nn/functional/huber_loss.py
new file mode 100644
index 000000000..a9845e4b6
--- /dev/null
+++ b/python/infinicore/nn/functional/huber_loss.py
@@ -0,0 +1,50 @@
+from typing import Optional
+
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+_REDUCTION_MODES = {
+    "none": 0,
+    "mean": 1,
+    "sum": 2,
+}
+
+
+def huber_loss(
+    input: Tensor,
+    target: Tensor,
+    delta: float = 1.0,
+    reduction: str = "mean",
+    *,
+    out: Optional[Tensor] = None,
+) -> Tensor:
+    r"""Computes the Huber loss, which uses a squared term if the absolute
+    element-wise error falls below delta and a delta-scaled L1 term otherwise.
+    """
+
+    if not input.is_contiguous():
+        input = input.contiguous()
+    if not target.is_contiguous():
+        target = target.contiguous()
+
+    # Resolve the reduction mode
+    if reduction not in _REDUCTION_MODES:
+        raise ValueError(f"{reduction} is not a valid value for reduction")
+    reduction_val = _REDUCTION_MODES[reduction]
+
+    if out is not None:
+        _infinicore.huber_loss_(
+            out._underlying,
+            input._underlying,
+            target._underlying,
+            delta,
+            reduction_val,
+        )
+        return out
+
+    return Tensor(
+        _infinicore.huber_loss(
+            input._underlying,
+            target._underlying,
+            delta,
+            reduction_val,
+        )
+    )
\ No newline at end of file
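Note: the reduction mapping above follows the PyTorch convention ("none"=0, "mean"=1, "sum"=2). A minimal NumPy reference for the element-wise math that the backends below are expected to match (illustrative only, not part of this patch):

    import numpy as np

    def huber_loss_ref(input, target, delta=1.0, reduction="mean"):
        # Quadratic branch inside |error| <= delta, delta-scaled L1 branch outside.
        err = np.abs(input - target)
        loss = np.where(err <= delta, 0.5 * err * err, delta * (err - 0.5 * delta))
        if reduction == "none":
            return loss  # same shape as the input
        return loss.mean() if reduction == "mean" else loss.sum()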
diff --git a/python/infinicore/nn/functional/linear.py b/python/infinicore/nn/functional/linear.py
index 22ab9b1ef..f77953aae 100644
--- a/python/infinicore/nn/functional/linear.py
+++ b/python/infinicore/nn/functional/linear.py
@@ -1,6 +1,9 @@
 from infinicore.lib import _infinicore
 from infinicore.tensor import Tensor
 
+__all__ = ["linear"]
+
+
 def linear(input: Tensor, weight: Tensor, bias=None, *, out=None) -> Tensor:
     r"""Applies a linear transformation to the incoming data: y = xA^T + b."""
 
diff --git a/python/infinicore/nn/functional/softplus.py b/python/infinicore/nn/functional/softplus.py
new file mode 100644
index 000000000..d393e1646
--- /dev/null
+++ b/python/infinicore/nn/functional/softplus.py
@@ -0,0 +1,13 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def softplus(input, beta=1, threshold=20, *, out=None):
+    if out is None:
+        # Forward beta and threshold to the underlying C++ implementation
+        return Tensor(_infinicore.softplus(input._underlying, beta, threshold))
+
+    # Explicit-output variant: forward beta and threshold as well
+    _infinicore.softplus_(out._underlying, input._underlying, beta, threshold)
+
+    return out
\ No newline at end of file
diff --git a/python/infinicore/nn/functional/softsign.py b/python/infinicore/nn/functional/softsign.py
new file mode 100644
index 000000000..47a94a34e
--- /dev/null
+++ b/python/infinicore/nn/functional/softsign.py
@@ -0,0 +1,11 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def softsign(input, *, out=None):
+    if out is None:
+        return Tensor(_infinicore.softsign(input._underlying))
+
+    _infinicore.softsign_(out._underlying, input._underlying)
+
+    return out
\ No newline at end of file
diff --git a/python/infinicore/ops/broadcast_to.py b/python/infinicore/ops/broadcast_to.py
new file mode 100644
index 000000000..203dc8498
--- /dev/null
+++ b/python/infinicore/ops/broadcast_to.py
@@ -0,0 +1,11 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+# The parameter is named 'size' to match the test harness, which calls
+# broadcast_to(...) with kwargs={"size": ...}.
+def broadcast_to(input, size, *, out=None):
+    if out is None:
+        return Tensor(_infinicore.broadcast_to(input._underlying, size))
+
+    _infinicore.broadcast_to_(out._underlying, input._underlying)
+
+    return out
\ No newline at end of file
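Note: the operator materializes the view that NumPy builds lazily, i.e. shapes are right-aligned and size-1 axes are read through stride 0. A quick model of the expected semantics (illustrative):

    import numpy as np

    x = np.arange(12, dtype=np.float32).reshape(3, 4)
    y = np.broadcast_to(x, (2, 3, 4))  # NumPy keeps this as a stride-0 view
    assert y.shape == (2, 3, 4)
    assert (y[0] == y[1]).all()        # both "copies" read the same input rows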
diff --git a/src/infinicore/ops/broadcast_to/broadcast_to.cc b/src/infinicore/ops/broadcast_to/broadcast_to.cc
new file mode 100644
index 000000000..a96123aa0
--- /dev/null
+++ b/src/infinicore/ops/broadcast_to/broadcast_to.cc
@@ -0,0 +1,29 @@
+#include "infinicore/ops/broadcast_to.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<BroadcastTo::schema> &BroadcastTo::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void BroadcastTo::execute(Tensor y, Tensor x) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
+    infinicore::context::setDevice(y->device());
+    dispatcher().lookup(y->device().getType())(y, x);
+}
+
+Tensor broadcast_to(Tensor x, const std::vector<size_t> &shape) {
+    Shape target_shape(shape.begin(), shape.end());
+
+    auto y = Tensor::empty(target_shape, x->dtype(), x->device());
+    broadcast_to_(y, x);
+    return y;
+}
+
+void broadcast_to_(Tensor y, Tensor x) {
+    BroadcastTo::execute(y, x);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/broadcast_to/broadcast_to_infiniop.cc b/src/infinicore/ops/broadcast_to/broadcast_to_infiniop.cc
new file mode 100644
index 000000000..c21c83797
--- /dev/null
+++ b/src/infinicore/ops/broadcast_to/broadcast_to_infiniop.cc
@@ -0,0 +1,63 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/broadcast_to.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::broadcast_to_impl::infiniop {
+
+// Descriptor cache
+thread_local common::OpCache<size_t, infiniopBroadcastToDescriptor_t> caches(
+    100, // capacity
+    [](infiniopBroadcastToDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyBroadcastToDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor y, Tensor x) {
+    size_t seed = hash_combine(y, x);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopBroadcastToDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        // Create the descriptor on a cache miss
+        INFINICORE_CHECK_ERROR(infiniopCreateBroadcastToDescriptor(
+            context::getInfiniopHandle(device),
+            &desc,
+            y->desc(),
+            x->desc()));
+
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // Query the workspace size, then run the kernel
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetBroadcastToWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopBroadcastTo(
+        desc,
+        workspace->data(),
+        workspace_size,
+        y->data(),
+        x->data(),
+        context::getStream()));
+}
+
+// Register this implementation with the dispatcher
+static bool registered = []() {
+    BroadcastTo::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::broadcast_to_impl::infiniop
\ No newline at end of file
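Note: the thread_local caches above avoid re-creating an infiniop descriptor on every call; the call signature is hashed into a seed, and descriptors are evicted LRU-style once the capacity is exceeded. A Python sketch of that pattern, inferred from the usage here (the real OpCache lives in infinicore/ops/common/cache.hpp):

    from collections import OrderedDict

    class OpCacheSketch:
        """LRU map from seed hash to descriptor, destroying evicted entries."""

        def __init__(self, capacity=100, on_evict=lambda desc: None):
            self.capacity, self.on_evict = capacity, on_evict
            self.items = OrderedDict()

        def get(self, seed):
            if seed not in self.items:
                return None
            self.items.move_to_end(seed)  # mark as most recently used
            return self.items[seed]

        def put(self, seed, desc):
            self.items[seed] = desc
            self.items.move_to_end(seed)
            if len(self.items) > self.capacity:
                _, evicted = self.items.popitem(last=False)
                self.on_evict(evicted)  # mirrors infiniopDestroy*Descriptor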
diff --git a/src/infinicore/ops/huber_loss/huber_loss.cc b/src/infinicore/ops/huber_loss/huber_loss.cc
new file mode 100644
index 000000000..38e442c1d
--- /dev/null
+++ b/src/infinicore/ops/huber_loss/huber_loss.cc
@@ -0,0 +1,35 @@
+#include "infinicore/ops/huber_loss.hpp"
+
+namespace infinicore::op {
+
+// Dispatcher singleton
+common::OpDispatcher<HuberLoss::schema> &HuberLoss::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void HuberLoss::execute(Tensor output, Tensor input, Tensor target, float delta, int64_t reduction) {
+    dispatcher().lookup(context::getDevice().getType())(output, input, target, delta, reduction);
+}
+
+// Functional interface
+Tensor huber_loss(Tensor input, Tensor target, float delta, int64_t reduction) {
+    Shape output_shape;
+    if (reduction == 0) { // none
+        // Huber loss is element-wise; with reduction='none' the output keeps the input shape
+        output_shape = input->shape();
+    } else {
+        output_shape = {}; // scalar
+    }
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+
+    huber_loss_(output, input, target, delta, reduction);
+    return output;
+}
+
+void huber_loss_(Tensor output, Tensor input, Tensor target, float delta, int64_t reduction) {
+    HuberLoss::execute(output, input, target, delta, reduction);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/huber_loss/huber_loss_infiniop.cc b/src/infinicore/ops/huber_loss/huber_loss_infiniop.cc
new file mode 100644
index 000000000..9d04bfb30
--- /dev/null
+++ b/src/infinicore/ops/huber_loss/huber_loss_infiniop.cc
@@ -0,0 +1,68 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/huber_loss.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::huber_loss_impl::infiniop {
+
+// Descriptor cache
+thread_local common::OpCache<size_t, infiniopHuberLossDescriptor_t> caches(
+    100, // capacity
+    [](infiniopHuberLossDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyHuberLossDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, Tensor target, float delta, int64_t reduction) {
+    size_t seed = hash_combine(output, input, target, delta, reduction);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopHuberLossDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        // Create the descriptor on a cache miss
+        INFINICORE_CHECK_ERROR(infiniopCreateHuberLossDescriptor(
+            context::getInfiniopHandle(output->device()),
+            &desc,
+            output->desc(),
+            input->desc(),
+            target->desc(),
+            delta,
+            static_cast<int>(reduction)));
+
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // Query the workspace size, then run the kernel
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetHuberLossWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopHuberLoss(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        target->data(),
+        context::getStream()));
+}
+
+static bool registered = []() {
+    HuberLoss::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::huber_loss_impl::infiniop
\ No newline at end of file
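Note: as a sanity check of the shape logic above (reduction 0 keeps the input shape, 1 and 2 reduce to a scalar), a worked numeric example with delta = 1.0 (illustrative):

    import numpy as np

    inp = np.array([0.0, 2.0], dtype=np.float32)
    tgt = np.array([0.5, 0.0], dtype=np.float32)
    # |err| = [0.5, 2.0]; 0.5 takes the quadratic branch: 0.5 * 0.5**2 = 0.125,
    # 2.0 takes the linear branch: 1.0 * (2.0 - 0.5) = 1.5
    err = np.abs(inp - tgt)
    loss = np.where(err <= 1.0, 0.5 * err * err, err - 0.5)
    assert np.allclose(loss, [0.125, 1.5])   # reduction = 0 ("none")
    assert np.isclose(loss.mean(), 0.8125)   # reduction = 1 ("mean")
    assert np.isclose(loss.sum(), 1.625)     # reduction = 2 ("sum")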
diff --git a/src/infinicore/ops/softplus/softplus.cc b/src/infinicore/ops/softplus/softplus.cc
new file mode 100644
index 000000000..c3e6c6e83
--- /dev/null
+++ b/src/infinicore/ops/softplus/softplus.cc
@@ -0,0 +1,34 @@
+#include "infinicore/ops/softplus.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<Softplus::schema> &Softplus::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void Softplus::execute(Tensor y, Tensor x, float beta, float threshold) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
+    infinicore::context::setDevice(y->device());
+
+    // Forward beta and threshold to the backend implementation (infiniop wrapper)
+    dispatcher().lookup(y->device().getType())(y, x, beta, threshold);
+}
+
+Tensor softplus(Tensor x, float beta, float threshold) {
+    auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
+    softplus_(y, x, beta, threshold);
+    return y;
+}
+
+void softplus_(Tensor y, Tensor x, float beta, float threshold) {
+    Softplus::execute(y, x, beta, threshold);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/softplus/softplus_infiniop.cc b/src/infinicore/ops/softplus/softplus_infiniop.cc
new file mode 100644
index 000000000..7a62f5ffa
--- /dev/null
+++ b/src/infinicore/ops/softplus/softplus_infiniop.cc
@@ -0,0 +1,49 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/softplus.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::softplus_impl::infiniop {
+thread_local common::OpCache<size_t, infiniopSoftplusDescriptor_t> caches(
+    100, // capacity
+    [](infiniopSoftplusDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroySoftplusDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+void calculate(Tensor y, Tensor x, float beta, float threshold) {
+    size_t seed = hash_combine(y, x, beta, threshold);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopSoftplusDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateSoftplusDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            y->desc(), x->desc(), beta, threshold));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // Query and allocate the workspace
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetSoftplusWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    // Run the computation
+    INFINICORE_CHECK_ERROR(infiniopSoftplus(
+        desc, workspace->data(), workspace_size,
+        y->data(), x->data(), context::getStream()));
+}
+static bool registered = []() {
+    Softplus::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::softplus_impl::infiniop
\ No newline at end of file
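Note: the beta/threshold plumbing follows the PyTorch convention; once beta*x exceeds the threshold, softplus falls back to the identity so exp() cannot overflow. A reference sketch of the math the kernels are expected to implement (illustrative):

    import numpy as np

    def softplus_ref(x, beta=1.0, threshold=20.0):
        z = beta * x
        # log1p(exp(z)) / beta below the threshold, plain x above it.
        return np.where(z > threshold, x,
                        np.log1p(np.exp(np.minimum(z, threshold))) / beta)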
diff --git a/src/infinicore/ops/softsign/softsign.cc b/src/infinicore/ops/softsign/softsign.cc
new file mode 100644
index 000000000..3a4d3c69e
--- /dev/null
+++ b/src/infinicore/ops/softsign/softsign.cc
@@ -0,0 +1,27 @@
+#include "infinicore/ops/softsign.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<Softsign::schema> &Softsign::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void Softsign::execute(Tensor y, Tensor x) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
+    infinicore::context::setDevice(y->device());
+    dispatcher().lookup(y->device().getType())(y, x);
+}
+
+Tensor softsign(Tensor x) {
+    auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
+    softsign_(y, x);
+    return y;
+}
+
+void softsign_(Tensor y, Tensor x) {
+    Softsign::execute(y, x);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/softsign/softsign_infiniop.cc b/src/infinicore/ops/softsign/softsign_infiniop.cc
new file mode 100644
index 000000000..c7ea317e1
--- /dev/null
+++ b/src/infinicore/ops/softsign/softsign_infiniop.cc
@@ -0,0 +1,50 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/softsign.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::softsign_impl::infiniop {
+thread_local common::OpCache<size_t, infiniopSoftsignDescriptor_t> caches(
+    100, // capacity
+    [](infiniopSoftsignDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroySoftsignDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor y, Tensor x) {
+    size_t seed = hash_combine(y, x);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopSoftsignDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateSoftsignDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            y->desc(), x->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // Query the workspace size
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetSoftsignWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    // Run the computation
+    INFINICORE_CHECK_ERROR(infiniopSoftsign(
+        desc, workspace->data(), workspace_size,
+        y->data(), x->data(), context::getStream()));
+}
+static bool registered = []() {
+    Softsign::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::softsign_impl::infiniop
\ No newline at end of file
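Note: for reference, softsign is y = x / (1 + |x|), a smooth squashing onto (-1, 1) that, unlike tanh, needs no exponential. Equivalent NumPy sketch (illustrative):

    import numpy as np

    def softsign_ref(x):
        return x / (1.0 + np.abs(x))

    assert np.allclose(softsign_ref(np.array([-4.0, 0.0, 4.0])), [-0.8, 0.0, 0.8])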
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 431c3a37b..4caf293eb 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -7,7 +7,6 @@
 #include "ops/attention.hpp"
 #include "ops/causal_softmax.hpp"
 #include "ops/embedding.hpp"
-#include "ops/linear.hpp"
 #include "ops/matmul.hpp"
 #include "ops/mul.hpp"
 #include "ops/paged_attention.hpp"
@@ -18,7 +17,11 @@
 #include "ops/rope.hpp"
 #include "ops/silu.hpp"
 #include "ops/swiglu.hpp"
-
+#include "ops/softsign.hpp"
+#include "ops/softplus.hpp"
+#include "ops/broadcast_to.hpp"
+#include "ops/linear.hpp"
+#include "ops/huber_loss.hpp"
 namespace py = pybind11;
 
 namespace infinicore::ops {
@@ -29,7 +32,6 @@
     bind_attention(m);
     bind_causal_softmax(m);
     bind_random_sample(m);
-    bind_linear(m);
     bind_matmul(m);
     bind_mul(m);
     bind_paged_attention(m);
@@ -39,6 +41,11 @@
     bind_silu(m);
     bind_swiglu(m);
     bind_rope(m);
+    bind_broadcast_to(m);
+    bind_softplus(m);
+    bind_softsign(m);
+    bind_linear(m);
+    bind_huber_loss(m);
     bind_embedding(m);
 }
 
diff --git a/src/infinicore/pybind11/ops/broadcast_to.hpp b/src/infinicore/pybind11/ops/broadcast_to.hpp
new file mode 100644
index 000000000..ca2832270
--- /dev/null
+++ b/src/infinicore/pybind11/ops/broadcast_to.hpp
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "infinicore/ops/broadcast_to.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_broadcast_to(py::module &m) {
+    m.def("broadcast_to",
+          &op::broadcast_to,
+          py::arg("x"),
+          py::arg("shape"),
+          R"doc(Broadcast a tensor to the target shape.)doc");
+
+    m.def("broadcast_to_",
+          &op::broadcast_to_,
+          py::arg("y"),
+          py::arg("x"),
+          R"doc(Explicit-output broadcast; writes the result into y.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/huber_loss.hpp b/src/infinicore/pybind11/ops/huber_loss.hpp
new file mode 100644
index 000000000..507b33254
--- /dev/null
+++ b/src/infinicore/pybind11/ops/huber_loss.hpp
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+#include "infinicore/ops/huber_loss.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_huber_loss(py::module &m) {
+    m.def("huber_loss",
+          [](const Tensor &input, const Tensor &target, float delta, int reduction) {
+              return op::huber_loss(input, target, delta, reduction);
+          },
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("delta") = 1.0f,
+          py::arg("reduction") = 1,
+          R"doc(Computes the Huber loss between input and target.
+
+          Args:
+              input (Tensor): Input tensor of arbitrary shape.
+              target (Tensor): Ground-truth values, same shape as input.
+              delta (float, optional): Threshold at which to switch between the delta-scaled L1 and the L2 loss. Default: 1.0.
+              reduction (int, optional): Reduction applied to the output: 0=none, 1=mean, 2=sum. Default: 1.
+          )doc");
+
+    // Bind the explicit-output interface (huber_loss_)
+    m.def("huber_loss_",
+          [](Tensor &output, const Tensor &input, const Tensor &target, float delta, int reduction) {
+              // Forward to the C++ implementation
+              op::huber_loss_(output, input, target, delta, reduction);
+          },
+          py::arg("output"),
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("delta") = 1.0f,
+          py::arg("reduction") = 1,
+          R"doc(Explicit-output Huber loss; writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/softplus.hpp b/src/infinicore/pybind11/ops/softplus.hpp
new file mode 100644
index 000000000..cb7c6d568
--- /dev/null
+++ b/src/infinicore/pybind11/ops/softplus.hpp
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/softplus.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_softplus(py::module &m) {
+    // Functional interface: returns a new Tensor
+    m.def("softplus",
+          &op::softplus,
+          py::arg("x"),
+          py::arg("beta") = 1.0f,
+          py::arg("threshold") = 20.0f,
+          R"doc(Computes the softplus function element-wise: y = 1/beta * log(1 + exp(beta * x)).)doc");
+
+    // Explicit-output interface: writes to the provided output Tensor
+    m.def("softplus_",
+          &op::softplus_,
+          py::arg("y"),
+          py::arg("x"),
+          py::arg("beta") = 1.0f,
+          py::arg("threshold") = 20.0f,
+          R"doc(Explicit-output softplus activation. Writes the result into y.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/softsign.hpp b/src/infinicore/pybind11/ops/softsign.hpp
new file mode 100644
index 000000000..1191abe02
--- /dev/null
+++ b/src/infinicore/pybind11/ops/softsign.hpp
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/softsign.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_softsign(py::module &m) {
+    m.def("softsign",
+          &op::softsign,
+          py::arg("x"),
+          R"doc(Softsign activation function.)doc");
+
+    m.def("softsign_",
+          &op::softsign_,
+          py::arg("y"),
+          py::arg("x"),
+          R"doc(Explicit-output softsign activation.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infiniop/ops/broadcast_to/broadcast_to.h b/src/infiniop/ops/broadcast_to/broadcast_to.h
new file mode 100644
index 000000000..39e2c3090
--- /dev/null
+++ b/src/infiniop/ops/broadcast_to/broadcast_to.h
@@ -0,0 +1,48 @@
+#ifndef __BROADCAST_TO_H__
+#define __BROADCAST_TO_H__
+
+#include "../../operator.h"
+#include "info.h" // provides the BroadcastToInfo definition
+#include <vector>
+
+// Macro that generates the Descriptor class in each backend namespace
+#define DESCRIPTOR(NAMESPACE)                                                \
+    namespace op::broadcast_to::NAMESPACE {                                  \
+    class Descriptor final : public InfiniopDescriptor {                     \
+        struct Opaque;                                                       \
+        Opaque *_opaque;                                                     \
+        BroadcastToInfo _info;                                               \
+        size_t _workspace_size;                                              \
+                                                                             \
+        Descriptor(                                                          \
+            Opaque *opaque,                                                  \
+            BroadcastToInfo info,                                            \
+            size_t workspace_size,                                           \
+            infiniDevice_t device_type,                                      \
+            int device_id)                                                   \
+            : InfiniopDescriptor{device_type, device_id},                    \
+              _opaque(opaque),                                               \
+              _info(info),                                                   \
+              _workspace_size(workspace_size) {}                             \
+                                                                             \
+    public:                                                                  \
+        ~Descriptor();                                                       \
+                                                                             \
+        size_t workspaceSize() const { return _workspace_size; }             \
+                                                                             \
+        static infiniStatus_t create(                                        \
+            infiniopHandle_t handle,                                         \
+            Descriptor **desc_ptr,                                           \
+            infiniopTensorDescriptor_t out_desc,                             \
+            const std::vector<infiniopTensorDescriptor_t> &input_descs);     \
+                                                                             \
+        infiniStatus_t calculate(                                            \
+            void *workspace,                                                 \
+            size_t workspace_size,                                           \
+            void *output,                                                    \
+            const std::vector<const void *> &inputs,                         \
+            void *stream) const;                                             \
+    };                                                                       \
+    }
+
+#endif // __BROADCAST_TO_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.cc b/src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.cc
new file mode 100644
index 000000000..2c09fecc8
--- /dev/null
+++ b/src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.cc
@@ -0,0 +1,116 @@
+#include "broadcast_to_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <cstddef>
+#include <cstdint>
+
+#include "../../../../utils/custom_types.h"
+
+namespace op::broadcast_to::cpu {
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+        _opaque = nullptr;
+    }
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    const std::vector<infiniopTensorDescriptor_t> &input_descs) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto result = BroadcastToInfo::create(out_desc, input_descs);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0, // the CPU implementation needs no workspace
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename T>
+void calculate_cpu_impl(
+    const BroadcastToInfo &info,
+    void *output,
+    const void *input) {
+
+    size_t count = info.count();
+    int ndim = info.ndim();
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
+
+    // Walk every output element in parallel
+#pragma omp parallel for schedule(static)
+    for (size_t i = 0; i < count; ++i) {
+        size_t temp_idx = i;
+        size_t input_offset = 0;
+
+        // Index transform: output linear index -> coordinate -> input linear offset
+        for (int d = 0; d < ndim; ++d) {
+            size_t out_stride = info._out_strides[d];
+            size_t coord = temp_idx / out_stride;
+            temp_idx %= out_stride;
+            input_offset += coord * info._in_strides[d];
+        }
+
+        // Copy the element
+        out_ptr[i] = in_ptr[input_offset];
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const std::vector<const void *> &inputs,
+    void *stream) const {
+
+    if (inputs.size() != 1) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+    const void *input = inputs[0];
+    auto dtype = _info.dtype();
+
+    switch (dtype) {
+    case INFINI_DTYPE_F32:
+        cpu::calculate_cpu_impl<float>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F64:
+        cpu::calculate_cpu_impl<double>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F16:
+        cpu::calculate_cpu_impl<fp16_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_BF16:
+        cpu::calculate_cpu_impl<bf16_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_I64:
+        cpu::calculate_cpu_impl<int64_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_I32:
+        cpu::calculate_cpu_impl<int32_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_U8:
+        cpu::calculate_cpu_impl<uint8_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_I8:
+        cpu::calculate_cpu_impl<int8_t>(_info, output, input);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::broadcast_to::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.h b/src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.h
new file mode 100644
index 000000000..2edb89442
--- /dev/null
+++ b/src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __BROADCAST_TO_CPU_H__
+#define __BROADCAST_TO_CPU_H__
+
+#include "../broadcast_to.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __BROADCAST_TO_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/broadcast_to/cuda/kernel.cuh b/src/infiniop/ops/broadcast_to/cuda/kernel.cuh
new file mode 100644
index 000000000..637f5ea7c
--- /dev/null
+++ b/src/infiniop/ops/broadcast_to/cuda/kernel.cuh
@@ -0,0 +1,53 @@
+#ifndef __BROADCAST_TO_CUDA_CUH__
+#define __BROADCAST_TO_CUDA_CUH__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace op::broadcast_to::cuda {
+
+// Maximum rank; must match the value in BroadcastToInfo
+static constexpr int MAX_DIM = 8;
+struct BroadcastStrides {
+    int64_t out_strides[MAX_DIM];
+    int64_t in_strides[MAX_DIM];
+};
+template <typename T>
+__global__ void broadcast_kernel(
+    T *__restrict__ output,      // Output data pointer
+    const T *__restrict__ input, // Input data pointer
+    int ndim,
+    size_t count,                // Total elements in output
+    BroadcastStrides strides) {  // Strides passed by value
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < count) {
+        size_t temp_idx = idx;
+        size_t input_offset = 0;
+
+        // Decompose the flat index into coordinates and accumulate the input offset
+#pragma unroll
+        for (int i = 0; i < MAX_DIM; ++i) {
+            if (i >= ndim) break;
+
+            int64_t out_s = strides.out_strides[i];
+            int64_t in_s = strides.in_strides[i];
+            size_t coord = temp_idx / out_s;
+            temp_idx %= out_s;
+            input_offset += coord * in_s;
+        }
+
+        output[idx] = input[input_offset];
+    }
+}
+
+} // namespace op::broadcast_to::cuda
+
+#endif // __BROADCAST_TO_CUDA_CUH__
\ No newline at end of file
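Note: every thread above runs the same flat-index decomposition; divide by the output stride to get the coordinate in each dimension, then re-project through the (possibly zero) input stride. A Python model of one thread's address computation, assuming a contiguous row-major output (illustrative):

    def input_offset(idx, out_strides, in_strides):
        off = 0
        for out_s, in_s in zip(out_strides, in_strides):
            coord, idx = divmod(idx, out_s)  # coordinate along this dimension
            off += coord * in_s              # stride 0 pins broadcast dims to index 0
        return off

    # Output (2, 3) with strides (3, 1); input (1, 3) aligned to strides (0, 1):
    assert input_offset(4, (3, 1), (0, 1)) == 1  # output[1][1] reads input[0][1]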
diff --git a/src/infiniop/ops/broadcast_to/info.h b/src/infiniop/ops/broadcast_to/info.h
new file mode 100644
index 000000000..130480c6c
--- /dev/null
+++ b/src/infiniop/ops/broadcast_to/info.h
@@ -0,0 +1,97 @@
+#ifndef __BROADCAST_TO_INFO_H__
+#define __BROADCAST_TO_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <vector>
+#include <algorithm> // for std::max
+
+namespace op::broadcast_to {
+
+class BroadcastToInfo {
+    BroadcastToInfo() = default;
+
+public:
+    static constexpr int MAX_DIM = 8; // maximum rank, so fixed-size arrays can be used
+
+    int _dtype;
+    int _ndim; // unified rank (equals the output rank)
+    size_t _count;
+
+    // Aligned layout information consumed by the kernels
+    int64_t _out_shape[MAX_DIM];
+    int64_t _out_strides[MAX_DIM];
+    int64_t _in_shape[MAX_DIM];   // input shape, right-aligned and padded with 1s
+    int64_t _in_strides[MAX_DIM]; // input strides with broadcast dims set to stride 0
+
+    int dtype() const { return _dtype; }
+    int ndim() const { return _ndim; }
+    size_t count() const { return _count; }
+
+    // Constructor
+    BroadcastToInfo(int dtype, int ndim, size_t count)
+        : _dtype(dtype), _ndim(ndim), _count(count) {}
+
+    static utils::Result<BroadcastToInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        const std::vector<infiniopTensorDescriptor_t> &input_descs) {
+
+        if (input_descs.size() != 1) {
+            return INFINI_STATUS_BAD_PARAM;
+        }
+        auto input_desc = input_descs[0];
+
+        if (out_desc->dtype() != input_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+        if (out_desc->ndim() < input_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+        if (out_desc->ndim() > MAX_DIM) {
+            return INFINI_STATUS_BAD_PARAM;
+        }
+
+        BroadcastToInfo info(out_desc->dtype(), out_desc->ndim(), 0);
+
+        // 1. Count the output elements and copy the output layout
+        size_t count = 1;
+        for (int i = 0; i < info._ndim; ++i) {
+            info._out_shape[i] = out_desc->shape()[i];
+            info._out_strides[i] = out_desc->strides()[i];
+            count *= out_desc->shape()[i];
+        }
+        info._count = count;
+
+        // 2. Rank alignment and broadcast-rule checks
+        // Rank difference: e.g. out(2,3,4) vs. in(3,4) -> offset = 1
+        int offset = info._ndim - input_desc->ndim();
+
+        for (int i = 0; i < info._ndim; ++i) {
+            // i indexes the output dims; in_i is the matching input dim
+            int in_i = i - offset;
+
+            int64_t out_dim = info._out_shape[i];
+            int64_t in_dim = 1;    // pad with 1 where the input has no dim
+            int64_t in_stride = 0; // stride 0 goes with the padded 1
+
+            if (in_i >= 0) {
+                // The input defines this dimension
+                in_dim = input_desc->shape()[in_i];
+                in_stride = input_desc->strides()[in_i];
+            }
+
+            // Broadcast rule: sizes must match, or the input size must be 1
+            if (in_dim != out_dim && in_dim != 1) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+
+            // Record the aligned layout
+            info._in_shape[i] = in_dim;
+
+            if (in_dim == 1 && out_dim > 1) {
+                info._in_strides[i] = 0;
+            } else {
+                info._in_strides[i] = in_stride;
+            }
+        }
+
+        return utils::Result<BroadcastToInfo>(info);
+    }
+};
+
+} // namespace op::broadcast_to
+
+#endif // __BROADCAST_TO_INFO_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/broadcast_to/metax/broadcast_to_metax.h b/src/infiniop/ops/broadcast_to/metax/broadcast_to_metax.h
new file mode 100644
index 000000000..119686978
--- /dev/null
+++ b/src/infiniop/ops/broadcast_to/metax/broadcast_to_metax.h
@@ -0,0 +1,8 @@
+#ifndef __BROADCAST_TO_METAX_API_H__
+#define __BROADCAST_TO_METAX_API_H__
+
+#include "../broadcast_to.h"
+
+DESCRIPTOR(metax)
+
+#endif // __BROADCAST_TO_METAX_API_H__
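Note: the create() routine above can be summarized as: right-align the input shape under the output shape, pad missing leading dims with size 1 / stride 0, and zero the stride wherever a size-1 dim is stretched. A compact model of that rule (illustrative):

    def align_strides(out_shape, in_shape, in_strides):
        offset = len(out_shape) - len(in_shape)
        aligned = []
        for i, out_dim in enumerate(out_shape):
            j = i - offset
            in_dim, in_s = (in_shape[j], in_strides[j]) if j >= 0 else (1, 0)
            if in_dim != out_dim and in_dim != 1:
                raise ValueError("shape is not broadcastable")
            aligned.append(0 if (in_dim == 1 and out_dim > 1) else in_s)
        return aligned

    # out (2, 3, 4) vs. in (3, 1) with strides (1, 1) -> [0, 1, 0]
    assert align_strides((2, 3, 4), (3, 1), (1, 1)) == [0, 1, 0]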
================================================================== +// 1. Kernel 定义 +// ================================================================== + +// 最大维度需与 BroadcastToInfo::MAX_DIM 保持一致 +static constexpr int MAX_DIM = 8; + +struct BroadcastStrides { + int64_t out_strides[MAX_DIM]; + int64_t in_strides[MAX_DIM]; +}; + +template +__global__ void broadcast_kernel( + T * __restrict__ output, // Output data pointer + const T * __restrict__ input, // Input data pointer + int ndim, + size_t count, // Total elements in output + BroadcastStrides strides) { // Strides passed by value + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < count) { + size_t temp_idx = idx; + size_t input_offset = 0; +#pragma unroll + for (int i = 0; i < MAX_DIM; ++i) { + if (i >= ndim) break; + + int64_t out_s = strides.out_strides[i]; + int64_t in_s = strides.in_strides[i]; + size_t coord = temp_idx / out_s; + temp_idx %= out_s; + input_offset += coord * in_s; + } + + output[idx] = input[input_offset]; + } +} + +// ================================================================== +// 2. Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const BroadcastToInfo& info, + void *stream) { + + auto in_ptr = reinterpret_cast(input); + auto out_ptr = reinterpret_cast(output); + + auto mc_stream = reinterpret_cast(stream); + + BroadcastStrides strides; + for (int i = 0; i < BroadcastToInfo::MAX_DIM; ++i) { + strides.out_strides[i] = info._out_strides[i]; + strides.in_strides[i] = info._in_strides[i]; + } + + size_t count = info.count(); + size_t block_size = 256; + size_t grid_size = (count + block_size - 1) / block_size; + if (grid_size == 0) grid_size = 1; + + broadcast_kernel + <<>>( + out_ptr, + in_ptr, + info.ndim(), + count, + strides + ); +} + +// ================================================================== +// 3. 
Descriptor 实现 +// ================================================================== +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + const std::vector &input_descs) { + + auto handle = reinterpret_cast(handle_); + + auto info_result = BroadcastToInfo::create(out_desc, input_descs); + if (!info_result) { + return info_result.status(); + } + + size_t workspace_size = 0; // broadcast_to 不需要额外 workspace + + *desc_ptr = new Descriptor( + new Opaque(), + info_result.take(), + workspace_size, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const std::vector &inputs, + void *stream) const { + + if (inputs.size() != 1) { + return INFINI_STATUS_BAD_PARAM; + } + const void *input = inputs[0]; + + auto dtype = _info.dtype(); + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel<__half>(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I64: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I32: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_U8: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I8: + launch_kernel(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::broadcast_to::metax diff --git a/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.h b/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.h new file mode 100644 index 000000000..c0401398a --- /dev/null +++ b/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.h @@ -0,0 +1,8 @@ +#ifndef __BROADCAST_TO_MOORE_H__ +#define __BROADCAST_TO_MOORE_H__ + +#include "../broadcast_to.h" + +DESCRIPTOR(moore) + +#endif // __BROADCAST_TO_MOORE_H__ diff --git a/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.mu b/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.mu new file mode 100644 index 000000000..4574737d3 --- /dev/null +++ b/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.mu @@ -0,0 +1,122 @@ +#include "broadcast_to_moore.h" +#include "broadcast_to_moore_kernel.h" +#include "../../../devices/moore/moore_handle.h" +#include +#include +#include + +namespace op::broadcast_to::moore { + +// ================================================================== +// Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const BroadcastToInfo& info, + void *stream) { + + auto in_ptr = reinterpret_cast(input); + auto out_ptr = reinterpret_cast(output); + + auto musa_stream = reinterpret_cast(stream); + + // 复制 strides 到 Kernel 定义的结构体中 + op::broadcast_to::moore::BroadcastStrides strides; + for (int i = 0; i < BroadcastToInfo::MAX_DIM; ++i) { + strides.out_strides[i] = info._out_strides[i]; + strides.in_strides[i] = info._in_strides[i]; + } + + size_t count = info.count(); + size_t block_size = 256; + size_t grid_size = (count + block_size - 1) / block_size; + + 
op::broadcast_to::moore::broadcast_kernel + <<>>( + out_ptr, + in_ptr, + info.ndim(), + count, + strides + ); +} + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + const std::vector &input_descs) { + + auto handle = reinterpret_cast(handle_); + + auto info_result = BroadcastToInfo::create(out_desc, input_descs); + if (!info_result) return info_result.status(); + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + new Opaque(), + info_result.take(), + workspace_size, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const std::vector &inputs, + void *stream) const { + + if (inputs.size() != 1) { + return INFINI_STATUS_BAD_PARAM; + } + const void *input = inputs[0]; + + auto dtype = _info.dtype(); + + // 3. 根据数据类型分发 Kernel + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel<__mt_bfloat16>(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I64: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I32: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_U8: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I8: + launch_kernel(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::broadcast_to::moore \ No newline at end of file diff --git a/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore_kernel.h b/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore_kernel.h new file mode 100644 index 000000000..1e9a401d7 --- /dev/null +++ b/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore_kernel.h @@ -0,0 +1,57 @@ +#ifndef __BROADCAST_TO_MOORE_KERNEL_H__ +#define __BROADCAST_TO_MOORE_KERNEL_H__ + +#include +#include +#include + +#include +#include +#include + +namespace op::broadcast_to::moore { + +// 最大维度定义,需与 BroadcastToInfo 中的保持一致 +static constexpr int MAX_DIM = 8; + +struct BroadcastStrides { + int64_t out_strides[MAX_DIM]; + int64_t in_strides[MAX_DIM]; +}; + +template +__global__ void broadcast_kernel( + T * __restrict__ output, // Output data pointer + const T * __restrict__ input, // Input data pointer + int ndim, + size_t count, // Total elements in output + BroadcastStrides strides) { // Strides passed by value + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < count) { + size_t temp_idx = idx; + size_t input_offset = 0; + + // 坐标变换与偏移计算 + // 将扁平的 output index 转换为多维坐标,再根据 input strides 计算 input offset + #pragma unroll + for (int i = 0; i < MAX_DIM; ++i) { + if (i >= ndim) break; + + int64_t out_s = strides.out_strides[i]; + int64_t in_s = strides.in_strides[i]; + + // 计算当前维度的坐标 + size_t coord = temp_idx / out_s; + temp_idx %= out_s; + input_offset += coord * in_s; + } + + output[idx] = input[input_offset]; + } +} + +} // namespace op::broadcast_to::moore + +#endif // __BROADCAST_TO_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cu 
b/src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cu new file mode 100644 index 000000000..4a1f0c63f --- /dev/null +++ b/src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cu @@ -0,0 +1,114 @@ +#include "broadcast_to_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../../../handle.h" +#include +#include +#include + +namespace op::broadcast_to::nvidia { + +// ================================================================== +// Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const BroadcastToInfo& info, + void *stream) { + auto in_ptr = reinterpret_cast(input); + auto out_ptr = reinterpret_cast(output); + + auto cuda_stream = reinterpret_cast(stream); + op::broadcast_to::cuda::BroadcastStrides strides; + for (int i = 0; i < BroadcastToInfo::MAX_DIM; ++i) { + strides.out_strides[i] = info._out_strides[i]; + strides.in_strides[i] = info._in_strides[i]; + } + size_t count = info.count(); + size_t block_size = 256; + size_t grid_size = (count + block_size - 1) / block_size; + op::broadcast_to::cuda::broadcast_kernel + <<>>( + out_ptr, + in_ptr, + info.ndim(), + count, + strides + ); +} + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + const std::vector &input_descs) { + auto info_result = BroadcastToInfo::create(out_desc, input_descs); + if (!info_result) return info_result.status(); + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + new Opaque(), + info_result.take(), + workspace_size, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const std::vector &inputs, + void *stream) const { + + if (inputs.size() != 1) { + return INFINI_STATUS_BAD_PARAM; + } + const void *input = inputs[0]; + + auto dtype = _info.dtype(); + + // 3. 
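+    // Editor's sketch (assumed element types, mirroring the metax version
+    // of this dispatch): each case below instantiates launch_kernel for
+    // one concrete type, e.g.
+    //   case INFINI_DTYPE_F32:
+    //       launch_kernel<float>(output, input, _info, stream); break;
+    //   case INFINI_DTYPE_I64:
+    //       launch_kernel<int64_t>(output, input, _info, stream); break;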
根据数据类型分发 Kernel + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I64: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I32: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_U8: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I8: + launch_kernel(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::broadcast_to::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cuh b/src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cuh new file mode 100644 index 000000000..62069ca51 --- /dev/null +++ b/src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __BROADCAST_TO_NVIDIA_CUH__ +#define __BROADCAST_TO_NVIDIA_CUH__ + +#include "../broadcast_to.h" + +DESCRIPTOR(nvidia) + +#endif // __BROADCAST_TO_NVIDIA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/broadcast_to/operator.cc b/src/infiniop/ops/broadcast_to/operator.cc new file mode 100644 index 000000000..61e6a9dea --- /dev/null +++ b/src/infiniop/ops/broadcast_to/operator.cc @@ -0,0 +1,176 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/broadcast_to.h" + +// --- 后端实现头文件 --- +#ifdef ENABLE_CPU_API +#include "cpu/broadcast_to_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/broadcast_to_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/broadcast_to_metax.h" +#endif + +#ifdef ENABLE_MOORE_API +#include "moore/broadcast_to_moore.h" +#endif + +extern "C" { + +// ======================================================================= +// 1. 创建算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopCreateBroadcastToDescriptor( + infiniopHandle_t handle, + infiniopBroadcastToDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::broadcast_to::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + +// ======================================================================= +// 2. 
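+// Editor's sketch of one macro expansion (illustrative):
+// CREATE(INFINI_DEVICE_CPU, cpu) above becomes
+//   case INFINI_DEVICE_CPU:
+//       return op::broadcast_to::cpu::Descriptor::create(
+//           handle,
+//           reinterpret_cast<op::broadcast_to::cpu::Descriptor **>(desc_ptr),
+//           y_desc, {x_desc});
+// i.e. one switch arm per enabled backend, with the macro #undef'd after use.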
获取 Workspace 大小 +// ======================================================================= +__C infiniStatus_t infiniopGetBroadcastToWorkspaceSize(infiniopBroadcastToDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 执行计算 (Calculate) +// ======================================================================= +__C infiniStatus_t infiniopBroadcastTo( + infiniopBroadcastToDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + + // 注意:{x} 用于构造 std::vector + #define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CALCULATE +} + +// ======================================================================= +// 4. 
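+// Editor's note: the `{x}` argument above braced-initializes the one-element
+// std::vector<const void *> that Descriptor::calculate expects, bridging the
+// single-pointer C ABI and the vector-based C++ interface.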
销毁算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopDestroyBroadcastToDescriptor(infiniopBroadcastToDescriptor_t desc) { + + #define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef DELETE +} + +} // extern "C" \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.cc b/src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.cc new file mode 100644 index 000000000..55e6c709a --- /dev/null +++ b/src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.cc @@ -0,0 +1,140 @@ +#include "huber_loss_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include +#include +#include + +#include "../../../../utils/custom_types.h" + +namespace op::huber_loss::cpu { + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + _opaque = nullptr; + } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + float delta, + int reduction) { + + auto handle = reinterpret_cast(handle_); + auto result = HuberLossInfo::create(out_desc, input_desc, target_desc, delta, reduction); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + new Opaque(), + result.take(), + 0, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +template +void calculate_cpu_impl( + const HuberLossInfo &info, + void *output, + const void *input, + const void *target) { + + size_t count = info.count(); + float delta = info.delta(); + int reduction = info.reduction(); + + auto out_ptr = reinterpret_cast(output); + auto in_ptr = reinterpret_cast(input); + // Huber Loss 中 target 是数值,类型与 input 一致 + auto tar_ptr = reinterpret_cast(target); + float half_delta = 0.5f * delta; + + if (reduction == 0) { // None + #pragma omp parallel for schedule(static) + for (size_t i = 0; i < count; ++i) { + float val = utils::cast(in_ptr[i]); + float tgt = utils::cast(tar_ptr[i]); + + float diff = val - tgt; + float abs_diff = std::abs(diff); + float loss = 0.0f; + + if (abs_diff < delta) { + // 0.5 * (x - y)^2 + loss = 0.5f * diff * diff; + } else { + // delta * (|x - y| - 0.5 * delta) + loss = delta * (abs_diff - half_delta); + } + + out_ptr[i] = utils::cast(loss); + } + } else { // Mean or Sum + double total_loss = 0.0; + + #pragma omp parallel for reduction(+:total_loss) schedule(static) + for (size_t i = 0; i < count; ++i) { + float val = utils::cast(in_ptr[i]); + float tgt = utils::cast(tar_ptr[i]); + + float diff = val - tgt; + float abs_diff = std::abs(diff); + float loss = 0.0f; + + if (abs_diff < delta) { + loss = 0.5f * diff * diff; + } else { + loss = delta * (abs_diff - half_delta); + } + + total_loss += static_cast(loss); + } + + if (reduction == 1) { // Mean + total_loss /= static_cast(count); + } + + out_ptr[0] = 
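+        // Worked example (editor's addition): with delta = 1.0,
+        // input = 2.5, target = 0.0 -> |diff| = 2.5 >= delta, so
+        // loss = 1.0 * (2.5 - 0.5) = 2.0; with input = 0.3 the quadratic
+        // branch gives 0.5 * 0.3^2 = 0.045. The mean/sum result is
+        // accumulated in double and cast back to T exactly once: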
utils::cast<T>(static_cast<float>(total_loss));
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *target,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    switch (dtype) {
+    case INFINI_DTYPE_F32:
+        cpu::calculate_cpu_impl<float>(_info, output, input, target);
+        break;
+    case INFINI_DTYPE_F64:
+        cpu::calculate_cpu_impl<double>(_info, output, input, target);
+        break;
+    case INFINI_DTYPE_F16:
+        cpu::calculate_cpu_impl<fp16_t>(_info, output, input, target);
+        break;
+    case INFINI_DTYPE_BF16:
+        cpu::calculate_cpu_impl<bf16_t>(_info, output, input, target);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::huber_loss::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.h b/src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.h
new file mode 100644
index 000000000..1a0b45978
--- /dev/null
+++ b/src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __HUBER_LOSS_CPU_H__
+#define __HUBER_LOSS_CPU_H__
+
+#include "../huber_loss.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __HUBER_LOSS_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/huber_loss/cuda/kernel.cuh b/src/infiniop/ops/huber_loss/cuda/kernel.cuh
new file mode 100644
index 000000000..ec48d9a9e
--- /dev/null
+++ b/src/infiniop/ops/huber_loss/cuda/kernel.cuh
@@ -0,0 +1,106 @@
+#ifndef __HUBER_LOSS_CUDA_CUH__
+#define __HUBER_LOSS_CUDA_CUH__
+
+#include
+#include
+#include
+#include
+#include
+
+namespace op::huber_loss::cuda {
+
+__device__ __forceinline__ float warpReduceSum(float val) {
+    unsigned int mask = 0xffffffff;
+    for (int offset = warpSize / 2; offset > 0; offset /= 2)
+        val += __shfl_down_sync(mask, val, offset);
+    return val;
+}
+
+__device__ __forceinline__ float blockReduceSum(float val) {
+    static __shared__ float shared[32];
+    int lane = threadIdx.x % warpSize;
+    int wid = threadIdx.x / warpSize;
+
+    val = warpReduceSum(val);
+    if (lane == 0) shared[wid] = val;
+    __syncthreads();
+
+    val = (threadIdx.x < blockDim.x / warpSize) ?
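+    // Editor's note: with blockDim.x = 256 and warpSize = 32 there are 8
+    // warps, so only shared[0..7] hold valid partial sums after the first
+    // warpReduceSum; the ternary below zero-fills the remaining lanes of
+    // warp 0 before the final warp-level reduction.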
shared[lane] : 0.0f; + if (wid == 0) val = warpReduceSum(val); + return val; +} + +struct HuberLossFunctor { + float delta; + float half_delta; + + __host__ __device__ HuberLossFunctor(float delta_val) + : delta(delta_val), half_delta(0.5f * delta_val) {} + + __device__ __forceinline__ float compute(float input_val, float target_val) const { + float diff = input_val - target_val; + float abs_diff = std::abs(diff); + + if (abs_diff < delta) { + return 0.5f * diff * diff; + } else { + return delta * (abs_diff - half_delta); + } + } +}; + +template +__global__ void huber_loss_kernel( + T * __restrict__ output, + const T * __restrict__ input, + const T * __restrict__ target, + size_t count, + HuberLossFunctor functor) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < count) { + float in_val = static_cast(input[idx]); + float tg_val = static_cast(target[idx]); + + float loss = functor.compute(in_val, tg_val); + + output[idx] = static_cast(loss); + } +} + +template +__global__ void huber_loss_reduce_kernel( + float * output, + const T * __restrict__ input, + const T * __restrict__ target, + size_t count, + HuberLossFunctor functor, + float scale +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + float local_sum = 0.0f; + + for (size_t i = idx; i < count; i += stride) { + float in_val = static_cast(input[i]); + float tg_val = static_cast(target[i]); + + local_sum += functor.compute(in_val, tg_val); + } + + float block_sum = blockReduceSum(local_sum); + + if (threadIdx.x == 0) { + atomicAdd(output, block_sum * scale); + } +} + +template +__global__ void cast_float_to_t(T* output, const float* src) { + *output = static_cast(*src); +} + +} // namespace op::huber_loss::cuda + +#endif // __HUBER_LOSS_CUDA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/huber_loss.h b/src/infiniop/ops/huber_loss/huber_loss.h new file mode 100644 index 000000000..be602ca1d --- /dev/null +++ b/src/infiniop/ops/huber_loss/huber_loss.h @@ -0,0 +1,49 @@ +#ifndef __HUBER_LOSS_H__ +#define __HUBER_LOSS_H__ + +#include "../../operator.h" +#include "info.h" // 引用对应的 HuberLossInfo 定义 +#define DESCRIPTOR(NAMESPACE) \ + namespace op::huber_loss::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + HuberLossInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + HuberLossInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t target_desc, \ + float delta, \ + int reduction); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + const void *target, \ + void *stream) const; \ + }; \ + } + +#endif // __HUBER_LOSS_H__ \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/info.h b/src/infiniop/ops/huber_loss/info.h new file mode 100644 index 000000000..fcfe09b72 --- /dev/null +++ b/src/infiniop/ops/huber_loss/info.h @@ -0,0 +1,79 @@ +#ifndef __HUBER_LOSS_INFO_H__ +#define __HUBER_LOSS_INFO_H__ + +#include 
"../../../utils.h" +#include "../../tensor.h" + +namespace op::huber_loss { + +class HuberLossInfo { + HuberLossInfo() = default; + +public: + int _dtype; + float _delta; + int _reduction; + size_t _count; + + int dtype() const { return _dtype; } + float delta() const { return _delta; } + int reduction() const { return _reduction; } + size_t count() const { return _count; } + + HuberLossInfo(int dtype, float delta, int reduction, size_t count) + : _dtype(dtype), _delta(delta), _reduction(reduction), _count(count) {} + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + float delta, + int reduction) { + + if (input_desc->ndim() != target_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t total_count = input_desc->numel(); + if (target_desc->numel() != total_count) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + for (size_t i = 0; i < input_desc->ndim(); ++i) { + if (input_desc->shape()[i] != target_desc->shape()[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + if (input_desc->dtype() != target_desc->dtype() || + input_desc->dtype() != out_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + if (reduction == 0) { + if (out_desc->ndim() != input_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < out_desc->ndim(); ++i) { + if (out_desc->shape()[i] != input_desc->shape()[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + } else { + if (out_desc->numel() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + return utils::Result(HuberLossInfo{ + input_desc->dtype(), + delta, + reduction, + total_count + }); + } +}; + +} // namespace op::huber_loss + +#endif // __HUBER_LOSS_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/metax/huber_loss_metax.h b/src/infiniop/ops/huber_loss/metax/huber_loss_metax.h new file mode 100644 index 000000000..7091e9bef --- /dev/null +++ b/src/infiniop/ops/huber_loss/metax/huber_loss_metax.h @@ -0,0 +1,8 @@ +#ifndef __HUBER_LOSS_METAX_API_H__ +#define __HUBER_LOSS_METAX_API_H__ + +#include "../huber_loss.h" + +DESCRIPTOR(metax) + +#endif // __HUBER_LOSS_METAX_API_H__ diff --git a/src/infiniop/ops/huber_loss/metax/huber_loss_metax.maca b/src/infiniop/ops/huber_loss/metax/huber_loss_metax.maca new file mode 100644 index 000000000..81aad96f3 --- /dev/null +++ b/src/infiniop/ops/huber_loss/metax/huber_loss_metax.maca @@ -0,0 +1,273 @@ +#include "huber_loss_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include + +#include +#include +#include +#include + +#include +#include +using nv_bfloat16 = __maca_bfloat16; +using nv_bfloat162 = __maca_bfloat162; + +namespace op::huber_loss::metax { + +// ================================================================== +// 1. Functor: 核心数学逻辑 +// ================================================================== +struct HuberLossFunctor { + float delta; + float half_delta; + + __host__ __device__ HuberLossFunctor(float delta_val) + : delta(delta_val), half_delta(0.5f * delta_val) {} + + __device__ __forceinline__ float compute(float input_val, float target_val) const { + float diff = input_val - target_val; + float abs_diff = fabsf(diff); + + if (abs_diff < delta) { + return 0.5f * diff * diff; + } else { + return delta * (abs_diff - half_delta); + } + } +}; + +// ================================================================== +// 2. 
Kernel 定义 +// ================================================================== + +// ------------------------------------------------------------------ +// Kernel 1: Elementwise 模式 (Reduction = None) +// 输出形状同 input/target,长度为 count +// ------------------------------------------------------------------ +template +__global__ void huber_loss_kernel( + T * __restrict__ output, // [count] + const T * __restrict__ input, // [count] + const T * __restrict__ target, // [count] + size_t count, // 元素个数 + HuberLossFunctor functor) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < count) { + float in_val = static_cast(input[idx]); + float tg_val = static_cast(target[idx]); + + float loss = functor.compute(in_val, tg_val); + + output[idx] = static_cast(loss); + } +} + +// ------------------------------------------------------------------ +// Kernel 2: Reduction 模式 (Mean / Sum) +// ------------------------------------------------------------------ +template +__global__ void huber_loss_reduce_kernel( + float * output, // [1] Accumulator (Float) + const T * __restrict__ input, // [count] + const T * __restrict__ target, // [count] + size_t count, // 元素个数 + HuberLossFunctor functor, + float scale // Mean: 1/count, Sum: 1.0 +) { + __shared__ volatile float shared_mem[256]; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + float local_sum = 0.0f; + + // 1. Grid-Stride Loop 计算本线程负责的 loss 累加 + for (size_t i = idx; i < count; i += stride) { + float in_val = static_cast(input[i]); + float tg_val = static_cast(target[i]); + + local_sum += functor.compute(in_val, tg_val); + } + + // 2. 写入 shared memory + unsigned int tid = threadIdx.x; + if (tid < 256) { + shared_mem[tid] = local_sum; + } + __syncthreads(); + + // 3. Block 内树形归约 (Unrolled Reduction) + if (tid < 128) { shared_mem[tid] += shared_mem[tid + 128]; } __syncthreads(); + if (tid < 64) { shared_mem[tid] += shared_mem[tid + 64]; } __syncthreads(); + if (tid < 32) { shared_mem[tid] += shared_mem[tid + 32]; } __syncthreads(); + if (tid < 16) { shared_mem[tid] += shared_mem[tid + 16]; } __syncthreads(); + if (tid < 8) { shared_mem[tid] += shared_mem[tid + 8]; } __syncthreads(); + if (tid < 4) { shared_mem[tid] += shared_mem[tid + 4]; } __syncthreads(); + if (tid < 2) { shared_mem[tid] += shared_mem[tid + 2]; } __syncthreads(); + if (tid < 1) { shared_mem[tid] += shared_mem[tid + 1]; } __syncthreads(); + + // 4. Block 级结果累加到全局 output + if (tid == 0) { + float block_sum = shared_mem[0]; + atomicAdd(output, block_sum * scale); + } +} + +// ------------------------------------------------------------------ +// Kernel 3: 类型转换 (Float -> T) +// ------------------------------------------------------------------ +template +__global__ void cast_float_to_t(T* output, const float* src) { + *output = static_cast(*src); +} + +// ================================================================== +// 3. 
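+// Editor's note on the unrolled reduction above: the active-thread count
+// halves each step (128, 64, ..., 1), so shared_mem[0] ends up holding the
+// sum of all 256 per-thread partials; the __syncthreads() after every step
+// is conservative but keeps the volatile shared-memory traffic ordered.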
Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const void *target, + void* workspace, + const HuberLossInfo& info, + void *stream) { + + auto in_ptr = reinterpret_cast(input); + auto tar_ptr = reinterpret_cast(target); + auto out_ptr = reinterpret_cast(output); + + auto mc_stream = reinterpret_cast(stream); + + size_t count = info.count(); + int reduction = info.reduction(); + + HuberLossFunctor functor(info.delta()); + + // -------------------------------------------------------------- + // Mode 1: Elementwise (Reduction = None) + // -------------------------------------------------------------- + if (reduction == 0) { + size_t block_size = 256; + size_t grid_size = (count + block_size - 1) / block_size; + if (grid_size == 0) grid_size = 1; + + huber_loss_kernel + <<>>( + out_ptr, in_ptr, tar_ptr, count, functor + ); + } + // -------------------------------------------------------------- + // Mode 2: Reduction (Mean / Sum) + // -------------------------------------------------------------- + else { + // workspace 用作浮点累加器 + float* acc_ptr = reinterpret_cast(workspace); + mcMemsetAsync(acc_ptr, 0, sizeof(float), mc_stream); + + float scale = (reduction == 1) + ? (1.0f / static_cast(count)) // reduction == 1: mean + : 1.0f; // reduction == 2: sum + + size_t block_size = 256; + size_t grid_size = std::min( + (count + block_size - 1) / block_size, + static_cast(1024) + ); + if (grid_size == 0) grid_size = 1; + + huber_loss_reduce_kernel + <<>>( + acc_ptr, in_ptr, tar_ptr, count, functor, scale + ); + + // 将 float 标量结果转换回目标类型 T + cast_float_to_t + <<<1, 1, 0, mc_stream>>>(out_ptr, acc_ptr); + } +} + +// ================================================================== +// 4. 
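+// Editor's summary of the reduction path above: (1) mcMemsetAsync zeroes the
+// 4-byte float accumulator in workspace, (2) huber_loss_reduce_kernel
+// atomically adds each block's scaled partial sum, (3) cast_float_to_t turns
+// the float scalar into T with a single-thread kernel; all three are
+// enqueued on one stream, so no extra host synchronization is needed.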
Descriptor Implementation +// ================================================================== +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + float delta, + int reduction) { + + auto handle = reinterpret_cast(handle_); + + auto info_result = HuberLossInfo::create( + out_desc, input_desc, target_desc, delta, reduction + ); + if (!info_result) { + return info_result.status(); + } + + // reduction 模式下需要一个 float workspace(用于 atomicAdd 累加) + size_t workspace_size = 0; + if (reduction != 0) { + workspace_size = sizeof(float); + } + + *desc_ptr = new Descriptor( + new Opaque(), + info_result.take(), + workspace_size, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + void *stream) const { + + auto dtype = _info.dtype(); + int reduction = _info.reduction(); + + if (reduction != 0 && workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel<__half>(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, target, workspace, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::huber_loss::metax diff --git a/src/infiniop/ops/huber_loss/moore/huber_loss_moore.h b/src/infiniop/ops/huber_loss/moore/huber_loss_moore.h new file mode 100644 index 000000000..a3a2b4dde --- /dev/null +++ b/src/infiniop/ops/huber_loss/moore/huber_loss_moore.h @@ -0,0 +1,8 @@ +#ifndef __HUBER_LOSS_MOORE_H__ +#define __HUBER_LOSS_MOORE_H__ + +#include "../huber_loss.h" + +DESCRIPTOR(moore) + +#endif // __HUBER_LOSS_MOORE_H__ diff --git a/src/infiniop/ops/huber_loss/moore/huber_loss_moore.mu b/src/infiniop/ops/huber_loss/moore/huber_loss_moore.mu new file mode 100644 index 000000000..7d56380ba --- /dev/null +++ b/src/infiniop/ops/huber_loss/moore/huber_loss_moore.mu @@ -0,0 +1,153 @@ +#include "huber_loss_moore.h" +#include "huber_loss_moore_kernel.h" +#include "../../../devices/moore/moore_handle.h" +#include +#include + +namespace op::huber_loss::moore { + +template +static inline bool is_aligned(const void *ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; +} + +// ================================================================== +// Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const void *target, + void* workspace, + const HuberLossInfo& info, + void *stream) { + + // 1. 准备指针 + auto in_ptr = reinterpret_cast(input); + auto out_ptr = reinterpret_cast(output); + // Huber Loss 中 Target 类型与 Input 一致 + auto tar_ptr = reinterpret_cast(target); + + auto musa_stream = reinterpret_cast(stream); + + // 2. 
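+    // Editor's note: the reduce kernels cap grid_size at 1024 and use a
+    // grid-stride loop, so each thread may fold several elements into
+    // local_sum before the block-level reduction; this bounds the number
+    // of atomicAdd calls on the single float accumulator.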
准备参数 + size_t count = info.count(); + int reduction = info.reduction(); + + op::huber_loss::moore::HuberLossFunctor functor(info.delta()); + + // ------------------------------------------ + // 模式 1: Elementwise (Reduction = None) + // ------------------------------------------ + if (reduction == 0) { + // 每个线程处理一个元素 + size_t block_size = 256; + size_t grid_size = (count + block_size - 1) / block_size; + + op::huber_loss::moore::huber_loss_kernel + <<>>( + out_ptr, in_ptr, tar_ptr, count, functor + ); + } + // ------------------------------------------ + // 模式 2: Reduction (Mean / Sum) + // ------------------------------------------ + else { + // 使用 workspace 作为临时的 float 累加器 + float* acc_ptr = reinterpret_cast(workspace); + musaMemsetAsync(acc_ptr, 0, sizeof(float), musa_stream); + + // 1=Mean, 2=Sum + float scale = (reduction == 1) ? (1.0f / static_cast(count)) : 1.0f; + + size_t block_size = 256; + size_t grid_size = std::min((count + block_size - 1) / block_size, static_cast(1024)); + + op::huber_loss::moore::huber_loss_reduce_kernel + <<>>( + acc_ptr, in_ptr, tar_ptr, count, functor, scale + ); + + // 将 float 累加结果转回 T 写入 output + op::huber_loss::moore::cast_float_to_t + <<<1, 1, 0, musa_stream>>>(out_ptr, acc_ptr); + } +} + +// ================================================================== +// Descriptor 实现 +// ================================================================== +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + float delta, + int reduction) { + + auto handle = reinterpret_cast(handle_); + + auto info_result = HuberLossInfo::create(out_desc, input_desc, target_desc, delta, reduction); + if (!info_result) return info_result.status(); + + size_t workspace_size = 0; + if (reduction != 0) { + workspace_size = sizeof(float); + } + + *desc_ptr = new Descriptor( + new Opaque(), + info_result.take(), + workspace_size, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + void *stream) const { + + auto dtype = _info.dtype(); + int reduction = _info.reduction(); + + // 检查 workspace 是否够用 + if (reduction != 0 && workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel<__mt_bfloat16>(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, target, workspace, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::huber_loss::moore \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/moore/huber_loss_moore_kernel.h b/src/infiniop/ops/huber_loss/moore/huber_loss_moore_kernel.h new file mode 100644 index 000000000..d1e96879c --- /dev/null +++ b/src/infiniop/ops/huber_loss/moore/huber_loss_moore_kernel.h @@ -0,0 +1,141 @@ +#ifndef __HUBER_LOSS_MOORE_KERNEL_H__ +#define __HUBER_LOSS_MOORE_KERNEL_H__ + +#include +#include 
+#include +#include +#include +#include + +namespace op::huber_loss::moore { +template +__device__ __forceinline__ float to_float(T val) { + if constexpr (std::is_same_v) { + return __half2float(val); + } else if constexpr (std::is_same_v) { + return __bfloat162float(val); + } else { + return static_cast(val); + } +} + +template +__device__ __forceinline__ T from_float(float val) { + if constexpr (std::is_same_v) { + return __float2half(val); + } else if constexpr (std::is_same_v) { + return __float2bfloat16(val); + } else { + return static_cast(val); + } +} +__device__ __forceinline__ float warpReduceSum(float val) { + unsigned int mask = 0xffffffff; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += __shfl_down_sync(mask, val, offset); + return val; +} + +__device__ __forceinline__ float blockReduceSum(float val) { + static __shared__ float shared[32]; // Max 1024 threads / 32 warps + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + + val = warpReduceSum(val); + if (lane == 0) shared[wid] = val; + __syncthreads(); + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0f; + if (wid == 0) val = warpReduceSum(val); + return val; +} + +struct HuberLossFunctor { + float delta; + float half_delta; // 预计算 0.5 * delta + + __host__ __device__ HuberLossFunctor(float delta_val) + : delta(delta_val), half_delta(0.5f * delta_val) {} + + // Huber Loss 计算: + // if |x - y| < delta: 0.5 * (x - y)^2 + // else: delta * (|x - y| - 0.5 * delta) + __device__ __forceinline__ float compute(float input_val, float target_val) const { + float diff = input_val - target_val; + float abs_diff = std::abs(diff); + + if (abs_diff < delta) { + return 0.5f * diff * diff; + } else { + return delta * (abs_diff - half_delta); + } + } +}; + +// ================================================================== +// Kernel 1: Reduction = None (Element-wise output) +// ================================================================== +template +__global__ void huber_loss_kernel( + T * __restrict__ output, // [N] + const T * __restrict__ input, // [N] + const T * __restrict__ target, // [N] + size_t count, // Total elements (numel) + HuberLossFunctor functor) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < count) { + float in_val = to_float(input[idx]); + float tg_val = to_float(target[idx]); + + float loss = functor.compute(in_val, tg_val); + + output[idx] = from_float(loss); + } +} + +// ================================================================== +// Kernel 2: Reduction = Mean / Sum (Scalar output) +// ================================================================== +template +__global__ void huber_loss_reduce_kernel( + float * output, // [1] Accumulator (Float) + const T * __restrict__ input, // [N] + const T * __restrict__ target, // [N] + size_t count, // Total elements + HuberLossFunctor functor, + float scale // Mean模式传 1/N, Sum模式传 1.0 +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + float local_sum = 0.0f; + + // Grid-Stride Loop over all elements + for (size_t i = idx; i < count; i += stride) { + float in_val = to_float(input[i]); + float tg_val = to_float(target[i]); + + local_sum += functor.compute(in_val, tg_val); + } + + // Block Reduction + float block_sum = blockReduceSum(local_sum); + + // Global Atomic Add (Reduce to scalar) + if (threadIdx.x == 0) { + atomicAdd(output, block_sum * scale); + } +} + +// ================================================================== +// 
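+// Editor's recap of HuberLossFunctor::compute above:
+//   loss(x, y) = 0.5 * (x - y)^2                  if |x - y| < delta
+//              = delta * (|x - y| - 0.5 * delta)  otherwise
+// The two branches meet with equal value and slope at |x - y| = delta,
+// which is what makes the Huber loss once-differentiable everywhere.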
Helper: Cast float result to T (used for scalar output) +// ================================================================== +template +__global__ void cast_float_to_t(T* output, const float* src) { + *output = from_float(*src); +} + +} // namespace op::huber_loss::moore + +#endif // __HUBER_LOSS_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cu b/src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cu new file mode 100644 index 000000000..4f1d9bd59 --- /dev/null +++ b/src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cu @@ -0,0 +1,119 @@ +#include "huber_loss_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../../../handle.h" +#include +#include + +namespace op::huber_loss::nvidia { +template +static inline bool is_aligned(const void *ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; +} + +template +void launch_kernel( + void *output, + const void *input, + const void *target, + void* workspace, + const HuberLossInfo& info, + void *stream) { + + auto in_ptr = reinterpret_cast(input); + auto out_ptr = reinterpret_cast(output); + auto tar_ptr = reinterpret_cast(target); + + auto cuda_stream = reinterpret_cast(stream); + + size_t count = info.count(); + int reduction = info.reduction(); + + op::huber_loss::cuda::HuberLossFunctor functor(info.delta()); + + if (reduction == 0) { + size_t block_size = 256; + size_t grid_size = (count + block_size - 1) / block_size; + + op::huber_loss::cuda::huber_loss_kernel + <<>>( + out_ptr, in_ptr, tar_ptr, count, functor + ); + } + else { + float* acc_ptr = reinterpret_cast(workspace); + cudaMemsetAsync(acc_ptr, 0, sizeof(float), cuda_stream); + float scale = (reduction == 1) ? (1.0f / static_cast(count)) : 1.0f; + + size_t block_size = 256; + size_t grid_size = std::min((count + block_size - 1) / block_size, static_cast(1024)); + + op::huber_loss::cuda::huber_loss_reduce_kernel + <<>>( + acc_ptr, in_ptr, tar_ptr, count, functor, scale + ); + op::huber_loss::cuda::cast_float_to_t + <<<1, 1, 0, cuda_stream>>>(out_ptr, acc_ptr); + } +} + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + float delta, + int reduction) { + + auto info_result = HuberLossInfo::create(out_desc, input_desc, target_desc, delta, reduction); + if (!info_result) return info_result.status(); + size_t workspace_size = 0; + if (reduction != 0) { + workspace_size = sizeof(float); + } + + *desc_ptr = new Descriptor(new Opaque(), info_result.take(), workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + void *stream) const { + + auto dtype = _info.dtype(); + int reduction = _info.reduction(); + + if (reduction != 0 && workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, 
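+        // (editor's note: even for double inputs the reduce path
+        //  accumulates into a float workspace scalar, so mean/sum results
+        //  in F64 mode are limited to float precision)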
input, target, workspace, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::huber_loss::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cuh b/src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cuh new file mode 100644 index 000000000..90f76b2b1 --- /dev/null +++ b/src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HUBER_LOSS_NVIDIA_CUH__ +#define __HUBER_LOSS_NVIDIA_CUH__ + +#include "../huber_loss.h" + +DESCRIPTOR(nvidia) + +#endif // __HUBER_LOSS_NVIDIA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/operator.cc b/src/infiniop/ops/huber_loss/operator.cc new file mode 100644 index 000000000..7db651f19 --- /dev/null +++ b/src/infiniop/ops/huber_loss/operator.cc @@ -0,0 +1,180 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/huber_loss.h" + +// --- 后端实现头文件 --- +#ifdef ENABLE_CPU_API +#include "cpu/huber_loss_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/huber_loss_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/huber_loss_metax.h" +#endif + +#ifdef ENABLE_MOORE_API +#include "moore/huber_loss_moore.h" +#endif + +extern "C" { + +// ======================================================================= +// 1. 创建算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopCreateHuberLossDescriptor( + infiniopHandle_t handle, + infiniopHuberLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t target, + float delta, + int reduction) { + + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::huber_loss::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + input, \ + target, \ + delta, \ + reduction) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + +// ======================================================================= +// 2. 
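+// Editor's usage sketch (hypothetical handles, error handling elided):
+//   infiniopHuberLossDescriptor_t desc;
+//   infiniopCreateHuberLossDescriptor(handle, &desc, out_d, in_d, tgt_d,
+//                                     /*delta=*/1.0f, /*reduction=*/1);
+//   size_t ws_size = 0;
+//   infiniopGetHuberLossWorkspaceSize(desc, &ws_size);
+//   infiniopHuberLoss(desc, ws_ptr, ws_size, out, in, tgt, stream);
+//   infiniopDestroyHuberLossDescriptor(desc);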
获取 Workspace 大小 +// ======================================================================= +__C infiniStatus_t infiniopGetHuberLossWorkspaceSize(infiniopHuberLossDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 执行计算 (Calculate) +// ======================================================================= +__C infiniStatus_t infiniopHuberLoss( + infiniopHuberLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + void *stream) { + + #define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, target, stream) + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyHuberLossDescriptor(infiniopHuberLossDescriptor_t desc) { + + #define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef DELETE +} + +} // extern "C" \ No newline at end of file diff --git a/src/infiniop/ops/softplus/cpu/softplus_cpu.cc b/src/infiniop/ops/softplus/cpu/softplus_cpu.cc index 4272bc37d..e41ab4e47 100644 --- a/src/infiniop/ops/softplus/cpu/softplus_cpu.cc +++ b/src/infiniop/ops/softplus/cpu/softplus_cpu.cc @@ -1,52 +1,135 @@ #include "softplus_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include +#include +#include +#include + +#include "../../../../utils/custom_types.h" namespace op::softplus::cpu { -Descriptor::~Descriptor() = default; +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + _opaque = nullptr; + } +} infiniStatus_t Descriptor::create( infiniopHandle_t handle_, Descriptor **desc_ptr, infiniopTensorDescriptor_t out_desc, - std::vector 
<infiniopTensorDescriptor_t> input_desc_vec) {
+    infiniopTensorDescriptor_t input_desc,
+    float beta,
+    float threshold) {
 
     auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
+    auto result = SoftplusInfo::create(out_desc, input_desc, beta, threshold);
+
+    if (!result) {
+        return result.status();
+    }
+
+    // 2. Create the descriptor
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// -----------------------------------------------------------------------
+// Core computation: supports strided (non-contiguous) memory
+// -----------------------------------------------------------------------
+template <typename T>
+void calculate_cpu_impl(
+    const SoftplusInfo &info,
+    void *output,
+    const void *input) {
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
+    size_t num_elements = info.num_elements();
+    float beta = info.beta();
+    float threshold = info.threshold();
+
+    // Fetch the memory-layout info (relies on the updated SoftplusInfo)
+    bool is_contiguous = info.is_contiguous();
+    int ndim = info.ndim();
+    const auto& shape = info.shape();
+    const auto& strides = info.strides();
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
-    CHECK_SAME_SHAPE(y_shape, x_shape);
+    #pragma omp parallel for schedule(static)
+    for (size_t i = 0; i < num_elements; ++i) {
+        // 1. Compute the input offset
+        size_t input_offset = i; // defaults to the linear index
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+        if (!is_contiguous) {
+            // Non-contiguous memory needs a coordinate transform:
+            // linear index -> coordinates -> physical offset
+            input_offset = 0;
+            size_t temp_idx = i;
+            for (int d = ndim - 1; d >= 0; --d) {
+                size_t dim_size = shape[d];
+                size_t coord = temp_idx % dim_size;
+                temp_idx /= dim_size;
+                input_offset += coord * strides[d];
+            }
+        }
+        using CalcType = std::conditional_t<std::is_same_v<T, double>, double, float>;
+
+        CalcType x = utils::cast<CalcType>(in_ptr[input_offset]);
+        CalcType b = static_cast<CalcType>(beta);
+        CalcType t = static_cast<CalcType>(threshold);
+
+        CalcType bx = b * x;
+        CalcType result;
-    return INFINI_STATUS_SUCCESS;
+        // 3. Compute softplus
+        if (bx > t) {
+            result = x;
+        } else {
+            result = std::log1p(std::exp(bx)) / b;
+        }
+        out_ptr[i] = utils::cast<T>(result);
+    }
 }
 
 infiniStatus_t Descriptor::calculate(
     void *workspace,
     size_t workspace_size,
     void *output,
-    std::vector<const void *> inputs,
+    const void *input,
     void *stream) const {
+
+    auto dtype = _info.dtype();
 
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<SoftplusOp, fp16_t>(_info, output, inputs, stream);
+    switch (dtype) {
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<SoftplusOp, float>(_info, output, inputs, stream);
+        calculate_cpu_impl<float>(_info, output, input);
+        break;
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<SoftplusOp, double>(_info, output, inputs, stream);
+        calculate_cpu_impl<double>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F16:
+        calculate_cpu_impl<fp16_t>(_info, output, input);
+        break;
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<SoftplusOp, bf16_t>(_info, output, inputs, stream);
+        calculate_cpu_impl<bf16_t>(_info, output, input);
+        break;
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
 
     return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::softplus::cpu
+
+} // namespace op::softplus::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/softplus/cpu/softplus_cpu.h b/src/infiniop/ops/softplus/cpu/softplus_cpu.h
index ce00fe1a3..9ab0b7ed1 100644
--- a/src/infiniop/ops/softplus/cpu/softplus_cpu.h
+++ b/src/infiniop/ops/softplus/cpu/softplus_cpu.h
@@ -1,23 +1,8 @@
 #ifndef __SOFTPLUS_CPU_H__
 #define __SOFTPLUS_CPU_H__
 
-#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include "../softplus.h"
 
-ELEMENTWISE_DESCRIPTOR(softplus, cpu)
+DESCRIPTOR(cpu)
 
-namespace op::softplus::cpu {
-typedef struct SoftplusOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    T operator()(const T &x) const {
-        if (x > T(20)) {
-            return x;
-        } else {
-            return std::log(T(1) + std::exp(x));
-        }
-    }
-} SoftplusOp;
-} // namespace op::softplus::cpu
-
-#endif // __SOFTPLUS_CPU_H__
+#endif // __SOFTPLUS_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/softplus/cuda/kernel.cuh b/src/infiniop/ops/softplus/cuda/kernel.cuh
index 33cef97db..c045dbabf 100644
--- a/src/infiniop/ops/softplus/cuda/kernel.cuh
+++ b/src/infiniop/ops/softplus/cuda/kernel.cuh
@@ -1,34 +1,66 @@
 #ifndef __SOFTPLUS_CUDA_H__
 #define __SOFTPLUS_CUDA_H__
 
+#include
+#include
+#include
+#include
+
 namespace op::softplus::cuda {
+
 typedef struct SoftplusOp {
 public:
     static constexpr size_t num_inputs = 1;
     template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
+    __device__ __forceinline__ T operator()(const T &x, float beta, float threshold) const {
+
         if constexpr (std::is_same_v<T, half>) {
-            // promote to float for stability, then cast back
             float xf = __half2float(x);
-            float out = (xf > 20.0f) ? xf : log1pf(expf(xf));
+            float bx = beta * xf;
+            float out = (bx > threshold) ? xf : log1pf(expf(bx)) / beta;
             return __float2half(out);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+        }
+        else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
             float xf = __bfloat162float(x);
-            float out = (xf > 20.0f) ? xf : log1pf(expf(xf));
+            float bx = beta * xf;
+            float out = (bx > threshold) ? xf : log1pf(expf(bx)) / beta;
             return __float2bfloat16(out);
-        } else if constexpr (std::is_same_v<T, half2>) {
-            // process as two lanes
+        }
+        else if constexpr (std::is_same_v<T, half2>) {
             float2 xf = __half22float2(x);
-            xf.x = (xf.x > 20.0f) ? xf.x : log1pf(expf(xf.x));
-            xf.y = (xf.y > 20.0f) ? xf.y : log1pf(expf(xf.y));
-            return __floats2half2_rn(xf.x, xf.y);
-        } else {
-            // default: float, double, etc.
-            return (x > T(20)) ?
x : log1p(exp(x)); + float2 out; + + float bx_x = beta * xf.x; + out.x = (bx_x > threshold) ? xf.x : log1pf(expf(bx_x)) / beta; + + float bx_y = beta * xf.y; + out.y = (bx_y > threshold) ? xf.y : log1pf(expf(bx_y)) / beta; + + return __floats2half2_rn(out.x, out.y); + } + else { + using CalcType = std::conditional_t, double, float>; + + CalcType x_val = static_cast(x); + CalcType b_val = static_cast(beta); + CalcType t_val = static_cast(threshold); + + CalcType bx = b_val * x_val; + + if (bx > t_val) { + return static_cast(x_val); + } else { + if constexpr (std::is_same_v) { + return static_cast(::log1p(::exp(bx)) / b_val); + } else { + return static_cast(::log1pf(::expf(bx)) / b_val); + } + } } } } SoftplusOp; + } // namespace op::softplus::cuda -#endif // __SOFTPLUS_CUDA_H__ +#endif // __SOFTPLUS_CUDA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/softplus/info.h b/src/infiniop/ops/softplus/info.h new file mode 100644 index 000000000..2eddf2864 --- /dev/null +++ b/src/infiniop/ops/softplus/info.h @@ -0,0 +1,106 @@ +#ifndef __SOFTPLUS_INFO_H__ +#define __SOFTPLUS_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include +#include // for std::equal + +namespace op::softplus { + +class SoftplusInfo { + SoftplusInfo() = default; + +public: + int _dtype; // 数据类型 + float _beta; // 缩放参数 + float _threshold; // 阈值参数 + size_t _num_elements; // 元素总数 + + // [新增] 内存布局信息 + bool _is_contiguous; // 是否内存连续 + std::vector _shape; // 形状 + std::vector _strides; // 步长 + + int dtype() const { return _dtype; } + float beta() const { return _beta; } + float threshold() const { return _threshold; } + size_t num_elements() const { return _num_elements; } + + // [新增] Getters + bool is_contiguous() const { return _is_contiguous; } + const std::vector& shape() const { return _shape; } + const std::vector& strides() const { return _strides; } + int ndim() const { return _shape.size(); } + + // 构造函数 + SoftplusInfo(int dtype, float beta, float threshold, size_t num_elements, + bool is_contiguous, std::vector shape, std::vector strides) + : _dtype(dtype), _beta(beta), _threshold(threshold), _num_elements(num_elements), + _is_contiguous(is_contiguous), _shape(std::move(shape)), _strides(std::move(strides)) {} + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + float beta, + float threshold) { + + if (out_desc->dtype() != input_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + if (out_desc->ndim() != input_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 1. 检查形状一致性并计算元素总数 + // 注意:这里假设 shape() 返回的是支持下标访问的容器 (如 vector) + auto out_shape = out_desc->shape(); + auto in_shape = input_desc->shape(); + size_t num_elements = 1; + + for (size_t i = 0; i < input_desc->ndim(); ++i) { + if (out_shape[i] != in_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + num_elements *= in_shape[i]; + } + + // 2. [关键修复] 提取形状和步长 + // input_desc->shape() 返回 vector,必须用迭代器初始化,不能用指针加法 + auto in_strides = input_desc->strides(); + + // 使用迭代器区间构造,同时自动处理从 uint64 到 int64 的隐式转换 + std::vector shape(in_shape.begin(), in_shape.end()); + std::vector strides(in_strides.begin(), in_strides.end()); + + int ndim = shape.size(); + + // 3. 
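+        // Editor's worked example: shape [2, 3] with strides [3, 1] passes
+        // the check below (expected strides built right-to-left: 1, then
+        // 3); a transposed view with strides [1, 2] fails at d = 1 and
+        // takes the strided kernel path instead.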
检查连续性 + // 从后往前检查:stride[i] == stride[i+1] * shape[i+1] + bool is_contiguous = true; + int64_t expected_stride = 1; + for (int i = ndim - 1; i >= 0; --i) { + if (shape[i] > 1) { + if (strides[i] != expected_stride) { + is_contiguous = false; + break; + } + expected_stride *= shape[i]; + } + } + + return utils::Result(SoftplusInfo{ + input_desc->dtype(), // _dtype + beta, // _beta + threshold, // _threshold + num_elements, // _num_elements + is_contiguous, // _is_contiguous + shape, // _shape + strides // _strides + }); + } +}; + +} // namespace op::softplus + +#endif // __SOFTPLUS_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/softplus/metax/softplus_metax.h b/src/infiniop/ops/softplus/metax/softplus_metax.h index 8da2b4d76..c0d1c7a13 100644 --- a/src/infiniop/ops/softplus/metax/softplus_metax.h +++ b/src/infiniop/ops/softplus/metax/softplus_metax.h @@ -1,8 +1,8 @@ #ifndef __SOFTPLUS_METAX_API_H__ #define __SOFTPLUS_METAX_API_H__ -#include "../../../elementwise/metax/elementwise_metax_api.h" +#include "../softplus.h" -ELEMENTWISE_DESCRIPTOR(softplus, metax) +DESCRIPTOR(metax) #endif // __SOFTPLUS_METAX_API_H__ diff --git a/src/infiniop/ops/softplus/metax/softplus_metax.maca b/src/infiniop/ops/softplus/metax/softplus_metax.maca index 5744f8c04..16cbd6295 100644 --- a/src/infiniop/ops/softplus/metax/softplus_metax.maca +++ b/src/infiniop/ops/softplus/metax/softplus_metax.maca @@ -1,32 +1,186 @@ #include "softplus_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" -#include "../../../elementwise/metax/elementwise_metax.h" +#include +#include +#include -#include "../cuda/kernel.cuh" +#include +#include +#include +#include + +using nv_bfloat16 = __maca_bfloat16; +using nv_bfloat162 = __maca_bfloat162; namespace op::softplus::metax { -Descriptor::~Descriptor() = default; +struct SoftplusOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x, float beta, float threshold) const { + if constexpr (std::is_same_v) { + float xf = __half2float(x); + float bx = beta * xf; + float out = (bx > threshold) ? xf : log1pf(expf(bx)) / beta; + return __float2half(out); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + float bx = beta * xf; + float out = (bx > threshold) ? 
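+            // (editor's note: softplus(x) = log(1 + exp(beta * x)) / beta;
+            //  once beta * x > threshold the result is within rounding of
+            //  x itself, so returning x directly avoids expf() overflow)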
diff --git a/src/infiniop/ops/softplus/metax/softplus_metax.h b/src/infiniop/ops/softplus/metax/softplus_metax.h
index 8da2b4d76..c0d1c7a13 100644
--- a/src/infiniop/ops/softplus/metax/softplus_metax.h
+++ b/src/infiniop/ops/softplus/metax/softplus_metax.h
@@ -1,8 +1,8 @@
 #ifndef __SOFTPLUS_METAX_API_H__
 #define __SOFTPLUS_METAX_API_H__
 
-#include "../../../elementwise/metax/elementwise_metax_api.h"
+#include "../softplus.h"
 
-ELEMENTWISE_DESCRIPTOR(softplus, metax)
+DESCRIPTOR(metax)
 
 #endif // __SOFTPLUS_METAX_API_H__
diff --git a/src/infiniop/ops/softplus/metax/softplus_metax.maca b/src/infiniop/ops/softplus/metax/softplus_metax.maca
index 5744f8c04..16cbd6295 100644
--- a/src/infiniop/ops/softplus/metax/softplus_metax.maca
+++ b/src/infiniop/ops/softplus/metax/softplus_metax.maca
@@ -1,32 +1,186 @@
 #include "softplus_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
 
-#include "../../../elementwise/metax/elementwise_metax.h"
+#include
+#include
+#include
 
-#include "../cuda/kernel.cuh"
+#include
+#include
+#include
+#include
+
+using nv_bfloat16 = __maca_bfloat16;
+using nv_bfloat162 = __maca_bfloat162;
 
 namespace op::softplus::metax {
 
-Descriptor::~Descriptor() = default;
+struct SoftplusOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x, float beta, float threshold) const {
+        if constexpr (std::is_same_v<T, __half>) {
+            float xf = __half2float(x);
+            float bx = beta * xf;
+            float out = (bx > threshold) ? xf : log1pf(expf(bx)) / beta;
+            return __float2half(out);
+        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
+            float xf = __bfloat162float(x);
+            float bx = beta * xf;
+            float out = (bx > threshold) ? xf : log1pf(expf(bx)) / beta;
+            return __float2bfloat16(out);
+        } else {
+            using CalcType = std::conditional_t<std::is_same_v<T, double>, double, float>;
+
+            CalcType x_val = static_cast<CalcType>(x);
+            CalcType b_val = static_cast<CalcType>(beta);
+            CalcType t_val = static_cast<CalcType>(threshold);
+
+            CalcType bx = b_val * x_val;
+
+            if (bx > t_val) {
+                return static_cast<T>(x_val);
+            } else {
+                if constexpr (std::is_same_v<T, double>) {
+                    return static_cast<T>(::log1p(::exp(bx)) / b_val);
+                } else {
+                    return static_cast<T>(::log1pf(::expf(bx)) / b_val);
+                }
+            }
+        }
+    }
+};
+
+static constexpr int MAX_DIMS = 8;
+
+struct TensorMetadata {
+    int ndim;
+    int64_t shape[MAX_DIMS];
+    int64_t strides[MAX_DIMS];
+};
+
+template <typename T>
+__global__ void softplus_kernel_contiguous(
+    T *output,
+    const T *input,
+    size_t n,
+    float beta,
+    float threshold) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        SoftplusOp functor;
+        output[idx] = functor(input[idx], beta, threshold);
+    }
+}
+
+template <typename T>
+__global__ void softplus_kernel_strided(
+    T *output,
+    const T *input,
+    size_t n,
+    float beta,
+    float threshold,
+    TensorMetadata meta) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        size_t input_offset = 0;
+        size_t temp_idx = idx;
+
+#pragma unroll
+        for (int d = meta.ndim - 1; d >= 0; --d) {
+            size_t dim_size = meta.shape[d];
+            size_t coord = temp_idx % dim_size;
+            temp_idx /= dim_size;
+            input_offset += coord * meta.strides[d];
+        }
+
+        SoftplusOp functor;
+        output[idx] = functor(input[input_offset], beta, threshold);
+    }
+}
+
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const SoftplusInfo &info,
+    void *stream) {
+
+    size_t n = info.num_elements();
+    auto mc_stream = reinterpret_cast<mcStream_t>(stream);
+
+    dim3 block(256);
+    dim3 grid((n + block.x - 1) / block.x);
+    if (grid.x == 0) grid.x = 1;
+
+    if (info.is_contiguous()) {
+        softplus_kernel_contiguous<T><<<grid, block, 0, mc_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            n,
+            info.beta(),
+            info.threshold());
+    } else {
+        TensorMetadata meta;
+        meta.ndim = info.ndim();
+
+        const auto &shape_vec = info.shape();
+        const auto &stride_vec = info.strides();
+
+        for (int i = 0; i < meta.ndim && i < MAX_DIMS; ++i) {
+            meta.shape[i] = shape_vec[i];
+            meta.strides[i] = stride_vec[i];
+        }
+
+        softplus_kernel_strided<T><<<grid, block, 0, mc_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            n,
+            info.beta(),
+            info.threshold(),
+            meta);
+    }
+}
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+        _opaque = nullptr;
+    }
+}
 
 infiniStatus_t Descriptor::create(
     infiniopHandle_t handle_,
     Descriptor **desc_ptr,
     infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+    infiniopTensorDescriptor_t input_desc,
+    float beta,
+    float threshold) {
 
     auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
+    auto result = SoftplusInfo::create(out_desc, input_desc, beta, threshold);
+    if (!result) {
+        return result.status();
+    }
 
-    // create METAX elementwise descriptor
-    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0,
+        handle->device,
+        handle->device_id
+    );
 
     return INFINI_STATUS_SUCCESS;
 }
@@ -35,26 +189,30 @@ infiniStatus_t Descriptor::calculate(
     void *workspace,
     size_t workspace_size,
     void *output,
-    std::vector<const void *> inputs,
+    const void *input,
     void *stream) const {
 
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
+    (void)workspace;
+    (void)workspace_size;
 
-    switch (_dtype) {
+    switch (_info.dtype()) {
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::SoftplusOp, half>(_info, workspace, output, inputs, stream);
+        launch_kernel<__half>(output, input, _info, stream);
+        break;
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::SoftplusOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+        launch_kernel<nv_bfloat16>(output, input, _info, stream);
+        break;
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::SoftplusOp, float>(_info, workspace, output, inputs, stream);
+        launch_kernel<float>(output, input, _info, stream);
+        break;
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, cuda::SoftplusOp, double>(_info, workspace, output, inputs, stream);
+        launch_kernel<double>(output, input, _info, stream);
+        break;
     default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
 
     return INFINI_STATUS_SUCCESS;
 }
+
 } // namespace op::softplus::metax
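The strided kernels above recover multi-dimensional coordinates from a flat output index and re-project them through the input strides. A host-side sketch of the same transform (plain C++; names are illustrative):

#include <cassert>
#include <cstdint>

// Map a flat row-major index to an element offset under arbitrary strides,
// exactly as softplus_kernel_strided does per thread.
static int64_t stridedOffset(int64_t idx, int ndim,
                             const int64_t *shape, const int64_t *strides) {
    int64_t offset = 0;
    for (int d = ndim - 1; d >= 0; --d) {
        int64_t coord = idx % shape[d]; // coordinate along dim d
        idx /= shape[d];
        offset += coord * strides[d];   // re-project through input strides
    }
    return offset;
}

int main() {
    // A 2x3 view with strides {1, 2}, i.e. a transposed 3x2 buffer.
    const int64_t shape[] = {2, 3}, strides[] = {1, 2};
    assert(stridedOffset(0, 2, shape, strides) == 0); // (0,0)
    assert(stridedOffset(1, 2, shape, strides) == 2); // (0,1)
    assert(stridedOffset(4, 2, shape, strides) == 3); // (1,1) -> 1*1 + 1*2
}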
diff --git a/src/infiniop/ops/softplus/moore/softplus_moore.h b/src/infiniop/ops/softplus/moore/softplus_moore.h
new file mode 100644
index 000000000..fce1bd895
--- /dev/null
+++ b/src/infiniop/ops/softplus/moore/softplus_moore.h
@@ -0,0 +1,8 @@
+#ifndef __SOFTPLUS_MOORE_H__
+#define __SOFTPLUS_MOORE_H__
+
+#include "../softplus.h"
+
+DESCRIPTOR(moore)
+
+#endif // __SOFTPLUS_MOORE_H__
diff --git a/src/infiniop/ops/softplus/moore/softplus_moore.mu b/src/infiniop/ops/softplus/moore/softplus_moore.mu
new file mode 100644
index 000000000..275637045
--- /dev/null
+++ b/src/infiniop/ops/softplus/moore/softplus_moore.mu
@@ -0,0 +1,181 @@
+#include "softplus_moore.h"
+#include "softplus_moore_kernel.h"
+#include "../../../handle.h"
+#include <musa_runtime.h>
+#include <cstdint>
+#include "../../../devices/moore/moore_handle.h"
+
+namespace op::softplus::moore {
+
+static constexpr int MAX_DIMS = 8;
+
+struct TensorMetadata {
+    int ndim;
+    int64_t shape[MAX_DIMS];
+    int64_t strides[MAX_DIMS];
+};
+
+// ==================================================================
+// Kernel 1: contiguous-memory path
+// ==================================================================
+template <typename T>
+__global__ void softplus_kernel_contiguous(
+    T *output,
+    const T *input,
+    size_t n,
+    float beta,
+    float threshold) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        op::softplus::moore::SoftplusOp functor;
+        output[idx] = functor(input[idx], beta, threshold);
+    }
+}
+
+// ==================================================================
+// Kernel 2: non-contiguous (strided) path
+// ==================================================================
+template <typename T>
+__global__ void softplus_kernel_strided(
+    T *output,
+    const T *input,
+    size_t n,
+    float beta,
+    float threshold,
+    TensorMetadata meta) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        size_t input_offset = 0;
+        size_t temp_idx = idx;
+
+        #pragma unroll
+        for (int d = meta.ndim - 1; d >= 0; --d) {
+            size_t dim_size = meta.shape[d];
+            size_t coord = temp_idx % dim_size;
+            temp_idx /= dim_size;
+            input_offset += coord * meta.strides[d];
+        }
+
+        op::softplus::moore::SoftplusOp functor;
+        output[idx] = functor(input[input_offset], beta, threshold);
+    }
+}
+
+// ==================================================================
+// Launch Logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const SoftplusInfo &info,
+    void *stream) {
+
+    size_t n = info.num_elements();
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+
+    dim3 block(256);
+    dim3 grid((n + block.x - 1) / block.x);
+    if (grid.x == 0) grid.x = 1;
+
+    if (info.is_contiguous()) {
+        softplus_kernel_contiguous<T><<<grid, block, 0, musa_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            n,
+            info.beta(),
+            info.threshold()
+        );
+    }
+    else {
+        TensorMetadata meta;
+        meta.ndim = info.ndim();
+
+        const auto& shape_vec = info.shape();
+        const auto& stride_vec = info.strides();
+
+        for (int i = 0; i < meta.ndim && i < MAX_DIMS; ++i) {
+            meta.shape[i] = shape_vec[i];
+            meta.strides[i] = stride_vec[i];
+        }
+
+        softplus_kernel_strided<T><<<grid, block, 0, musa_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            n,
+            info.beta(),
+            info.threshold(),
+            meta
+        );
+    }
+}
+
+// ==================================================================
+// Descriptor Implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+        _opaque = nullptr;
+    }
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    float beta,
+    float threshold) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+
+    auto result = SoftplusInfo::create(out_desc, input_desc, beta, threshold);
+    if (!result) {
+        return result.status();
+    }
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<__mt_bfloat16>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::softplus::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/softplus/moore/softplus_moore_kernel.h b/src/infiniop/ops/softplus/moore/softplus_moore_kernel.h
new file mode 100644
index 000000000..e1492702e
--- /dev/null
+++ b/src/infiniop/ops/softplus/moore/softplus_moore_kernel.h
@@ -0,0 +1,64 @@
+#ifndef __SOFTPLUS_MOORE_KERNEL_H__
+#define __SOFTPLUS_MOORE_KERNEL_H__
+
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <cmath>
+#include <cstdint>
+#include <type_traits>
+
+namespace op::softplus::moore {
+
+struct SoftplusOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x, float beta, float threshold) const {
+
+        // 1. Half (FP16)
+        if constexpr (std::is_same_v<T, half>) {
+            float xf = __half2float(x);
+            float bx = beta * xf;
+            float out = (bx > threshold) ? xf : ::log1pf(::expf(bx)) / beta;
+            return __float2half(out);
+        }
+        // 2. BFloat16 (BF16)
+        else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            float xf = __bfloat162float(x);
+            float bx = beta * xf;
+            float out = (bx > threshold) ? xf : ::log1pf(::expf(bx)) / beta;
+            return __float2bfloat16(out);
+        }
+        // 3. Half2 (FP16 Vector)
+        else if constexpr (std::is_same_v<T, half2>) {
+            float2 xf = __half22float2(x);
+            float2 out;
+            float bx_x = beta * xf.x;
+            out.x = (bx_x > threshold) ? xf.x : ::log1pf(::expf(bx_x)) / beta;
+            float bx_y = beta * xf.y;
+            out.y = (bx_y > threshold) ? xf.y : ::log1pf(::expf(bx_y)) / beta;
+            return __floats2half2_rn(out.x, out.y);
+        }
+        // 4. Float / Double
+        else {
+            using CalcType = std::conditional_t<std::is_same_v<T, double>, double, float>;
+            CalcType x_val = static_cast<CalcType>(x);
+            CalcType b_val = static_cast<CalcType>(beta);
+            CalcType t_val = static_cast<CalcType>(threshold);
+            CalcType bx = b_val * x_val;
+
+            if (bx > t_val) {
+                return static_cast<T>(x_val);
+            } else {
+                if constexpr (std::is_same_v<T, double>) {
+                    return static_cast<T>(::log1p(::exp(bx)) / b_val);
+                } else {
+                    return static_cast<T>(::log1pf(::expf(bx)) / b_val);
+                }
+            }
+        }
+    }
+};
+
+} // namespace op::softplus::moore
+
+#endif // __SOFTPLUS_MOORE_KERNEL_H__
\ No newline at end of file
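Every backend implements the same piecewise definition: softplus(x) = log1p(exp(beta*x)) / beta, passing x through unchanged once beta*x exceeds the threshold (PyTorch's default threshold is 20), which avoids exp overflow while matching the linear asymptote. A scalar reference sketch (plain C++; the helper name is illustrative):

#include <cassert>
#include <cmath>

// Reference softplus with PyTorch-style linearization above the threshold.
static float softplusRef(float x, float beta = 1.0f, float threshold = 20.0f) {
    float bx = beta * x;
    return (bx > threshold) ? x : std::log1p(std::exp(bx)) / beta;
}

int main() {
    // Above the threshold the pass-through branch is taken; the exact value
    // differs from x only by about exp(-25), far below fp32 resolution.
    assert(std::fabs(softplusRef(25.0f) - 25.0f) < 1e-6f);
    // Near zero, softplus(0) == log(2).
    assert(std::fabs(softplusRef(0.0f) - 0.6931472f) < 1e-6f);
}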
diff --git a/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cu b/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cu
index 392e2dc18..460407cc8 100644
--- a/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cu
+++ b/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cu
@@ -1,31 +1,152 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-
-#include "../cuda/kernel.cuh"
 #include "softplus_nvidia.cuh"
-
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include <cuda_runtime.h>
+#include <cstdint>
+#include "../../../devices/nvidia/nvidia_common.cuh"
 namespace op::softplus::nvidia {
 
-Descriptor::~Descriptor() = default;
+// Maximum supported rank; metadata is passed to the kernel by value.
+static constexpr int MAX_DIMS = 8;
+
+struct TensorMetadata {
+    int ndim;
+    int64_t shape[MAX_DIMS];
+    int64_t strides[MAX_DIMS];
+};
+
+// ==================================================================
+// Kernel 1: contiguous-memory path (fast path)
+// ==================================================================
+template <typename T>
+__global__ void softplus_kernel_contiguous(
+    T *output,
+    const T *input,
+    size_t n,
+    float beta,
+    float threshold) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        op::softplus::cuda::SoftplusOp functor;
+        output[idx] = functor(input[idx], beta, threshold);
+    }
+}
+
+// ==================================================================
+// Kernel 2: non-contiguous (strided) path
+// ==================================================================
+template <typename T>
+__global__ void softplus_kernel_strided(
+    T *output,
+    const T *input,
+    size_t n,
+    float beta,
+    float threshold,
+    TensorMetadata meta) { // metadata passed by value
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        size_t input_offset = 0;
+        size_t temp_idx = idx;
+
+        // Coordinate transform: linear index -> coordinate -> strided offset
+        #pragma unroll
+        for (int d = meta.ndim - 1; d >= 0; --d) {
+            size_t dim_size = meta.shape[d];
+            size_t coord = temp_idx % dim_size;
+            temp_idx /= dim_size;
+            input_offset += coord * meta.strides[d];
+        }
+
+        op::softplus::cuda::SoftplusOp functor;
+        output[idx] = functor(input[input_offset], beta, threshold);
+    }
+}
+
+// ==================================================================
+// Launch Logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const SoftplusInfo &info,
+    void *stream) {
+
+    size_t n = info.num_elements();
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    dim3 block(256);
+    dim3 grid((n + block.x - 1) / block.x);
+    if (grid.x == 0) grid.x = 1;
+
+    if (info.is_contiguous()) {
+        softplus_kernel_contiguous<T><<<grid, block, 0, cuda_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            n,
+            info.beta(),
+            info.threshold()
+        );
+    }
+    else {
+        // Prepare metadata
+        TensorMetadata meta;
+        meta.ndim = info.ndim();
+
+        const auto& shape_vec = info.shape();
+        const auto& stride_vec = info.strides();
+
+        for (int i = 0; i < meta.ndim && i < MAX_DIMS; ++i) {
+            meta.shape[i] = shape_vec[i];
+            meta.strides[i] = stride_vec[i];
+        }
+
+        softplus_kernel_strided<T><<<grid, block, 0, cuda_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            n,
+            info.beta(),
+            info.threshold(),
+            meta
+        );
+    }
+}
+
+// ==================================================================
+// Descriptor Implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+        _opaque = nullptr;
+    }
+}
 
 infiniStatus_t Descriptor::create(
     infiniopHandle_t handle_,
     Descriptor **desc_ptr,
     infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+    infiniopTensorDescriptor_t input_desc,
+    float beta,
+    float threshold) {
 
     auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
+    auto result = SoftplusInfo::create(out_desc, input_desc, beta, threshold);
+    if (!result) {
+        return result.status();
+    }
 
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0,
+        handle->device,
+        handle->device_id
+    );
 
     return INFINI_STATUS_SUCCESS;
 }
@@ -34,26 +155,27 @@ infiniStatus_t Descriptor::calculate(
     void *workspace,
     size_t workspace_size,
     void *output,
-    std::vector<const void *> inputs,
+    const void *input,
     void *stream) const {
 
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
+    switch (_info.dtype()) {
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::SoftplusOp, half>(_info, workspace, output, inputs, stream);
+        launch_kernel<half>(output, input, _info, stream);
+        break;
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::SoftplusOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+        launch_kernel<cuda_bfloat16>(output, input, _info, stream);
+        break;
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::SoftplusOp, float>(_info, workspace, output, inputs, stream);
+        launch_kernel<float>(output, input, _info, stream);
+        break;
    case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, cuda::SoftplusOp, double>(_info, workspace, output, inputs, stream);
+        launch_kernel<double>(output, input, _info, stream);
+        break;
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
 
     return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::softplus::nvidia
+
+} // namespace op::softplus::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cuh b/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cuh
index ef0261eac..d87fc6631 100644
--- a/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cuh
+++ b/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cuh
@@ -1,8 +1,8 @@
-#ifndef __SOFTPLUS_CUDA_API_H__
-#define __SOFTPLUS_CUDA_API_H__
+#ifndef __SOFTPLUS_NVIDIA_CUH__
+#define __SOFTPLUS_NVIDIA_CUH__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+#include "../softplus.h"
 
-ELEMENTWISE_DESCRIPTOR(softplus, nvidia)
+DESCRIPTOR(nvidia)
 
-#endif // __SOFTPLUS_CUDA_API_H__
+#endif // __SOFTPLUS_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/softplus/operator.cc b/src/infiniop/ops/softplus/operator.cc
index 7d09bad6c..146989c70 100644
--- a/src/infiniop/ops/softplus/operator.cc
+++ b/src/infiniop/ops/softplus/operator.cc
@@ -14,12 +14,25 @@
 #ifdef ENABLE_KUNLUN_API
 #include "kunlun/softplus_kunlun.h"
 #endif
+// Moore backend header
+#ifdef ENABLE_MOORE_API
+#include "moore/softplus_moore.h"
+#endif
+
+// =======================================================================
+// [Key fix] Define the descriptor struct.
+// InfiniopSoftplusDescriptor must be defined here, inheriting from
+// InfiniopDescriptor, so the compiler can resolve desc->device_type.
+// =======================================================================
+struct InfiniopSoftplusDescriptor : public InfiniopDescriptor {};
 
 __C infiniStatus_t infiniopCreateSoftplusDescriptor(
     infiniopHandle_t handle,
     infiniopSoftplusDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
+    infiniopTensorDescriptor_t x_desc,
+    float beta,
+    float threshold) {
 
 #define CREATE(CASE, NAMESPACE) \
     case CASE: \
@@ -27,7 +40,9 @@ __C infiniStatus_t infiniopCreateSoftplusDescriptor(
             handle, \
             reinterpret_cast<op::softplus::NAMESPACE::Descriptor **>(desc_ptr), \
             y_desc, \
-            {x_desc})
+            {x_desc}, \
+            beta, \
+            threshold)
 
     switch (handle->device) {
 
@@ -48,6 +63,10 @@ __C infiniStatus_t infiniopCreateSoftplusDescriptor(
 #endif
 #ifdef ENABLE_KUNLUN_API
     CREATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+// Moore branch
+#ifdef ENABLE_MOORE_API
+    CREATE(INFINI_DEVICE_MOORE, moore);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -81,6 +100,10 @@ __C infiniStatus_t infiniopGetSoftplusWorkspaceSize(infiniopSoftplusDescriptor_t
 #endif
 #ifdef ENABLE_KUNLUN_API
     GET(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+// Moore branch
+#ifdef ENABLE_MOORE_API
+    GET(INFINI_DEVICE_MOORE, moore);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -98,8 +121,8 @@ __C infiniStatus_t infiniopSoftplus(
     const void *x,
     void *stream) {
 
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
+#define CALCULATE(CASE, NAMESPACE)  \
+    case CASE:                      \
         return reinterpret_cast<const op::softplus::NAMESPACE::Descriptor *>(desc) \
             ->calculate(workspace, workspace_size, y, {x}, stream)
 
@@ -122,6 +145,10 @@ __C infiniStatus_t infiniopSoftplus(
 #endif
 #ifdef ENABLE_KUNLUN_API
     CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+// Moore branch
+#ifdef ENABLE_MOORE_API
+    CALCULATE(INFINI_DEVICE_MOORE, moore);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -157,10 +184,14 @@ infiniopDestroySoftplusDescriptor(infiniopSoftplusDescriptor_t desc) {
 #endif
 #ifdef ENABLE_KUNLUN_API
     DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+// Moore branch
+#ifdef ENABLE_MOORE_API
+    DELETE(INFINI_DEVICE_MOORE, moore);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
 
 #undef DELETE
-}
+}
\ No newline at end of file
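With beta and threshold now threaded through the C API, a typical caller follows the create/query/run/destroy sequence below. This is a minimal sketch: the CHECK macro and runSoftplus helper are illustrative, and the handle, descriptors, buffers, and stream are assumed to come from the usual infiniop setup calls.

#include <infiniop.h>

// Hypothetical helper: bail out on any non-success status (illustrative only).
#define CHECK(call) do { if ((call) != INFINI_STATUS_SUCCESS) return -1; } while (0)

int runSoftplus(infiniopHandle_t handle,
                infiniopTensorDescriptor_t y_desc,
                infiniopTensorDescriptor_t x_desc,
                void *y, const void *x, void *stream) {
    infiniopSoftplusDescriptor_t desc;
    CHECK(infiniopCreateSoftplusDescriptor(handle, &desc, y_desc, x_desc,
                                           /*beta=*/1.0f, /*threshold=*/20.0f));

    size_t workspace_size = 0;
    CHECK(infiniopGetSoftplusWorkspaceSize(desc, &workspace_size)); // 0 for these backends

    CHECK(infiniopSoftplus(desc, /*workspace=*/nullptr, workspace_size, y, x, stream));
    CHECK(infiniopDestroySoftplusDescriptor(desc));
    return 0;
}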
diff --git a/src/infiniop/ops/softplus/softplus.h b/src/infiniop/ops/softplus/softplus.h
new file mode 100644
index 000000000..37b25baa2
--- /dev/null
+++ b/src/infiniop/ops/softplus/softplus.h
@@ -0,0 +1,49 @@
+#ifndef __SOFTPLUS_H__
+#define __SOFTPLUS_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the matching SoftplusInfo definition
+
+// Macro that generates the Descriptor class inside each backend namespace.
+#define DESCRIPTOR(NAMESPACE)                                      \
+    namespace op::softplus::NAMESPACE {                            \
+    class Descriptor final : public InfiniopDescriptor {           \
+        struct Opaque;                                             \
+        Opaque *_opaque;                                           \
+        SoftplusInfo _info;                                        \
+        size_t _workspace_size;                                    \
+                                                                   \
+        Descriptor(                                                \
+            Opaque *opaque,                                        \
+            SoftplusInfo info,                                     \
+            size_t workspace_size,                                 \
+            infiniDevice_t device_type,                            \
+            int device_id)                                         \
+            : InfiniopDescriptor{device_type, device_id},          \
+              _opaque(opaque),                                     \
+              _info(info),                                         \
+              _workspace_size(workspace_size) {}                   \
+                                                                   \
+    public:                                                        \
+        ~Descriptor();                                             \
+                                                                   \
+        size_t workspaceSize() const { return _workspace_size; }   \
+                                                                   \
+        static infiniStatus_t create(                              \
+            infiniopHandle_t handle,                               \
+            Descriptor **desc_ptr,                                 \
+            infiniopTensorDescriptor_t out_desc,                   \
+            infiniopTensorDescriptor_t input_desc,                 \
+            float beta,                                            \
+            float threshold);                                      \
+                                                                   \
+        infiniStatus_t calculate(                                  \
+            void *workspace,                                       \
+            size_t workspace_size,                                 \
+            void *output,                                          \
+            const void *input,                                     \
+            void *stream) const;                                   \
+    };                                                             \
+    }
+
+#endif // __SOFTPLUS_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/softsign/cpu/softsign_cpu.cc b/src/infiniop/ops/softsign/cpu/softsign_cpu.cc
new file mode 100644
index 000000000..bf1a5f757
--- /dev/null
+++ b/src/infiniop/ops/softsign/cpu/softsign_cpu.cc
@@ -0,0 +1,55 @@
+#include "softsign_cpu.h"
+
+namespace op::softsign::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // Softsign takes a single input.
+    const auto &x_desc = input_desc_vec.at(0);
+
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    // Dispatch on dtype and evaluate SoftsignOp elementwise.
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<SoftsignOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<SoftsignOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<SoftsignOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<SoftsignOp, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<SoftsignOp, int32_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<SoftsignOp, int64_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::softsign::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/softsign/cpu/softsign_cpu.h b/src/infiniop/ops/softsign/cpu/softsign_cpu.h
new file mode 100644
index 000000000..02ada11a3
--- /dev/null
+++ b/src/infiniop/ops/softsign/cpu/softsign_cpu.h
@@ -0,0 +1,21 @@
+#ifndef __SOFTSIGN_CPU_H__
+#define __SOFTSIGN_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <cmath>
+
+ELEMENTWISE_DESCRIPTOR(softsign, cpu)
+
+namespace op::softsign::cpu {
+typedef struct SoftsignOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    // y = x / (1 + |x|)
+    template <typename T>
+    T operator()(const T &x) const {
+        return x / (static_cast<T>(1) + std::abs(x));
+    }
+} SoftsignOp;
+} // namespace op::softsign::cpu
+
+#endif // __SOFTSIGN_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/softsign/cuda/kernel.cuh b/src/infiniop/ops/softsign/cuda/kernel.cuh
new file mode 100644
index 000000000..bc22a2f51
--- /dev/null
+++ b/src/infiniop/ops/softsign/cuda/kernel.cuh
@@ -0,0 +1,42 @@
+#ifndef __SOFTSIGN_CUDA_H__
+#define __SOFTSIGN_CUDA_H__
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cmath>
+#include <type_traits>
+
+namespace op::softsign::cuda {
+
+struct SoftsignOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            const half2 one = __float2half2_rn(1.0f);
+            const half2 abs_x = __habs2(x);
+            const half2 denom = __hadd2(one, abs_x);
+            return __h2div(x, denom);
+        } else if constexpr (std::is_same_v<T, half>) {
+#if __CUDA_ARCH__ >= 530
+            const half one = __float2half(1.0f);
+            return __hdiv(x, __hadd(one, __habs(x)));
+#else
+            return static_cast<half>(static_cast<float>(x) / (1.0f + fabsf(static_cast<float>(x))));
+#endif
+        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
+            // Avoid __habs which is for fp16. Use manual abs or operators to keep bf16 precision.
+            const T abs_x = (x >= T(0.0f)) ? x : -x;
+            return x / (T(1.0f) + abs_x);
+        } else if constexpr (std::is_same_v<T, float>) {
+            return x / (1.0f + fabsf(x));
+        } else {
+            return x / (static_cast<T>(1) + std::abs(x));
+        }
+    }
+};
+
+} // namespace op::softsign::cuda
+
+#endif // __SOFTSIGN_CUDA_H__
\ No newline at end of file
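Softsign, y = x / (1 + |x|), is bounded in (-1, 1) and strictly increasing; the per-dtype branches above all implement this one formula, differing only in the arithmetic type and intrinsics used. A scalar reference sketch (plain C++; the helper name is illustrative):

#include <cassert>
#include <cmath>

// Reference softsign: a smooth, bounded squashing function with no exp/log.
static float softsignRef(float x) { return x / (1.0f + std::fabs(x)); }

int main() {
    assert(softsignRef(0.0f) == 0.0f);
    assert(std::fabs(softsignRef(1.0f) - 0.5f) < 1e-7f);   // 1 / (1 + 1)
    assert(std::fabs(softsignRef(-3.0f) + 0.75f) < 1e-7f); // -3 / (1 + 3)
    assert(std::fabs(softsignRef(1e6f)) < 1.0f);           // never reaches +/-1
}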
diff --git a/src/infiniop/ops/softsign/metax/softsign_metax.h b/src/infiniop/ops/softsign/metax/softsign_metax.h
new file mode 100644
index 000000000..6d4f57a72
--- /dev/null
+++ b/src/infiniop/ops/softsign/metax/softsign_metax.h
@@ -0,0 +1,8 @@
+#ifndef __SOFTSIGN_METAX_API_H__
+#define __SOFTSIGN_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(softsign, metax)
+
+#endif // __SOFTSIGN_METAX_API_H__
diff --git a/src/infiniop/ops/softsign/metax/softsign_metax.maca b/src/infiniop/ops/softsign/metax/softsign_metax.maca
new file mode 100644
index 000000000..0b16252cf
--- /dev/null
+++ b/src/infiniop/ops/softsign/metax/softsign_metax.maca
@@ -0,0 +1,95 @@
+#include "../../../elementwise/metax/elementwise_metax.h"
+#include "softsign_metax.h"
+
+#include
+#include
+#include
+#include
+
+namespace op::softsign::metax {
+
+using nv_bfloat16 = __maca_bfloat16;
+
+struct SoftsignOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, __half>) {
+            float xf = __half2float(x);
+            float res = xf / (1.0f + fabsf(xf));
+            return __float2half(res);
+        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
+            T zero = T(0.0f);
+            T one = T(1.0f);
+            T abs_x = (x >= zero) ? x : -x;
+            return x / (one + abs_x);
+        } else if constexpr (std::is_same_v<T, float>) {
+            return x / (1.0f + fabsf(x));
+        } else if constexpr (std::is_same_v<T, double>) {
+            return x / (1.0 + std::fabs(x));
+        } else {
+            using std::abs;
+            return x / (static_cast<T>(1) + abs(x));
+        }
+    }
+};
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype,
+                INFINI_DTYPE_F16,
+                INFINI_DTYPE_F32,
+                INFINI_DTYPE_BF16,
+                INFINI_DTYPE_F64);
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, SoftsignOp, __half>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, SoftsignOp, nv_bfloat16>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, SoftsignOp, float>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, SoftsignOp, double>(
+            _info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::softsign::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/softsign/moore/softsign_moore.h b/src/infiniop/ops/softsign/moore/softsign_moore.h
new file mode 100644
index 000000000..68652b6ba
--- /dev/null
+++ b/src/infiniop/ops/softsign/moore/softsign_moore.h
@@ -0,0 +1,8 @@
+#ifndef __SOFTSIGN_MOORE_API_H__
+#define __SOFTSIGN_MOORE_API_H__
+
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+
+ELEMENTWISE_DESCRIPTOR(softsign, moore)
+
+#endif // __SOFTSIGN_MOORE_API_H__
diff --git a/src/infiniop/ops/softsign/moore/softsign_moore.mu b/src/infiniop/ops/softsign/moore/softsign_moore.mu
new file mode 100644
index 000000000..99fa85a1b
--- /dev/null
+++ b/src/infiniop/ops/softsign/moore/softsign_moore.mu
@@ -0,0 +1,69 @@
+#include "softsign_moore.h"
+#include "softsign_moore_kernel.h"
+
+#include "../../../elementwise/moore/elementwise_moore.h"
+#include
+#include
+
+namespace op::softsign::moore {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // Softsign is a unary operator; only index 0 is used.
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype,
+                INFINI_DTYPE_F16,
+                INFINI_DTYPE_F32,
+                INFINI_DTYPE_BF16,
+                INFINI_DTYPE_F64);
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create Moore elementwise descriptor
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, SoftsignOp, half>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, SoftsignOp, __mt_bfloat16>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, SoftsignOp, float>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, SoftsignOp, double>(
+            _info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::softsign::moore
diff --git a/src/infiniop/ops/softsign/moore/softsign_moore_kernel.h b/src/infiniop/ops/softsign/moore/softsign_moore_kernel.h
new file mode 100644
index 000000000..d42a81107
--- /dev/null
+++ b/src/infiniop/ops/softsign/moore/softsign_moore_kernel.h
@@ -0,0 +1,141 @@
+#ifndef __SOFTSIGN_MOORE_KERNEL_H__
+#define __SOFTSIGN_MOORE_KERNEL_H__
+
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <cmath>
+#include <cstdint>
+#include <type_traits>
+
+namespace op::softsign::moore {
+
+// ================================================================
+// Type-conversion helpers
+// ================================================================
+template <typename T>
+__device__ __forceinline__ float to_float(T v) {
+    if constexpr (std::is_same_v<T, half>) {
+        return __half2float(v);
+    } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+        return __bfloat162float(v);
+    } else {
+        return static_cast<float>(v);
+    }
+}
+
+template <typename T>
+__device__ __forceinline__ T from_float(float v) {
+    if constexpr (std::is_same_v<T, half>) {
+        return __float2half(v);
+    } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+        return __float2bfloat16(v);
+    } else {
+        return static_cast<T>(v);
+    }
+}
+
+// ================================================================
+// Softsign Functor
+// y = x / (1 + |x|)
+// ================================================================
+struct SoftsignOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        float xf = to_float(x);
+        float res = xf / (1.0f + fabsf(xf));
+        return from_float<T>(res);
+    }
+};
+
+// ================================================================
+// TensorMetadata
+// ================================================================
+static constexpr int MAX_DIMS = 8;
+
+struct TensorMetadata {
+    int ndim;
+    int64_t shape[MAX_DIMS];
+    int64_t strides[MAX_DIMS];
+};
+
+// ================================================================
+// Kernel 1: contiguous memory
+// ================================================================
+template <typename T>
+__global__ void softsign_kernel_contiguous(T* output, const T* input, size_t n) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        SoftsignOp functor;
+        output[idx] = functor(input[idx]);
+    }
+}
+
+// ================================================================
+// Kernel 2: non-contiguous (strided) memory
+// ================================================================
+template <typename T>
+__global__ void softsign_kernel_strided(
+    T* output,
+    const T* input,
+    size_t n,
+    TensorMetadata meta
+) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        size_t offset = 0;
+        size_t t = idx;
+
+#pragma unroll
+        for (int d = meta.ndim - 1; d >= 0; --d) {
+            size_t dim_size = meta.shape[d];
+            size_t coord = t % dim_size;
+            t /= dim_size;
+            offset += coord * meta.strides[d];
+        }
+
+        SoftsignOp functor;
+        output[idx] = functor(input[offset]);
+    }
+}
+
+// ================================================================
+// Launch Kernel
+// ================================================================
+template <typename T>
+void launch_kernel(
+    void* output,
+    const void* input,
+    const TensorMetadata& meta,
+    size_t numel,
+    bool is_contiguous,
+    void* stream
+) {
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+
+    dim3 block(256);
+    dim3 grid((numel + block.x - 1) / block.x);
+    if (grid.x == 0) grid.x = 1;
+
+    if (is_contiguous) {
+        softsign_kernel_contiguous<T><<<grid, block, 0, musa_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            numel
+        );
+    } else {
+        softsign_kernel_strided<T><<<grid, block, 0, musa_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            numel,
+            meta
+        );
+    }
+}
+
+} // namespace op::softsign::moore
+
+#endif // __SOFTSIGN_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/softsign/nvidia/softsign_nvidia.cu b/src/infiniop/ops/softsign/nvidia/softsign_nvidia.cu
new file mode 100644
index 000000000..0a5632877
--- /dev/null
+++ b/src/infiniop/ops/softsign/nvidia/softsign_nvidia.cu
@@ -0,0 +1,56 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "softsign_nvidia.cuh"
+
+namespace op::softsign::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // Softsign is a unary operator; only index 0 is used.
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::SoftsignOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::SoftsignOp, nv_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::SoftsignOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::SoftsignOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::softsign::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/softsign/nvidia/softsign_nvidia.cuh b/src/infiniop/ops/softsign/nvidia/softsign_nvidia.cuh
new file mode 100644
index 000000000..b5b3ad7c8
--- /dev/null
+++ b/src/infiniop/ops/softsign/nvidia/softsign_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __SOFTSIGN_CUDA_API_H__
+#define __SOFTSIGN_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(softsign, nvidia)
+
+#endif // __SOFTSIGN_CUDA_API_H__
\ No newline at end of file
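All launch helpers in this patch size the grid with the usual ceiling division; note that n == 0 would yield a zero-block launch, which is invalid, hence the clamp used on the softplus paths. A host-side sketch of the arithmetic (plain C++; the helper name is illustrative):

#include <cassert>
#include <cstddef>

// Ceiling division used for grid sizing: one block per 256 elements.
static size_t gridSize(size_t n, size_t block = 256) {
    size_t grid = (n + block - 1) / block;
    return grid == 0 ? 1 : grid; // clamp: a 0-block launch is invalid
}

int main() {
    assert(gridSize(1) == 1);
    assert(gridSize(256) == 1);
    assert(gridSize(257) == 2);
    assert(gridSize(0) == 1); // empty tensor still gets a trivial launch
}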
diff --git a/src/infiniop/ops/softsign/operator.cc b/src/infiniop/ops/softsign/operator.cc
new file mode 100644
index 000000000..feadccadb
--- /dev/null
+++ b/src/infiniop/ops/softsign/operator.cc
@@ -0,0 +1,201 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/softsign.h" // must include the public header that declares this API
+
+#ifdef ENABLE_CPU_API
+#include "cpu/softsign_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/softsign_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/softsign_metax.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/softsign_moore.h"
+#endif
+
+// -----------------------------------------------------------------------------
+// Struct Definition
+// -----------------------------------------------------------------------------
+
+// Fix: the device tag uses infiniDevice_t.
+struct InfiniopSoftsignDescriptor {
+    infiniDevice_t device_type;
+};
+
+// -----------------------------------------------------------------------------
+// Create Descriptor
+// -----------------------------------------------------------------------------
+
+__C infiniStatus_t infiniopCreateSoftsignDescriptor(
+    infiniopHandle_t handle,
+    infiniopSoftsignDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+
+// The {x_desc} braces build the single-element input vector.
+#define CREATE(CASE, NAMESPACE)                                                  \
+    case CASE: {                                                                 \
+        auto status = op::softsign::NAMESPACE::Descriptor::create(               \
+            handle,                                                              \
+            reinterpret_cast<op::softsign::NAMESPACE::Descriptor **>(desc_ptr),  \
+            y_desc,                                                              \
+            {x_desc});                                                           \
+        if (status == INFINI_STATUS_SUCCESS) {                                   \
+            (*desc_ptr)->device_type = CASE;                                     \
+        }                                                                        \
+        return status;                                                           \
+    }
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+// -----------------------------------------------------------------------------
+// Get Workspace Size
+// -----------------------------------------------------------------------------
+
+__C infiniStatus_t infiniopGetSoftsignWorkspaceSize(infiniopSoftsignDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                             \
+    case CASE:                                                                           \
+        *size = reinterpret_cast<op::softsign::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+// -----------------------------------------------------------------------------
+// Execute (Calculate)
+// -----------------------------------------------------------------------------
+
+__C infiniStatus_t infiniopSoftsign(
+    infiniopSoftsignDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+// The {x} braces build the single-element input vector.
+#define CALCULATE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                     \
+        return reinterpret_cast<const op::softsign::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, y, {x}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+// -----------------------------------------------------------------------------
+// Destroy Descriptor
+// -----------------------------------------------------------------------------
+
+__C infiniStatus_t
+infiniopDestroySoftsignDescriptor(infiniopSoftsignDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                             \
+    case CASE:                                                              \
+        delete reinterpret_cast<op::softsign::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
\ No newline at end of file
diff --git a/test/infinicore/ops/broadcast_to.py b/test/infinicore/ops/broadcast_to.py
index 0813cf14f..ff129c0fb 100644
--- a/test/infinicore/ops/broadcast_to.py
+++ b/test/infinicore/ops/broadcast_to.py
@@ -64,9 +64,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.broadcast_to(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.broadcast_to(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.broadcast_to(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/huber_loss.py b/test/infinicore/ops/huber_loss.py
index 592695993..8670b7b04 100644
--- a/test/infinicore/ops/huber_loss.py
+++ b/test/infinicore/ops/huber_loss.py
@@ -66,9 +66,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.huber_loss(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.huber_loss(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nn.functional.huber_loss(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/softplus.py b/test/infinicore/ops/softplus.py
index c01c6d632..a74a60974 100644
--- a/test/infinicore/ops/softplus.py
+++ b/test/infinicore/ops/softplus.py
@@ -76,9 +76,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.softplus(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.softplus(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nn.functional.softplus(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/softsign.py b/test/infinicore/ops/softsign.py
index c482e72e5..b55d47539 100644
--- a/test/infinicore/ops/softsign.py
+++ b/test/infinicore/ops/softsign.py
@@ -68,9 +68,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.softsign(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.softsign(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nn.functional.softsign(*args, **kwargs)
 
 
 def main():