diff --git a/include/infinicore/common/hash.hpp b/include/infinicore/common/hash.hpp
index ec930a53f..7d450b3b7 100644
--- a/include/infinicore/common/hash.hpp
+++ b/include/infinicore/common/hash.hpp
@@ -15,6 +15,14 @@ hash_combine(size_t &seed, const T &value) {
 
 // Specialization for Tensor
 inline void hash_combine(size_t &seed, Tensor tensor) {
+    // For an undefined tensor (default-constructed), just mix in a sentinel
+    // value so that optional arguments like weight/pos_weight do not cause
+    // null dereferences when computing cache keys.
+    if (!tensor) {
+        hash_combine(seed, static_cast<size_t>(0));
+        return;
+    }
+
     hash_combine(seed, static_cast<size_t>(tensor->dtype()));
     for (Size shape : tensor->shape()) {
         hash_combine(seed, shape);
diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp
index 0937a4821..25b4e7b8c 100644
--- a/include/infinicore/ops.hpp
+++ b/include/infinicore/ops.hpp
@@ -10,3 +10,8 @@
 #include "ops/rope.hpp"
 #include "ops/silu.hpp"
 #include "ops/swiglu.hpp"
+#include "ops/atanh.hpp"
+#include "ops/addcmul.hpp"
+#include "ops/cdist.hpp"
+#include "ops/reciprocal.hpp"
+#include "ops/binary_cross_entropy_with_logits.hpp"
\ No newline at end of file
diff --git a/include/infinicore/ops/addcmul.hpp b/include/infinicore/ops/addcmul.hpp
new file mode 100644
index 000000000..9597fa0cb
--- /dev/null
+++ b/include/infinicore/ops/addcmul.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Addcmul {
+public:
+    // schema: out, input, t1, t2, value
+    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, float);
+    static void execute(Tensor out, Tensor input, Tensor t1, Tensor t2, float value);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+Tensor addcmul(Tensor input, Tensor t1, Tensor t2, float value);
+void addcmul_(Tensor out, Tensor input, Tensor t1, Tensor t2, float value);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/atanh.hpp b/include/infinicore/ops/atanh.hpp
new file mode 100644
index 000000000..9f75c4d53
--- /dev/null
+++ b/include/infinicore/ops/atanh.hpp
@@ -0,0 +1,34 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Atanh {
+public:
+    // schema: void(output Tensor, input Tensor)
+    using schema = void (*)(Tensor, Tensor);
+
+    // Execution entry point.
+    static void execute(Tensor y, Tensor a);
+
+    // Operator dispatcher used to match backends (CPU/CUDA, etc.).
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+/**
+ * @brief Compute the inverse hyperbolic tangent of a tensor (out-of-place).
+ * @param a Input tensor
+ * @return A new tensor holding the result
+ */
+Tensor atanh(Tensor a);
+
+/**
+ * @brief Compute the inverse hyperbolic tangent into a given output tensor.
+ * @param y Output tensor
+ * @param a Input tensor
+ */
+void atanh_(Tensor y, Tensor a);
+
+} // namespace infinicore::op
\ No newline at end of file
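Each operator in this PR pairs an out-of-place entry point with an explicit-output variant (here `atanh` / `atanh_`). A minimal sketch of how the Python layer added later in this PR calls them — the tensor-creation call is illustrative only, since construction arguments depend on the infinicore build:

```python
import infinicore

x = infinicore.empty((4, 4))  # illustrative creation call
y = infinicore.atanh(x)       # out-of-place: allocates the result
infinicore.atanh(x, out=y)    # explicit output: reuses y's storage
```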
diff --git a/include/infinicore/ops/binary_cross_entropy_with_logits.hpp b/include/infinicore/ops/binary_cross_entropy_with_logits.hpp
new file mode 100644
index 000000000..f291c939e
--- /dev/null
+++ b/include/infinicore/ops/binary_cross_entropy_with_logits.hpp
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <string>
+
+namespace infinicore::op {
+
+class BinaryCrossEntropyWithLogits {
+public:
+    /**
+     * @brief Function signature of the BCEWithLogits operator.
+     * Argument order: out, logits, target, weight, pos_weight, reduction
+     */
+    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, std::string);
+
+    static void execute(Tensor out,
+                        Tensor logits,
+                        Tensor target,
+                        Tensor weight,
+                        Tensor pos_weight,
+                        std::string reduction);
+
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+/**
+ * @brief Out-of-place interface.
+ */
+Tensor binary_cross_entropy_with_logits(Tensor logits,
+                                        Tensor target,
+                                        Tensor weight = {},
+                                        Tensor pos_weight = {},
+                                        std::string reduction = "mean");
+
+/**
+ * @brief Interface with an explicitly provided output tensor.
+ */
+void binary_cross_entropy_with_logits_(Tensor out,
+                                       Tensor logits,
+                                       Tensor target,
+                                       Tensor weight,
+                                       Tensor pos_weight,
+                                       std::string reduction);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/cdist.hpp b/include/infinicore/ops/cdist.hpp
new file mode 100644
index 000000000..a572208e2
--- /dev/null
+++ b/include/infinicore/ops/cdist.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Cdist {
+public:
+    /**
+     * @brief Pairwise distance operator.
+     * schema: out (M, N), x1 (M, D), x2 (N, D), p (norm degree)
+     */
+    using schema = void (*)(Tensor, Tensor, Tensor, double);
+
+    static void execute(Tensor out, Tensor x1, Tensor x2, double p);
+
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+/**
+ * @brief Out-of-place interface.
+ * @return A new tensor of shape (M, N)
+ */
+Tensor cdist(Tensor x1, Tensor x2, double p = 2.0);
+
+/**
+ * @brief Interface with an explicitly provided output tensor.
+ */
+void cdist_(Tensor out, Tensor x1, Tensor x2, double p = 2.0);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/reciprocal.hpp b/include/infinicore/ops/reciprocal.hpp
new file mode 100644
index 000000000..2d91772ad
--- /dev/null
+++ b/include/infinicore/ops/reciprocal.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Reciprocal {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor y, Tensor x);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor reciprocal(Tensor x);
+void reciprocal_(Tensor y, Tensor x);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/tensor.hpp b/include/infinicore/tensor.hpp
index 58a8f59e7..c63d3c82e 100644
--- a/include/infinicore/tensor.hpp
+++ b/include/infinicore/tensor.hpp
@@ -86,6 +86,9 @@ class Tensor {
 
     operator bool() const;
 
+    // Whether this Tensor is defined (holds a valid implementation).
+    bool is_defined() const { return static_cast<bool>(*this); }
+
 protected:
     Tensor(std::shared_ptr<TensorImpl> impl) : impl_(std::move(impl)) {}
     std::shared_ptr<TensorImpl> impl_;
diff --git a/include/infiniop.h b/include/infiniop.h
index 92e6f5963..216ade693 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -30,6 +30,11 @@
 #include "infiniop/ops/topkrouter.h"
 #include "infiniop/ops/topksoftmax.h"
 #include "infiniop/ops/zeros.h"
+#include "infiniop/ops/atanh.h"
+#include "infiniop/ops/addcmul.h"
+#include "infiniop/ops/cdist.h"
+#include "infiniop/ops/binary_cross_entropy_with_logits.h"
+#include "infiniop/ops/reciprocal.h"
 #include "infiniop/tensor_descriptor.h"
 
 #endif // __INFINIOP_API_H__
diff --git a/include/infiniop/ops/addcmul.h b/include/infiniop/ops/addcmul.h
new file mode 100644
index 000000000..d2605c7e8
--- /dev/null
+++ b/include/infiniop/ops/addcmul.h
@@ -0,0 +1,57 @@
+#ifndef __INFINIOP_ADDCMUL_API_H__
+#define __INFINIOP_ADDCMUL_API_H__
+
+#include "../operator_descriptor.h"
+
+// Descriptor type for the addcmul operator.
+typedef struct InfiniopDescriptor *infiniopAddcmulDescriptor_t;
+
+/**
+ * @brief Create an Addcmul operator descriptor.
+ * @param handle Operator handle
+ * @param desc_ptr Receives the created descriptor
+ * @param out Output tensor descriptor
+ * @param input Descriptor of the tensor to be added
+ * @param tensor1 Descriptor of the first multiplicand tensor
+ * @param tensor2 Descriptor of the second multiplicand tensor
+ * @param value Scalar multiplier applied to the product
+ */
+__C __export infiniStatus_t infiniopCreateAddcmulDescriptor(infiniopHandle_t handle,
+                                                            infiniopAddcmulDescriptor_t *desc_ptr,
+                                                            infiniopTensorDescriptor_t out,
+                                                            infiniopTensorDescriptor_t input,
+                                                            infiniopTensorDescriptor_t tensor1,
+                                                            infiniopTensorDescriptor_t tensor2,
+                                                            float value);
+
+/**
+ * @brief Query the workspace size required by Addcmul.
+ */
+__C __export infiniStatus_t infiniopGetAddcmulWorkspaceSize(infiniopAddcmulDescriptor_t desc, size_t *size);
+
+/**
+ * @brief Run the Addcmul computation.
+ * @param desc Operator descriptor
+ * @param workspace Workspace pointer
+ * @param workspace_size Workspace size in bytes
+ * @param out Output data pointer
+ * @param input Data pointer of the added tensor
+ * @param tensor1 Data pointer of the first multiplicand
+ * @param tensor2 Data pointer of the second multiplicand
+ * @param stream Compute stream (e.g. a CUDA stream)
+ */
+__C __export infiniStatus_t infiniopAddcmul(infiniopAddcmulDescriptor_t desc,
+                                            void *workspace,
+                                            size_t workspace_size,
+                                            void *out,
+                                            const void *input,
+                                            const void *tensor1,
+                                            const void *tensor2,
+                                            void *stream);
+
+/**
+ * @brief Destroy the Addcmul operator descriptor.
+ */
+__C __export infiniStatus_t infiniopDestroyAddcmulDescriptor(infiniopAddcmulDescriptor_t desc);
+
+#endif
\ No newline at end of file
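The pybind11 docstring later in this PR spells out the addcmul semantics: `out = input + value * tensor1 * tensor2`, element-wise. A hedged Python-level sketch (the `empty` creation calls are illustrative; dtype/device arguments are omitted):

```python
import infinicore

# addcmul computes, element-wise: out = input + value * tensor1 * tensor2.
a = infinicore.empty((2, 3))   # illustrative creation calls
t1 = infinicore.empty((2, 3))
t2 = infinicore.empty((2, 3))

out = infinicore.addcmul(a, t1, t2, value=0.5)
infinicore.addcmul(a, t1, t2, value=0.5, out=out)  # explicit-output variant
```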
diff --git a/include/infiniop/ops/atanh.h b/include/infiniop/ops/atanh.h
new file mode 100644
index 000000000..021ab6bc1
--- /dev/null
+++ b/include/infiniop/ops/atanh.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_ATANH_API_H__
+#define __INFINIOP_ATANH_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAtanhDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAtanhDescriptor(infiniopHandle_t handle,
+                                                          infiniopAtanhDescriptor_t *desc_ptr,
+                                                          infiniopTensorDescriptor_t y,
+                                                          infiniopTensorDescriptor_t a);
+
+__C __export infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAtanh(infiniopAtanhDescriptor_t desc,
+                                          void *workspace,
+                                          size_t workspace_size,
+                                          void *y,
+                                          const void *a,
+                                          void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/binary_cross_entropy_with_logits.h b/include/infiniop/ops/binary_cross_entropy_with_logits.h
new file mode 100644
index 000000000..90ff7c37b
--- /dev/null
+++ b/include/infiniop/ops/binary_cross_entropy_with_logits.h
@@ -0,0 +1,73 @@
+#ifndef __INFINIOP_BINARY_CROSS_ENTROPY_WITH_LOGITS_API_H__
+#define __INFINIOP_BINARY_CROSS_ENTROPY_WITH_LOGITS_API_H__
+
+#include "../operator_descriptor.h"
+
+// Reduction mode enum.
+typedef enum {
+    INFINIOP_REDUCTION_NONE = 0,
+    INFINIOP_REDUCTION_MEAN = 1,
+    INFINIOP_REDUCTION_SUM = 2
+} infiniopReduction_t;
+
+// Descriptor type for the BCEWithLogits operator.
+typedef struct InfiniopDescriptor *infiniopBCEWithLogitsDescriptor_t;
+
+/**
+ * @brief Create a BCEWithLogits operator descriptor.
+ * @param handle Operator handle
+ * @param desc_ptr Receives the created descriptor
+ * @param out Output tensor descriptor (same shape as the input for 'none';
+ *            a scalar for 'mean'/'sum')
+ * @param logits Input logits tensor descriptor
+ * @param target Target label tensor descriptor
+ * @param weight Per-sample weight descriptor (optional; pass NULL if unused)
+ * @param pos_weight Positive-class weight descriptor (optional; pass NULL if unused)
+ * @param reduction Reduction mode (none, mean, sum)
+ */
+__C __export infiniStatus_t infiniopCreateBCEWithLogitsDescriptor(
+    infiniopHandle_t handle,
+    infiniopBCEWithLogitsDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t out,
+    infiniopTensorDescriptor_t logits,
+    infiniopTensorDescriptor_t target,
+    infiniopTensorDescriptor_t weight,
+    infiniopTensorDescriptor_t pos_weight,
+    infiniopReduction_t reduction);
+
+/**
+ * @brief Query the workspace size required by BCEWithLogits.
+ */
+__C __export infiniStatus_t infiniopGetBCEWithLogitsWorkspaceSize(
+    infiniopBCEWithLogitsDescriptor_t desc,
+    size_t *size);
+
+/**
+ * @brief Run the BCEWithLogits computation.
+ * @param desc Operator descriptor
+ * @param workspace Workspace pointer
+ * @param workspace_size Workspace size in bytes
+ * @param out Output data pointer
+ * @param logits Logits data pointer
+ * @param target Target data pointer
+ * @param weight Weight data pointer (optional; NULL means all-ones weights)
+ * @param pos_weight Positive-class weight data pointer (optional; NULL means all-ones weights)
+ * @param stream Compute stream
+ */
+__C __export infiniStatus_t infiniopBCEWithLogits(
+    infiniopBCEWithLogitsDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *logits,
+    const void *target,
+    const void *weight,
+    const void *pos_weight,
+    void *stream);
+
+/**
+ * @brief Destroy the BCEWithLogits operator descriptor.
+ */
+__C __export infiniStatus_t infiniopDestroyBCEWithLogitsDescriptor(
+    infiniopBCEWithLogitsDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/cdist.h b/include/infiniop/ops/cdist.h
new file mode 100644
index 000000000..f2307d2ab
--- /dev/null
+++ b/include/infiniop/ops/cdist.h
@@ -0,0 +1,56 @@
+#ifndef __INFINIOP_CDIST_API_H__
+#define __INFINIOP_CDIST_API_H__
+
+#include "../operator_descriptor.h"
+
+// Descriptor type for the cdist operator.
+typedef struct InfiniopDescriptor *infiniopCdistDescriptor_t;
+
+/**
+ * @brief Create a Cdist operator descriptor.
+ * @param handle Operator handle
+ * @param desc_ptr Receives the created descriptor
+ * @param y_desc Output tensor descriptor (shape: M x N)
+ * @param x1_desc First input tensor descriptor (shape: M x D)
+ * @param x2_desc Second input tensor descriptor (shape: N x D)
+ * @param p Norm degree (L-p norm)
+ */
+__C __export infiniStatus_t infiniopCreateCdistDescriptor(
+    infiniopHandle_t handle,
+    infiniopCdistDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x1_desc,
+    infiniopTensorDescriptor_t x2_desc,
+    double p);
+
+/**
+ * @brief Query the workspace size required by Cdist.
+ */
+__C __export infiniStatus_t infiniopGetCdistWorkspaceSize(infiniopCdistDescriptor_t desc,
+                                                          size_t *size);
+
+/**
+ * @brief Run the Cdist computation.
+ * @param desc Operator descriptor
+ * @param workspace Workspace pointer
+ * @param workspace_size Workspace size in bytes
+ * @param y Output data pointer
+ * @param x1 First input data pointer
+ * @param x2 Second input data pointer
+ * @param stream Compute stream (e.g. a CUDA stream)
+ */
+__C __export infiniStatus_t infiniopCdist(
+    infiniopCdistDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x1,
+    const void *x2,
+    void *stream);
+
+/**
+ * @brief Destroy the Cdist operator descriptor.
+ */
+__C __export infiniStatus_t infiniopDestroyCdistDescriptor(infiniopCdistDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/reciprocal.h b/include/infiniop/ops/reciprocal.h
new file mode 100644
index 000000000..304c3c754
--- /dev/null
+++ b/include/infiniop/ops/reciprocal.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_RECIPROCAL_API_H__
+#define __INFINIOP_RECIPROCAL_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopReciprocalDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateReciprocalDescriptor(infiniopHandle_t handle,
+                                                               infiniopReciprocalDescriptor_t *desc_ptr,
+                                                               infiniopTensorDescriptor_t y,
+                                                               infiniopTensorDescriptor_t x);
+
+__C __export infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopReciprocal(infiniopReciprocalDescriptor_t desc,
+                                               void *workspace,
+                                               size_t workspace_size,
+                                               void *y,
+                                               const void *x,
+                                               void *stream);
+
+__C __export infiniStatus_t infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc);
+
+#endif
\ No newline at end of file
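As the BCEWithLogits header documents, the output shape depends on the reduction mode: 'none' keeps the element-wise losses, while 'mean' and 'sum' reduce to a 0-dim scalar. A hedged sketch of the corresponding Python-level calls added later in this PR (tensor creation illustrative):

```python
import infinicore
from infinicore.nn import functional as F

logits = infinicore.empty((8, 3))
target = infinicore.empty((8, 3))

per_elem = F.binary_cross_entropy_with_logits(logits, target, reduction="none")  # shape (8, 3)
mean_loss = F.binary_cross_entropy_with_logits(logits, target)                   # "mean" default, 0-dim
sum_loss = F.binary_cross_entropy_with_logits(logits, target, reduction="sum")   # 0-dim
```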
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 7ca962449..431aea92c 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -47,6 +47,11 @@
 from infinicore.ops.rearrange import rearrange
 from infinicore.ops.squeeze import squeeze
 from infinicore.ops.unsqueeze import unsqueeze
+from infinicore.ops.atanh import atanh
+from infinicore.ops.addcmul import addcmul
+from infinicore.ops.cdist import cdist
+from infinicore.ops.binary_cross_entropy_with_logits import binary_cross_entropy_with_logits
+from infinicore.ops.reciprocal import reciprocal
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -108,6 +113,11 @@
     "narrow",
     "squeeze",
     "unsqueeze",
+    "atanh",
+    "addcmul",
+    "cdist",
+    "binary_cross_entropy_with_logits",
+    "reciprocal",
     "rearrange",
     "empty",
     "empty_like",
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..68d498f69 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -6,9 +6,11 @@
 from .rope import RopeAlgo, rope
 from .silu import silu
 from .swiglu import swiglu
+from .binary_cross_entropy_with_logits import binary_cross_entropy_with_logits
 
 __all__ = [
     "causal_softmax",
+    "binary_cross_entropy_with_logits",
     "random_sample",
     "rms_norm",
     "silu",
+ """ + + # Out-of-place API + if out is None: + # Neither weight nor pos_weight + if weight is None and pos_weight is None: + return Tensor( + _infinicore.binary_cross_entropy_with_logits( + input._underlying, + target._underlying, + reduction=reduction, + ) + ) + + # weight provided only + if weight is not None and pos_weight is None: + return Tensor( + _infinicore.binary_cross_entropy_with_logits( + input._underlying, + target._underlying, + weight._underlying, + reduction=reduction, + ) + ) + + # pos_weight provided only + if weight is None and pos_weight is not None: + return Tensor( + _infinicore.binary_cross_entropy_with_logits( + input._underlying, + target._underlying, + pos_weight=pos_weight._underlying, + reduction=reduction, + ) + ) + + # both provided + return Tensor( + _infinicore.binary_cross_entropy_with_logits( + input._underlying, + target._underlying, + weight._underlying, + pos_weight._underlying, + reduction, + ) + ) + + # In-place-style API with explicit out + if weight is None and pos_weight is None: + _infinicore.binary_cross_entropy_with_logits_( + out._underlying, + input._underlying, + target._underlying, + reduction=reduction, + ) + elif weight is not None and pos_weight is None: + _infinicore.binary_cross_entropy_with_logits_( + out._underlying, + input._underlying, + target._underlying, + weight._underlying, + reduction=reduction, + ) + elif weight is None and pos_weight is not None: + _infinicore.binary_cross_entropy_with_logits_( + out._underlying, + input._underlying, + target._underlying, + pos_weight=pos_weight._underlying, + reduction=reduction, + ) + else: + _infinicore.binary_cross_entropy_with_logits_( + out._underlying, + input._underlying, + target._underlying, + weight._underlying, + pos_weight._underlying, + reduction, + ) + + return out diff --git a/python/infinicore/ops/addcmul.py b/python/infinicore/ops/addcmul.py new file mode 100644 index 000000000..11e2feffa --- /dev/null +++ b/python/infinicore/ops/addcmul.py @@ -0,0 +1,24 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def addcmul(input, tensor1, tensor2, value=1.0, *, out=None): + if out is None: + return Tensor( + _infinicore.addcmul( + input._underlying, + tensor1._underlying, + tensor2._underlying, + float(value), + ) + ) + + _infinicore.addcmul_( + out._underlying, + input._underlying, + tensor1._underlying, + tensor2._underlying, + float(value), + ) + + return out \ No newline at end of file diff --git a/python/infinicore/ops/atanh.py b/python/infinicore/ops/atanh.py new file mode 100644 index 000000000..4008e0268 --- /dev/null +++ b/python/infinicore/ops/atanh.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def atanh(input, *, out=None): + if out is None: + return Tensor(_infinicore.atanh(input._underlying)) + + _infinicore.atanh_(out._underlying, input._underlying) + + return out diff --git a/python/infinicore/ops/binary_cross_entropy_with_logits.py b/python/infinicore/ops/binary_cross_entropy_with_logits.py new file mode 100644 index 000000000..c75996548 --- /dev/null +++ b/python/infinicore/ops/binary_cross_entropy_with_logits.py @@ -0,0 +1,47 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def binary_cross_entropy_with_logits( + input, + target, + weight=None, + pos_weight=None, + reduction="mean", + *, + out=None +): + """ + input: Tensor (logits) + target: Tensor (labels) + weight: Tensor (optional, sample-wise weight) + pos_weight: Tensor (optional, 
diff --git a/python/infinicore/ops/cdist.py b/python/infinicore/ops/cdist.py
new file mode 100644
index 000000000..4d3dcee3b
--- /dev/null
+++ b/python/infinicore/ops/cdist.py
@@ -0,0 +1,35 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def cdist(x1, x2, p=2.0, *, out=None):
+    """Compute the p-norm distance between every pair of vectors from two sets.
+
+    Args:
+        x1 (Tensor): Input tensor of shape (M, D).
+        x2 (Tensor): Input tensor of shape (N, D).
+        p (float): Order of the norm (default: 2.0).
+        out (Tensor, optional): Output tensor for the result.
+
+    Returns:
+        Tensor: Distance matrix of shape (M, N).
+    """
+    if out is None:
+        # Out-of-place: the C++ layer infers the output shape from x1 and x2
+        # and allocates a new tensor.
+        return Tensor(
+            _infinicore.cdist(
+                x1._underlying,
+                x2._underlying,
+                float(p),
+            )
+        )
+
+    # Explicit-output path: write the result into the user-provided tensor.
+    _infinicore.cdist_(
+        out._underlying,
+        x1._underlying,
+        x2._underlying,
+        float(p),
+    )
+
+    return out
\ No newline at end of file
diff --git a/python/infinicore/ops/reciprocal.py b/python/infinicore/ops/reciprocal.py
new file mode 100644
index 000000000..989e5dc3a
--- /dev/null
+++ b/python/infinicore/ops/reciprocal.py
@@ -0,0 +1,11 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def reciprocal(input, *, out=None):
+    if out is None:
+        return Tensor(_infinicore.reciprocal(input._underlying))
+
+    _infinicore.reciprocal_(out._underlying, input._underlying)
+
+    return out
\ No newline at end of file
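The cdist wrapper treats `x1` as M vectors and `x2` as N vectors of a shared dimension D, and returns the (M, N) matrix of p-norm distances. A hedged usage sketch (creation calls illustrative):

```python
import infinicore

x1 = infinicore.empty((5, 16))  # M=5 vectors of dimension D=16
x2 = infinicore.empty((7, 16))  # N=7 vectors of the same dimension

d2 = infinicore.cdist(x1, x2)         # L2 distances, shape (5, 7)
d1 = infinicore.cdist(x1, x2, p=1.0)  # L1 (Manhattan) distances
```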
+#include "infinicore/ops/addcmul.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::addcmul_impl::infiniop { + +// 定义线程局部的算子描述符缓存 +thread_local common::OpCache caches( + 100, + [](infiniopAddcmulDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAddcmulDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor input, Tensor t1, Tensor t2, float value) { + size_t seed = hash_combine(out, input, t1, t2, value); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopAddcmulDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateAddcmulDescriptor( + context::getInfiniopHandle(device), &desc, + out->desc(), input->desc(), t1->desc(), t2->desc(), value)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAddcmulWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopAddcmul( + desc, workspace->data(), workspace_size, + out->data(), input->data(), t1->data(), t2->data(), context::getStream())); +} + +static bool registered = []() { + Addcmul::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::addcmul_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/atanh/atanh.cc b/src/infinicore/ops/atanh/atanh.cc new file mode 100644 index 000000000..a03686759 --- /dev/null +++ b/src/infinicore/ops/atanh/atanh.cc @@ -0,0 +1,37 @@ +#include "infinicore/ops/atanh.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +// 获取单例分发器 +common::OpDispatcher &Atanh::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +// 执行入口:负责设备切换和后端查找 +void Atanh::execute(Tensor y, Tensor a) { + // 确保输入和输出在同一个设备上 + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, a); + + // 切换当前上下文到目标设备 + infinicore::context::setDevice(y->device()); + + // 根据设备类型(CPU/CUDA等)查找对应的实现并执行 + dispatcher().lookup(y->device().getType())(y, a); +} + +// Out-of-place 接口:自动创建结果 Tensor +Tensor atanh(Tensor a) { + // 创建一个与输入形状、类型、设备完全相同的空 Tensor + auto y = Tensor::empty(a->shape(), a->dtype(), a->device()); + atanh_(y, a); + return y; +} + +// In-place 或指定输出接口 +void atanh_(Tensor y, Tensor a) { + Atanh::execute(y, a); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/atanh/atanh_infiniop.cc b/src/infinicore/ops/atanh/atanh_infiniop.cc new file mode 100644 index 000000000..00e7f72c3 --- /dev/null +++ b/src/infinicore/ops/atanh/atanh_infiniop.cc @@ -0,0 +1,58 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/atanh.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::atanh_impl::infiniop { + +// 定义线程局部的算子描述符缓存,避免重复创建 Descriptor 带来的开销 +thread_local common::OpCache caches( + 100, // 缓存容量 + [](infiniopAtanhDescriptor_t &desc) { + if (desc != nullptr) { + // 缓存释放时的回调:销毁 infiniop 算子描述符 + INFINICORE_CHECK_ERROR(infiniopDestroyAtanhDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor y, Tensor a) { + // 1. 根据 Tensor 的形状、步长、类型等信息生成唯一 Hash 值 + size_t seed = hash_combine(y, a); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + // 2. 
diff --git a/src/infinicore/ops/atanh/atanh_infiniop.cc b/src/infinicore/ops/atanh/atanh_infiniop.cc
new file mode 100644
index 000000000..00e7f72c3
--- /dev/null
+++ b/src/infinicore/ops/atanh/atanh_infiniop.cc
@@ -0,0 +1,58 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/atanh.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::atanh_impl::infiniop {
+
+// Thread-local descriptor cache; avoids the cost of re-creating descriptors.
+thread_local common::OpCache<size_t, infiniopAtanhDescriptor_t> caches(
+    100, // cache capacity
+    [](infiniopAtanhDescriptor_t &desc) {
+        if (desc != nullptr) {
+            // Eviction callback: destroy the infiniop descriptor.
+            INFINICORE_CHECK_ERROR(infiniopDestroyAtanhDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor y, Tensor a) {
+    // 1. Hash the tensors' shape, stride, and dtype information into a unique key.
+    size_t seed = hash_combine(y, a);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    // 2. Try to fetch an existing descriptor from the cache.
+    auto desc_opt = cache.get(seed);
+    infiniopAtanhDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        // Cache miss: create a new descriptor.
+        INFINICORE_CHECK_ERROR(infiniopCreateAtanhDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            y->desc(), a->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // 3. Query and allocate the required workspace (if any).
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetAtanhWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    // 4. Run the underlying kernel.
+    INFINICORE_CHECK_ERROR(infiniopAtanh(
+        desc, workspace->data(), workspace_size,
+        y->data(), a->data(), context::getStream()));
+}
+
+// 5. Self-registration: register this implementation with the dispatcher at startup.
+static bool registered = []() {
+    Atanh::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::atanh_impl::infiniop
\ No newline at end of file
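Every `*_infiniop.cc` backend in this PR follows the same caching pattern: hash the tensor metadata and scalar attributes into a key, reuse a cached descriptor on a hit, and create and insert one on a miss. A language-agnostic illustration of that flow, written in Python for brevity (the names here are illustrative stand-ins, not the C++ API):

```python
from functools import lru_cache


def make_key(*tensors, **attrs):
    # Mirrors hash_combine: dtype and shape per tensor, plus scalar attributes.
    parts = tuple((t["dtype"], tuple(t["shape"])) for t in tensors)
    return (parts, tuple(sorted(attrs.items())))


@lru_cache(maxsize=100)  # the C++ cache is likewise bounded (capacity 100)
def get_descriptor(key):
    print("cache miss: creating descriptor for", key)
    return object()  # stand-in for infiniopCreate*Descriptor


t = {"dtype": "f32", "shape": [4, 4]}
d1 = get_descriptor(make_key(t, t, p=2.0))
d2 = get_descriptor(make_key(t, t, p=2.0))  # cache hit: no re-creation
assert d1 is d2
```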
diff --git a/src/infinicore/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits.cc b/src/infinicore/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits.cc
new file mode 100644
index 000000000..fe3720525
--- /dev/null
+++ b/src/infinicore/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits.cc
@@ -0,0 +1,63 @@
+#include "infinicore/ops/binary_cross_entropy_with_logits.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+// Static dispatcher instance.
+common::OpDispatcher<BinaryCrossEntropyWithLogits::schema> &BinaryCrossEntropyWithLogits::dispatcher() {
+    static common::OpDispatcher<BinaryCrossEntropyWithLogits::schema> dispatcher_;
+    return dispatcher_;
+}
+
+/**
+ * Core execution: device validation, context setup, and backend dispatch.
+ */
+void BinaryCrossEntropyWithLogits::execute(Tensor out, Tensor logits, Tensor target, Tensor weight, Tensor pos_weight, std::string reduction) {
+    // 1. Check that every defined tensor lives on the same device, including
+    //    the optional weight/pos_weight.
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, logits, target);
+    if (weight.is_defined()) {
+        INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, weight);
+    }
+    if (pos_weight.is_defined()) {
+        INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, pos_weight);
+    }
+
+    // 2. Set the current device context.
+    infinicore::context::setDevice(out->device());
+
+    // 3. Look up and run the backend implementation (CPU, CUDA, ...) for
+    //    this device type.
+    dispatcher().lookup(out->device().getType())(out, logits, target, weight, pos_weight, reduction);
+}
+
+/**
+ * Out-of-place interface: allocates the output tensor according to `reduction`.
+ */
+Tensor binary_cross_entropy_with_logits(Tensor logits, Tensor target, Tensor weight, Tensor pos_weight, std::string reduction) {
+    std::vector<Size> out_shape;
+
+    // 1. Determine the output shape from the reduction mode.
+    if (reduction == "none") {
+        // No reduction: same shape as logits.
+        for (auto dim : logits->shape()) {
+            out_shape.push_back(static_cast<Size>(dim));
+        }
+    } else {
+        // mean/sum reduction: scalar output (an empty shape vector means a
+        // 0-dim tensor).
+        out_shape = {};
+    }
+
+    // 2. Allocate the output tensor.
+    auto out = Tensor::empty(out_shape, logits->dtype(), logits->device());
+
+    // 3. Run through the explicit-output interface.
+    binary_cross_entropy_with_logits_(out, logits, target, weight, pos_weight, reduction);
+
+    return out;
+}
+
+/**
+ * Explicit-output interface.
+ */
+void binary_cross_entropy_with_logits_(Tensor out, Tensor logits, Tensor target, Tensor weight, Tensor pos_weight, std::string reduction) {
+    BinaryCrossEntropyWithLogits::execute(out, logits, target, weight, pos_weight, reduction);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits_infiniop.cc b/src/infinicore/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits_infiniop.cc
new file mode 100644
index 000000000..4403c3c28
--- /dev/null
+++ b/src/infinicore/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits_infiniop.cc
@@ -0,0 +1,99 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/binary_cross_entropy_with_logits.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+#include <stdexcept>
+
+namespace infinicore::op::bce_logits_impl::infiniop {
+
+// Thread-local cache of BCEWithLogits operator descriptors.
+thread_local common::OpCache<size_t, infiniopBCEWithLogitsDescriptor_t> caches(
+    100,
+    [](infiniopBCEWithLogitsDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyBCEWithLogitsDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+/**
+ * @brief Run the BCEWithLogits computation.
+ * @param out Output tensor (a scalar or the same shape as logits, depending on reduction)
+ * @param logits Prediction tensor
+ * @param target Label tensor
+ * @param weight Per-sample weight tensor (optional)
+ * @param pos_weight Positive-class weight tensor (optional)
+ * @param reduction_str Reduction mode ("none", "mean", "sum")
+ */
+void calculate(Tensor out, Tensor logits, Tensor target, Tensor weight, Tensor pos_weight, std::string reduction_str) {
+    // 1. Map the reduction string onto the enum used by the low-level API.
+    infiniopReduction_t reduction;
+    if (reduction_str == "none") {
+        reduction = INFINIOP_REDUCTION_NONE;
+    } else if (reduction_str == "mean") {
+        reduction = INFINIOP_REDUCTION_MEAN;
+    } else if (reduction_str == "sum") {
+        reduction = INFINIOP_REDUCTION_SUM;
+    } else {
+        throw std::runtime_error("Unknown reduction mode: " + reduction_str);
+    }
+
+    // 2. Build the cache key. It covers the state of every input tensor plus
+    //    the reduction mode, so distinct configurations never collide.
+    size_t seed = hash_combine(out, logits, target, weight, pos_weight, static_cast<size_t>(reduction));
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopBCEWithLogitsDescriptor_t desc = nullptr;
+
+    // 3. On a cache miss, create a new descriptor and cache it.
+    if (!desc_opt) {
+        // Descriptors of the optional tensors; nullptr when undefined.
+        auto weight_desc = weight.is_defined() ? weight->desc() : nullptr;
+        auto pos_weight_desc = pos_weight.is_defined() ? pos_weight->desc() : nullptr;
+
+        INFINICORE_CHECK_ERROR(infiniopCreateBCEWithLogitsDescriptor(
+            context::getInfiniopHandle(device),
+            &desc,
+            out->desc(),
+            logits->desc(),
+            target->desc(),
+            weight_desc,
+            pos_weight_desc,
+            reduction));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // 4. Query and allocate the workspace.
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetBCEWithLogitsWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    // 5. Data pointers; optional tensors map to nullptr.
+    const void *weight_ptr = weight.is_defined() ? weight->data() : nullptr;
+    const void *pos_weight_ptr = pos_weight.is_defined() ? pos_weight->data() : nullptr;
+
+    // 6. Run the underlying kernel.
+    INFINICORE_CHECK_ERROR(infiniopBCEWithLogits(
+        desc,
+        workspace->data(),
+        workspace_size,
+        out->data(),
+        logits->data(),
+        target->data(),
+        weight_ptr,
+        pos_weight_ptr,
+        context::getStream()));
+}
+
+// 7. Self-register with the dispatcher.
+static bool registered = []() {
+    BinaryCrossEntropyWithLogits::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::bce_logits_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/ops/cdist/cdist.cc b/src/infinicore/ops/cdist/cdist.cc
new file mode 100644
index 000000000..b351a25d6
--- /dev/null
+++ b/src/infinicore/ops/cdist/cdist.cc
@@ -0,0 +1,57 @@
+#include "infinicore/ops/cdist.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+// Static dispatcher instance.
+common::OpDispatcher<Cdist::schema> &Cdist::dispatcher() {
+    static common::OpDispatcher<Cdist::schema> dispatcher_;
+    return dispatcher_;
+}
+
+/**
+ * Core execution: device validation and backend dispatch.
+ */
+void Cdist::execute(Tensor out, Tensor x1, Tensor x2, double p) {
+    // Check that all three tensors live on the same device.
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, x1, x2);
+
+    // Set the current device context.
+    infinicore::context::setDevice(out->device());
+
+    // Look up and run the implementation registered for this device type
+    // (CUDA/CPU/etc.).
+    dispatcher().lookup(out->device().getType())(out, x1, x2, p);
+}
+
+/**
+ * Out-of-place interface: allocates the output tensor.
+ * x1: (M, D), x2: (N, D) -> out: (M, N)
+ */
+Tensor cdist(Tensor x1, Tensor x2, double p) {
+    auto shape1 = x1->shape(); // (M, D)
+    auto shape2 = x2->shape(); // (N, D)
+
+    std::vector<Size> out_shape = {
+        static_cast<Size>(shape1[0]),
+        static_cast<Size>(shape2[0]),
+    };
+
+    auto out = Tensor::empty(out_shape, x1->dtype(), x1->device());
+
+    cdist_(out, x1, x2, p);
+
+    return out;
+}
+
+/**
+ * Explicit-output interface.
+ */
+void cdist_(Tensor out, Tensor x1, Tensor x2, double p) {
+    Cdist::execute(out, x1, x2, p);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/cdist/cdist_infiniop.cc b/src/infinicore/ops/cdist/cdist_infiniop.cc
new file mode 100644
index 000000000..488f90815
--- /dev/null
+++ b/src/infinicore/ops/cdist/cdist_infiniop.cc
@@ -0,0 +1,66 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/cdist.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::cdist_impl::infiniop {
+
+// Thread-local cache of cdist operator descriptors. The cache key hashes the
+// tensor descriptions together with the parameter p.
+thread_local common::OpCache<size_t, infiniopCdistDescriptor_t> caches(
+    100,
+    [](infiniopCdistDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyCdistDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor out, Tensor x1, Tensor x2, double p) {
+    // 1. Build the cache key.
+    size_t seed = hash_combine(out, x1, x2, p);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopCdistDescriptor_t desc = nullptr;
+
+    // 2. On a cache miss, create a new descriptor and cache it.
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateCdistDescriptor(
+            context::getInfiniopHandle(device),
+            &desc,
+            out->desc(),
+            x1->desc(),
+            x2->desc(),
+            p));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // 3. Query and allocate the workspace.
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetCdistWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    // 4. Run the underlying kernel.
+    INFINICORE_CHECK_ERROR(infiniopCdist(
+        desc,
+        workspace->data(),
+        workspace_size,
+        out->data(),
+        x1->data(),
+        x2->data(),
+        context::getStream()));
+}
+
+// 5. Self-register with the dispatcher.
+static bool registered = []() {
+    Cdist::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::cdist_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/ops/reciprocal/reciprocal.cc b/src/infinicore/ops/reciprocal/reciprocal.cc
new file mode 100644
index 000000000..4ce526fd7
--- /dev/null
+++ b/src/infinicore/ops/reciprocal/reciprocal.cc
@@ -0,0 +1,27 @@
+#include "infinicore/ops/reciprocal.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<Reciprocal::schema> &Reciprocal::dispatcher() {
+    static common::OpDispatcher<Reciprocal::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void Reciprocal::execute(Tensor y, Tensor x) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
+    infinicore::context::setDevice(y->device());
+    dispatcher().lookup(y->device().getType())(y, x);
+}
+
+Tensor reciprocal(Tensor x) {
+    auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
+    reciprocal_(y, x);
+    return y;
+}
+
+void reciprocal_(Tensor y, Tensor x) {
+    Reciprocal::execute(y, x);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/reciprocal/reciprocal_infiniop.cc b/src/infinicore/ops/reciprocal/reciprocal_infiniop.cc
new file mode 100644
index 000000000..47481679b
--- /dev/null
+++ b/src/infinicore/ops/reciprocal/reciprocal_infiniop.cc
@@ -0,0 +1,50 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/reciprocal.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::reciprocal_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopReciprocalDescriptor_t> caches(
+    100, // capacity
+    [](infiniopReciprocalDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyReciprocalDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor y, Tensor x) {
+    size_t seed = hash_combine(y, x);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopReciprocalDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateReciprocalDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            y->desc(), x->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetReciprocalWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopReciprocal(
+        desc, workspace->data(), workspace_size,
+        y->data(), x->data(), context::getStream()));
+}
+
+static bool registered = []() {
+    Reciprocal::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::reciprocal_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/pybind11/infinicore.cc b/src/infinicore/pybind11/infinicore.cc
index 152adefd9..225a4a6f6 100644
--- a/src/infinicore/pybind11/infinicore.cc
+++ b/src/infinicore/pybind11/infinicore.cc
@@ -16,8 +16,8 @@ PYBIND11_MODULE(_infinicore, m) {
     device::bind(m);
     device_event::bind(m);
     dtype::bind(m);
-    ops::bind(m);
     tensor::bind(m);
+    ops::bind(m);
 }
 
 } // namespace infinicore
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 978defa17..b45d21ad0 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -16,6 +16,11 @@
 #include "ops/silu.hpp"
 #include "ops/swiglu.hpp"
+#include "ops/atanh.hpp"
+#include "ops/addcmul.hpp"
+#include "ops/cdist.hpp"
+#include "ops/binary_cross_entropy_with_logits.hpp"
+#include "ops/reciprocal.hpp"
 
 namespace py = pybind11;
 
 namespace infinicore::ops {
@@ -34,6 +39,11 @@ inline void bind(py::module &m) {
     bind_swiglu(m);
     bind_rope(m);
     bind_embedding(m);
+    bind_atanh(m);
+    bind_addcmul(m);
+    bind_cdist(m);
+    bind_binary_cross_entropy_with_logits(m);
+    bind_reciprocal(m);
 }
 
 } // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/addcmul.hpp b/src/infinicore/pybind11/ops/addcmul.hpp
new file mode 100644
index 000000000..e46ba81ec
--- /dev/null
+++ b/src/infinicore/pybind11/ops/addcmul.hpp
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/addcmul.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_addcmul(py::module &m) {
+    // Bind the out-of-place interface: out = addcmul(input, t1, t2, value)
+    m.def("addcmul",
+          &op::addcmul,
+          py::arg("input"),
+          py::arg("tensor1"),
+          py::arg("tensor2"),
+          py::arg("value") = 1.0f,
+          R"doc(Performs the element-wise multiplication of tensor1 by tensor2,
+multiplies the result by value and adds it to input.
+
+Args:
+    input: Tensor to be added
+    tensor1: First tensor for multiplication
+    tensor2: Second tensor for multiplication
+    value: Scalar multiplier for tensor1 * tensor2 (default: 1.0)
+
+Returns:
+    The output tensor
+)doc");
+
+    // Bind the explicit-output interface: addcmul_(out, input, t1, t2, value)
+    m.def("addcmul_",
+          &op::addcmul_,
+          py::arg("out"),
+          py::arg("input"),
+          py::arg("tensor1"),
+          py::arg("tensor2"),
+          py::arg("value") = 1.0f,
+          R"doc(In-place version of addcmul.
+
+Args:
+    out: The destination tensor to store the result
+    input: Tensor to be added
+    tensor1: First tensor for multiplication
+    tensor2: Second tensor for multiplication
+    value: Scalar multiplier for tensor1 * tensor2 (default: 1.0)
+)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/atanh.hpp b/src/infinicore/pybind11/ops/atanh.hpp
new file mode 100644
index 000000000..9db33def7
--- /dev/null
+++ b/src/infinicore/pybind11/ops/atanh.hpp
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/atanh.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_atanh(py::module &m) {
+    m.def("atanh",
+          &op::atanh,
+          py::arg("a"),
+          R"doc(Inverse hyperbolic tangent of a tensor.)doc");
+
+    m.def("atanh_",
+          &op::atanh_,
+          py::arg("y"),
+          py::arg("a"),
+          R"doc(Compute inverse hyperbolic tangent and store in the provided output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/binary_cross_entropy_with_logits.hpp b/src/infinicore/pybind11/ops/binary_cross_entropy_with_logits.hpp
new file mode 100644
index 000000000..25ddd802d
--- /dev/null
+++ b/src/infinicore/pybind11/ops/binary_cross_entropy_with_logits.hpp
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/binary_cross_entropy_with_logits.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_binary_cross_entropy_with_logits(py::module &m) {
+    // Bind the out-of-place interface: out = binary_cross_entropy_with_logits(...)
+    m.def("binary_cross_entropy_with_logits",
+          &op::binary_cross_entropy_with_logits,
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("weight") = Tensor(),     // defaults to an undefined Tensor
+          py::arg("pos_weight") = Tensor(), // defaults to an undefined Tensor
+          py::arg("reduction") = "mean",    // default reduction: mean
+          R"doc(Measures Binary Cross Entropy between target and output logits.
+
+Args:
+    input: Tensor of arbitrary shape as unnormalized scores (logits).
+    target: Tensor of the same shape as input with values between 0 and 1.
+    weight: Optional rescaling weight for each loss component.
+    pos_weight: Optional weight for positive examples (must be broadcastable).
+    reduction: Specifies the reduction to apply: 'none' | 'mean' | 'sum'.
+
+Returns:
+    A tensor representing the loss.
+)doc");
+
+    // Bind the explicit-output interface: binary_cross_entropy_with_logits_(out, ...)
+    m.def("binary_cross_entropy_with_logits_",
+          &op::binary_cross_entropy_with_logits_,
+          py::arg("out"),
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("weight") = Tensor(),
+          py::arg("pos_weight") = Tensor(),
+          py::arg("reduction") = "mean",
+          R"doc(Explicit-output version of binary_cross_entropy_with_logits.
+
+Args:
+    out: The destination tensor to store the loss.
+    input: Logits tensor.
+    target: Target tensor.
+    weight: Optional sample weight.
+    pos_weight: Optional positive class weight.
+    reduction: Specifies the reduction to apply.
+)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/cdist.hpp b/src/infinicore/pybind11/ops/cdist.hpp
new file mode 100644
index 000000000..78ef18529
--- /dev/null
+++ b/src/infinicore/pybind11/ops/cdist.hpp
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/cdist.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_cdist(py::module &m) {
+    // Bind the out-of-place interface: out = cdist(x1, x2, p)
+    m.def("cdist",
+          &op::cdist,
+          py::arg("x1"),
+          py::arg("x2"),
+          py::arg("p") = 2.0,
+          R"doc(Computes batched pairwise distance between vectors in x1 and x2 using the p-norm.
+
+Args:
+    x1: First set of vectors, shape (M, D)
+    x2: Second set of vectors, shape (N, D)
+    p: The p-norm to apply (default: 2.0)
+
+Returns:
+    A matrix containing pairwise distances, shape (M, N)
+)doc");
+
+    // Bind the explicit-output interface: cdist_(out, x1, x2, p)
+    m.def("cdist_",
+          &op::cdist_,
+          py::arg("out"),
+          py::arg("x1"),
+          py::arg("x2"),
+          py::arg("p") = 2.0,
+          R"doc(Explicit-output version of cdist. Stores the results in the 'out' tensor.
+
+Args:
+    out: The destination tensor, shape (M, N)
+    x1: First set of vectors, shape (M, D)
+    x2: Second set of vectors, shape (N, D)
+    p: The p-norm to apply (default: 2.0)
+)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/reciprocal.hpp b/src/infinicore/pybind11/ops/reciprocal.hpp
new file mode 100644
index 000000000..dec6472cc
--- /dev/null
+++ b/src/infinicore/pybind11/ops/reciprocal.hpp
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/reciprocal.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_reciprocal(py::module &m) {
+    m.def("reciprocal",
+          &op::reciprocal,
+          py::arg("x"),
+          R"doc(Computes the reciprocal of the input tensor.)doc");
+
+    m.def("reciprocal_",
+          &op::reciprocal_,
+          py::arg("y"),
+          py::arg("x"),
+          R"doc(Computes the reciprocal of the input tensor and stores it in the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp
index 12469d780..2490a8291 100644
--- a/src/infiniop-test/include/ops.hpp
+++ b/src/infiniop-test/include/ops.hpp
@@ -13,6 +13,11 @@ DECLARE_INFINIOP_TEST(rope)
 DECLARE_INFINIOP_TEST(clip)
 DECLARE_INFINIOP_TEST(swiglu)
 DECLARE_INFINIOP_TEST(add)
+DECLARE_INFINIOP_TEST(atanh)
+DECLARE_INFINIOP_TEST(addcmul)
+DECLARE_INFINIOP_TEST(cdist)
+DECLARE_INFINIOP_TEST(binary_cross_entropy_with_logits)
+DECLARE_INFINIOP_TEST(reciprocal)
 DECLARE_INFINIOP_TEST(causal_softmax)
 DECLARE_INFINIOP_TEST(rearrange)
 DECLARE_INFINIOP_TEST(silu)
diff --git a/src/infiniop-test/src/ops/addcmul.cpp b/src/infiniop-test/src/ops/addcmul.cpp
new file mode 100644
index 000000000..7b1b40ea9
--- /dev/null
+++ b/src/infiniop-test/src/ops/addcmul.cpp
@@ -0,0 +1,143 @@
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <sstream>
+
+namespace infiniop_test::addcmul {
+
+struct Test::Attributes {
+    std::shared_ptr<Tensor> input;
+    std::shared_ptr<Tensor> t1;
+    std::shared_ptr<Tensor> t2;
+    std::shared_ptr<Tensor> out;
+    std::shared_ptr<Tensor> ans;
+    float value;
+};
+
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
+    test->_attributes = new Attributes();
+
+    // Check that all required tensors are present.
+    if (tensors.find("input") == tensors.end() ||
+        tensors.find("t1") == tensors.end() ||
+        tensors.find("t2") == tensors.end() ||
+        tensors.find("out") == tensors.end() ||
+        tensors.find("ans") == tensors.end()) {
+        throw std::runtime_error("Invalid Addcmul Test: Missing tensors");
+    }
+
+    // Read the scalar attribute `value`.
+    test->_attributes->value = 1.0f; // default
+    if (attributes.find("value") != attributes.end()) {
+        test->_attributes->value = *reinterpret_cast<float *>(attributes["value"].data());
+    }
+
+    test->_attributes->input = tensors["input"];
+    test->_attributes->t1 = tensors["t1"];
+    test->_attributes->t2 = tensors["t2"];
+    test->_attributes->out = tensors["out"];
+    test->_attributes->ans = tensors["ans"];
+
+    return test;
+}
+
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+
+    infiniopAddcmulDescriptor_t op_desc;
+
+    // Move data to the target device.
+    auto input = _attributes->input->to(device, device_id);
+    auto t1 = _attributes->t1->to(device, device_id);
+    auto t2 = _attributes->t2->to(device, device_id);
+    auto out = _attributes->out->to(device, device_id);
+
+    // Create the operator descriptor.
+    CHECK_OR(infiniopCreateAddcmulDescriptor(handle, &op_desc,
+                                             out->desc(),
+                                             input->desc(),
+                                             t1->desc(),
+                                             t2->desc(),
+                                             _attributes->value),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create addcmul descriptor."));
+
+    // Workspace handling.
+    size_t workspace_size;
+    CHECK_OR(infiniopGetAddcmulWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+
+    void *workspace;
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+
+    // Run the computation.
+    CHECK_OR(infiniopAddcmul(op_desc, workspace, workspace_size,
+                             out->data(),
+                             input->data(),
+                             t1->data(),
+                             t2->data(),
+                             nullptr), // stream
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+
+    // Verify the result.
+    try {
+        allClose(out, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    // Benchmark.
+    double elapsed_time = benchmark(
+        [=]() {
+            infiniopAddcmul(op_desc, workspace, workspace_size,
+                            out->data(),
+                            input->data(),
+                            t1->data(),
+                            t2->data(),
+                            nullptr);
+        },
+        warm_ups, iterations);
+
+    // Clean up.
+    infinirtFree(workspace);
+    infiniopDestroyAddcmulDescriptor(op_desc);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {"value"};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"input", "t1", "t2", "out", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"out"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- value: " << _attributes->value << std::endl;
+    oss << "- input: " << _attributes->input->info() << std::endl;
+    oss << "- t1: " << _attributes->t1->info() << std::endl;
+    oss << "- t2: " << _attributes->t2->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::addcmul
\ No newline at end of file
diff --git a/src/infiniop-test/src/ops/atanh.cpp b/src/infiniop-test/src/ops/atanh.cpp
new file mode 100644
index 000000000..35b647814
--- /dev/null
+++ b/src/infiniop-test/src/ops/atanh.cpp
@@ -0,0 +1,116 @@
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <sstream>
+
+namespace infiniop_test::atanh {
+struct Test::Attributes {
+    std::shared_ptr<Tensor> a;   // input
+    std::shared_ptr<Tensor> y;   // output
+    std::shared_ptr<Tensor> ans; // reference result
+};
+
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
+    test->_attributes = new Attributes();
+
+    // atanh only needs a (input), y (output), and ans (reference).
+    if (tensors.find("a") == tensors.end()
+        || tensors.find("y") == tensors.end()
+        || tensors.find("ans") == tensors.end()) {
+        throw std::runtime_error("Invalid Atanh Test: Missing tensors.");
+    }
+
+    test->_attributes->a = tensors["a"];
+    test->_attributes->y = tensors["y"];
+    test->_attributes->ans = tensors["ans"];
+
+    return test;
+}
+
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+
+    infiniopAtanhDescriptor_t op_desc;
+    auto a = _attributes->a->to(device, device_id);
+    auto y = _attributes->y->to(device, device_id);
+
+    // Create the descriptor; the interface takes (handle, desc, y, a).
+    CHECK_OR(infiniopCreateAtanhDescriptor(handle, &op_desc,
+                                           y->desc(),
+                                           a->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create atanh descriptor."));
+
+    size_t workspace_size;
+    CHECK_OR(infiniopGetAtanhWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+
+    void *workspace;
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+
+    // Run the computation.
+    CHECK_OR(infiniopAtanh(op_desc, workspace, workspace_size,
+                           y->data(),
+                           a->data(),
+                           nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during atanh execution."));
+
+    // Verify the result.
+    try {
+        allClose(y, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    // Benchmark.
+    double elapsed_time = benchmark(
+        [=]() {
+            infiniopAtanh(
+                op_desc, workspace, workspace_size,
+                y->data(),
+                a->data(),
+                nullptr);
+        },
+        warm_ups, iterations);
+
+    // Clean up, matching the other operator tests.
+    infinirtFree(workspace);
+    infiniopDestroyAtanhDescriptor(op_desc);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"a", "y", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"y"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- a: " << _attributes->a->info() << std::endl;
+    oss << "- y: " << _attributes->y->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::atanh
\ No newline at end of file
tensors["weight"] : nullptr; + test->_attributes->pos_weight = tensors.count("pos_weight") ? tensors["pos_weight"] : nullptr; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + + infiniopBCEWithLogitsDescriptor_t op_desc; + + // 4. 数据迁移 + auto logits = _attributes->logits->to(device, device_id); + auto target = _attributes->target->to(device, device_id); + auto out = _attributes->out->to(device, device_id); + + // 处理可选张量迁移 + std::shared_ptr weight = (_attributes->weight) ? _attributes->weight->to(device, device_id) : nullptr; + std::shared_ptr pos_weight = (_attributes->pos_weight) ? _attributes->pos_weight->to(device, device_id) : nullptr; + + // 5. 创建描述符 (注意处理 NULL 描述符) + auto w_desc = weight ? weight->desc() : nullptr; + auto pw_desc = pos_weight ? pos_weight->desc() : nullptr; + + CHECK_OR(infiniopCreateBCEWithLogitsDescriptor(handle, &op_desc, + out->desc(), + logits->desc(), + target->desc(), + w_desc, + pw_desc, + static_cast(_attributes->reduction)), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create BCE descriptor.")); + + // 6. Workspace 管理 + size_t workspace_size; + CHECK_OR(infiniopGetBCEWithLogitsWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + // 7. 执行计算 + auto w_data = weight ? weight->data() : nullptr; + auto pw_data = pos_weight ? pos_weight->data() : nullptr; + + CHECK_OR(infiniopBCEWithLogits(op_desc, workspace, workspace_size, + out->data(), + logits->data(), + target->data(), + w_data, + pw_data, + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + // 8. 结果验证 + try { + allClose(out, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 9. 性能 Benchmark + double elapsed_time = benchmark( + [=]() { + infiniopBCEWithLogits(op_desc, workspace, workspace_size, + out->data(), logits->data(), target->data(), + w_data, pw_data, nullptr); + }, + warm_ups, iterations); + + // 10. 
资源清理 + infinirtFree(workspace); + infiniopDestroyBCEWithLogitsDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"reduction"}; +} + +std::vector Test::tensor_names() { + return {"logits", "target", "weight", "pos_weight", "out", "ans"}; +} + +std::vector Test::output_names() { + return {"out"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- reduction: " << _attributes->reduction << std::endl; + oss << "- logits: " << _attributes->logits->info() << std::endl; + if (_attributes->weight) oss << "- weight: " << _attributes->weight->info() << std::endl; + oss << "- out: " << _attributes->out->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::binary_cross_entropy_with_logits \ No newline at end of file diff --git a/src/infiniop-test/src/ops/cdist.cpp b/src/infiniop-test/src/ops/cdist.cpp new file mode 100644 index 000000000..3ffb44fb5 --- /dev/null +++ b/src/infiniop-test/src/ops/cdist.cpp @@ -0,0 +1,135 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cdist { + +struct Test::Attributes { + std::shared_ptr x1; + std::shared_ptr x2; + std::shared_ptr out; + std::shared_ptr ans; + double p; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + // 1. 校验张量是否存在 (x1, x2, out, ans) + if (tensors.find("x1") == tensors.end() || + tensors.find("x2") == tensors.end() || + tensors.find("out") == tensors.end() || + tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Cdist Test: Missing tensors"); + } + + // 2. 获取标量属性 p (注意 cdist 通常用 double) + test->_attributes->p = 2.0; // 默认值 + if (attributes.find("p") != attributes.end()) { + test->_attributes->p = *reinterpret_cast(attributes["p"].data()); + } + + test->_attributes->x1 = tensors["x1"]; + test->_attributes->x2 = tensors["x2"]; + test->_attributes->out = tensors["out"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + + infiniopCdistDescriptor_t op_desc; + + // 3. 数据迁移至指定设备 (M x D, N x D) + auto x1 = _attributes->x1->to(device, device_id); + auto x2 = _attributes->x2->to(device, device_id); + auto out = _attributes->out->to(device, device_id); + + // 4. 创建算子描述符 + CHECK_OR(infiniopCreateCdistDescriptor(handle, &op_desc, + out->desc(), + x1->desc(), + x2->desc(), + _attributes->p), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create cdist descriptor.")); + + // 5. Workspace 动态内存分配 + size_t workspace_size; + CHECK_OR(infiniopGetCdistWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + // 6. 执行计算 (计算 M x N 距离矩阵) + CHECK_OR(infiniopCdist(op_desc, workspace, workspace_size, + out->data(), + x1->data(), + x2->data(), + nullptr), // stream + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + // 7. 
结果数值验证 + try { + allClose(out, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 8. 性能 Benchmark + double elapsed_time = benchmark( + [=]() { + infiniopCdist(op_desc, workspace, workspace_size, + out->data(), + x1->data(), + x2->data(), + nullptr); + }, + warm_ups, iterations); + + // 9. 资源清理 + infinirtFree(workspace); + infiniopDestroyCdistDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"p"}; +} + +std::vector Test::tensor_names() { + return {"x1", "x2", "out", "ans"}; +} + +std::vector Test::output_names() { + return {"out"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- p: " << _attributes->p << std::endl; + oss << "- x1: " << _attributes->x1->info() << std::endl; + oss << "- x2: " << _attributes->x2->info() << std::endl; + oss << "- out: " << _attributes->out->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::cdist \ No newline at end of file diff --git a/src/infiniop-test/src/ops/reciprocal.cpp b/src/infiniop-test/src/ops/reciprocal.cpp new file mode 100644 index 000000000..a2a095362 --- /dev/null +++ b/src/infiniop-test/src/ops/reciprocal.cpp @@ -0,0 +1,104 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::reciprocal { +struct Test::Attributes { + std::shared_ptr x; + std::shared_ptr y; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("x") == tensors.end() + || tensors.find("y") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->x = tensors["x"]; + test->_attributes->y = tensors["y"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopReciprocalDescriptor_t op_desc; + auto x = _attributes->x->to(device, device_id); + auto y = _attributes->y->to(device, device_id); + CHECK_OR(infiniopCreateReciprocalDescriptor(handle, &op_desc, + y->desc(), + x->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetReciprocalWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopReciprocal(op_desc, workspace, workspace_size, + y->data(), + x->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(y, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopReciprocal( + op_desc, workspace, workspace_size, + y->data(), + x->data(), + nullptr); + }, + warm_ups, iterations); + + return 
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"x", "y", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"y"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- x: " << _attributes->x->info() << std::endl;
+    oss << "- y: " << _attributes->y->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::reciprocal
\ No newline at end of file
diff --git a/src/infiniop/ops/addcmul/cpu/addcmul_cpu.cc b/src/infiniop/ops/addcmul/cpu/addcmul_cpu.cc
new file mode 100644
index 000000000..d79b53136
--- /dev/null
+++ b/src/infiniop/ops/addcmul/cpu/addcmul_cpu.cc
@@ -0,0 +1,61 @@
+#include "addcmul_cpu.h"
+
+namespace op::addcmul::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
+    float value) { // additionally receives the scalar value
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // 1. Type check
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    // 2. Shape check (modeled on atanh; every input must match the output shape)
+    const auto &y_shape = out_desc->shape();
+    for (const auto &in_desc : input_desc_vec) {
+        CHECK_SAME_SHAPE(y_shape, in_desc->shape());
+    }
+
+    // 3. Create the descriptor via the generic elementwise macro, which
+    //    instantiates a Descriptor and assigns it to *desc_ptr
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    // 4. Store the scalar attribute value inside the descriptor
+    (*desc_ptr)->_value = value;
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    // Dispatch on dtype with a switch, as atanh does; the template parameter
+    // is the AddcmulOp functor defined in addcmul_cpu.h
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<AddcmulOp, fp16_t>(_info, output, inputs, stream, _value);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<AddcmulOp, float>(_info, output, inputs, stream, _value);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<AddcmulOp, double>(_info, output, inputs, stream, _value);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<AddcmulOp, bf16_t>(_info, output, inputs, stream, _value);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::addcmul::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/addcmul/cpu/addcmul_cpu.h b/src/infiniop/ops/addcmul/cpu/addcmul_cpu.h
new file mode 100644
index 000000000..ef55fad7e
--- /dev/null
+++ b/src/infiniop/ops/addcmul/cpu/addcmul_cpu.h
@@ -0,0 +1,78 @@
+#ifndef __ADDCMUL_CPU_H__
+#define __ADDCMUL_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <cmath>
+#include <type_traits>
+
+namespace op::addcmul::cpu {
+
+struct AddcmulOp {
+public:
+    // addcmul is a ternary operator: out = input + value * t1 * t2
+    static constexpr size_t num_inputs = 3;
+
+    template <typename T, typename Scalar>
+    T operator()(const T &input, const T &t1, const T &t2, Scalar value) const {
+        // Native floating-point types (float, double) compute directly
+        if constexpr (std::is_floating_point_v<T>) {
+            return input + static_cast<T>(value) * t1 * t2;
+        } else {
+            // fp16/bf16 and similar types are promoted to float to preserve
+            // precision and to handle the scalar multiplication
+            float f_input = static_cast<float>(input);
+            float f_t1 = static_cast<float>(t1);
+            float f_t2 = static_cast<float>(t2);
+            float v = static_cast<float>(value);
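+            // Worked example (illustrative values): input=1.0, t1=2.0, t2=3.0,
+            // value=0.5 gives 1.0f + 0.5f * 2.0f * 3.0f = 4.0f before the cast back to T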
+            return static_cast<T>(f_input + v * f_t1 * f_t2);
+        }
+    }
+};
+
+// Custom CPU-side descriptor for addcmul that carries the extra scalar value
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _dtype;
+    op::elementwise::ElementwiseInfo _info;
+    std::unique_ptr<op::elementwise::cpu::DeviceImpl> _device_info;
+    size_t _workspace_size;
+    float _value; // scalar coefficient
+
+    Descriptor(
+        infiniDtype_t dtype,
+        op::elementwise::ElementwiseInfo info,
+        op::elementwise::cpu::DeviceImpl *device_info,
+        size_t workspace_size,
+        infiniDevice_t device_type,
+        int device_id)
+        : InfiniopDescriptor{device_type, device_id},
+          _dtype(dtype),
+          _info(std::move(info)),
+          _device_info(device_info),
+          _workspace_size(workspace_size),
+          _value(0.0f) {}
+
+public:
+    ~Descriptor();
+
+    size_t workspaceSize() const { return _workspace_size; }
+
+    // additionally receives the scalar value
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t output_desc,
+        std::vector<infiniopTensorDescriptor_t> input_descs,
+        float value);
+
+    infiniStatus_t calculate(
+        void *workspace,
+        size_t workspace_size,
+        void *output,
+        std::vector<const void *> inputs,
+        void *stream) const;
+
+    float getValue() const { return _value; }
+};
+
+} // namespace op::addcmul::cpu
+
+#endif // __ADDCMUL_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/addcmul/cuda/kernel.cuh b/src/infiniop/ops/addcmul/cuda/kernel.cuh
new file mode 100644
index 000000000..af5266a21
--- /dev/null
+++ b/src/infiniop/ops/addcmul/cuda/kernel.cuh
@@ -0,0 +1,46 @@
+#ifndef __ADDCMUL_CUDA_CUH__
+#define __ADDCMUL_CUDA_CUH__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <type_traits>
+
+namespace op::addcmul::cuda {
+
+struct AddcmulOp {
+public:
+    // addcmul is a ternary operator: out = input + value * t1 * t2
+    static constexpr size_t num_inputs = 3;
+
+    template <typename T>
+    __device__ __host__ __forceinline__ T operator()(const T &input, const T &t1, const T &t2, float value) const {
+        float v = value;
+        if constexpr (std::is_same_v<T, half>) {
+            // Promote to float to preserve precision and simplify the scalar multiply
+            float f_input = __half2float(input);
+            float f_t1 = __half2float(t1);
+            float f_t2 = __half2float(t2);
+            return __float2half(f_input + v * f_t1 * f_t2);
+
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float f_input = __bfloat162float(input);
+            float f_t1 = __bfloat162float(t1);
+            float f_t2 = __bfloat162float(t2);
+            return __float2bfloat16(f_input + v * f_t1 * f_t2);
+
+        } else if constexpr (std::is_same_v<T, float>) {
+            return input + v * t1 * t2;
+
+        } else if constexpr (std::is_same_v<T, double>) {
+            return input + static_cast<double>(v) * t1 * t2;
+
+        } else {
+            // Fallback for any remaining types
+            return static_cast<T>(static_cast<float>(input) + v * static_cast<float>(t1) * static_cast<float>(t2));
+        }
+    }
+};
+
+} // namespace op::addcmul::cuda
+
+#endif // __ADDCMUL_CUDA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/addcmul/metax/addcmul_metax.h b/src/infiniop/ops/addcmul/metax/addcmul_metax.h
new file mode 100644
index 000000000..b74014c54
--- /dev/null
+++ b/src/infiniop/ops/addcmul/metax/addcmul_metax.h
@@ -0,0 +1,72 @@
+#ifndef __ADDCMUL_METAX_H__
+#define __ADDCMUL_METAX_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+namespace op::addcmul::metax {
+
+// Custom METAX-side descriptor for addcmul that carries the extra scalar value
+class Descriptor final : public InfiniopDescriptor {
+    // Kept for compatibility with the generic elementwise framework
+    infiniDtype_t _dtype;
+    op::elementwise::ElementwiseInfo _info;
+    std::unique_ptr<op::elementwise::metax::DeviceImpl> _device_info;
+    size_t _workspace_size;
+    float _value; // scalar coefficient
+
+public:
+    // Tensor metadata recorded for the custom kernel
+    static constexpr int MAX_NDIM = 8;
+
+    struct TensorMeta {
+        int ndim;
+        size_t shape[MAX_NDIM];
+        ptrdiff_t strides[MAX_NDIM];
+    };
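+
+    // Illustrative example of the strided addressing these metas enable: with
+    // shape {2, 3} and contiguous strides {3, 1}, linear index 4 maps to
+    // coords (1, 1) and offset 1*3 + 1*1 = 4; a transposed view with strides
+    // {1, 2} maps the same coords to offset 1*1 + 1*2 = 3.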
+ TensorMeta _out_meta{}; + TensorMeta _input_meta{}; + TensorMeta _t1_meta{}; + TensorMeta _t2_meta{}; + size_t _output_size{0}; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(device_info), + _workspace_size(workspace_size), + _value(0.0f) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + // 额外接收 value 参数 + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs, + float value); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; + + float getValue() const { return _value; } +}; + +} // namespace op::addcmul::metax + +#endif // __ADDCMUL_METAX_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/metax/addcmul_metax.maca b/src/infiniop/ops/addcmul/metax/addcmul_metax.maca new file mode 100644 index 000000000..9c8c00b76 --- /dev/null +++ b/src/infiniop/ops/addcmul/metax/addcmul_metax.maca @@ -0,0 +1,179 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "addcmul_metax.h" +#include "addcmul_metax_kernel.h" + +namespace op::addcmul::metax { + +Descriptor::~Descriptor() = default; + +// 将 TensorDescriptor 中的 shape/strides 填充到 TensorMeta 结构中 +static inline infiniStatus_t fill_tensor_meta( + infiniopTensorDescriptor_t desc, + Descriptor::TensorMeta &meta) { + + auto ndim = desc->ndim(); + if (ndim > Descriptor::MAX_NDIM) { + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + meta.ndim = static_cast(ndim); + const auto &shape = desc->shape(); + const auto &strides = desc->strides(); + for (int i = 0; i < meta.ndim; ++i) { + meta.shape[i] = shape[i]; + meta.strides[i] = strides[i]; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float value) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + // 1. 类型检查 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); + + // 2. 形状检查:要求输出与三个输入形状一致(若不支持广播) + const auto &out_shape = out_desc->shape(); + const auto &input_desc = input_desc_vec.at(0); + const auto &t1_desc = input_desc_vec.at(1); + const auto &t2_desc = input_desc_vec.at(2); + CHECK_SAME_SHAPE(out_shape, input_desc->shape()); + CHECK_SAME_SHAPE(out_shape, t1_desc->shape()); + CHECK_SAME_SHAPE(out_shape, t2_desc->shape()); + + // 3. 创建底层的 Elementwise METAX 描述符 + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + // 4. 记录张量元信息和输出元素个数,供自定义 METAX kernel 使用 + auto *desc = *desc_ptr; + desc->_output_size = out_desc->numel(); + + CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta)); + CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta)); + CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta)); + CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta)); + + // 5. 
Store the scalar attribute value inside the descriptor
+    desc->_value = value;
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Custom addcmul METAX kernel: uses the TensorMeta in the Descriptor for generic strided access
+template <typename T>
+INFINIOP_METAX_KERNEL addcmul_kernel(
+    size_t output_size,
+    Descriptor::TensorMeta out_meta,
+    Descriptor::TensorMeta in_meta,
+    Descriptor::TensorMeta t1_meta,
+    Descriptor::TensorMeta t2_meta,
+    T *out,
+    const T *input,
+    const T *t1,
+    const T *t2,
+    float value) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= output_size) {
+        return;
+    }
+
+    // Derive each tensor's offset from the output shape/strides
+    ptrdiff_t out_offset = 0;
+    ptrdiff_t in_offset = 0;
+    ptrdiff_t t1_offset = 0;
+    ptrdiff_t t2_offset = 0;
+
+    size_t linear = idx;
+    for (int dim = out_meta.ndim - 1; dim >= 0; --dim) {
+        size_t dim_size = out_meta.shape[dim];
+        size_t coord = linear % dim_size;
+        linear /= dim_size;
+
+        out_offset += static_cast<ptrdiff_t>(coord) * out_meta.strides[dim];
+        in_offset += static_cast<ptrdiff_t>(coord) * in_meta.strides[dim];
+        t1_offset += static_cast<ptrdiff_t>(coord) * t1_meta.strides[dim];
+        t2_offset += static_cast<ptrdiff_t>(coord) * t2_meta.strides[dim];
+    }
+
+    T in_val = input[in_offset];
+    T t1_val = t1[t1_offset];
+    T t2_val = t2[t2_offset];
+
+    out[out_offset] = op::addcmul::metax::AddcmulOp{}(in_val, t1_val, t2_val, value);
+}
+
+template <typename T>
+static inline infiniStatus_t launch_addcmul_kernel(
+    const Descriptor *desc,
+    void *output,
+    const std::vector<const void *> &inputs,
+    void *stream) {
+
+    size_t output_size = desc->_output_size;
+    if (output_size == 0) {
+        return INFINI_STATUS_SUCCESS;
+    }
+
+    auto *out_ptr = reinterpret_cast<T *>(output);
+    auto *in_ptr = reinterpret_cast<const T *>(inputs.at(0));
+    auto *t1_ptr = reinterpret_cast<const T *>(inputs.at(1));
+    auto *t2_ptr = reinterpret_cast<const T *>(inputs.at(2));
+
+    mcStream_t metax_stream = reinterpret_cast<mcStream_t>(stream);
+
+    constexpr uint32_t BLOCK_SIZE = 256;
+    uint32_t grid = static_cast<uint32_t>((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
+
+    addcmul_kernel<<<grid, BLOCK_SIZE, 0, metax_stream>>>(
+        output_size,
+        desc->_out_meta,
+        desc->_input_meta,
+        desc->_t1_meta,
+        desc->_t2_meta,
+        out_ptr,
+        in_ptr,
+        t1_ptr,
+        t2_ptr,
+        desc->getValue());
+
+    CHECK_METAX(mcGetLastError());
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    // The workspace contents are currently unused; only the size is checked so
+    // the interface semantics stay consistent with the other operators
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // Invoke the custom METAX kernel directly, bypassing the generic elementwise framework
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return launch_addcmul_kernel<half>(this, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return launch_addcmul_kernel<cuda_bfloat16>(this, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return launch_addcmul_kernel<float>(this, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return launch_addcmul_kernel<double>(this, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::addcmul::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/addcmul/metax/addcmul_metax_kernel.h b/src/infiniop/ops/addcmul/metax/addcmul_metax_kernel.h
new file mode 100644
index 000000000..e8992196a
--- /dev/null
+++ b/src/infiniop/ops/addcmul/metax/addcmul_metax_kernel.h
@@ -0,0 +1,48 @@
+#ifndef __ADDCMUL_METAX_KERNEL_H__
+#define __ADDCMUL_METAX_KERNEL_H__
+
+/*
+ * This file contains the Addcmul operation implementation for the METAX backend.
+ * Formula: out = input + value * tensor1 * tensor2 + */ + +namespace op::addcmul::metax { + +typedef struct AddcmulOp { +public: + // 三元算子,输入为 input, tensor1, tensor2 + static constexpr size_t num_inputs = 3; + + template + __device__ __forceinline__ T operator()(const T &in, const T &t1, const T &t2, float value) const { + if constexpr (std::is_same_v) { + // F32 直接使用乘加指令 + return in + value * t1 * t2; + } + else if constexpr (std::is_same_v) { + // F16 提升到 float 计算以防止中间乘法溢出 + float f_in = __half2float(in); + float f_t1 = __half2float(t1); + float f_t2 = __half2float(t2); + return __float2half(f_in + value * f_t1 * f_t2); + } + else if constexpr (std::is_same_v) { + // BF16 同样提升到 float 计算 + float f_in = __bfloat162float(in); + float f_t1 = __bfloat162float(t1); + float f_t2 = __bfloat162float(t2); + return __float2bfloat16_rn(f_in + value * f_t1 * f_t2); + } + else if constexpr (std::is_same_v) { + return in + (double)value * t1 * t2; + } + else { + // 整数类型或其他类型 + return in + static_cast(value) * t1 * t2; + } + } +} AddcmulOp; + +} // namespace op::addcmul::metax + +#endif // __ADDCMUL_METAX_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/moore/addcmul_moore.h b/src/infiniop/ops/addcmul/moore/addcmul_moore.h new file mode 100644 index 000000000..1d74d72d6 --- /dev/null +++ b/src/infiniop/ops/addcmul/moore/addcmul_moore.h @@ -0,0 +1,76 @@ +#ifndef __ADDCMUL_MOORE_H__ +#define __ADDCMUL_MOORE_H__ + +// 1. 切换到 Moore 平台的 Elementwise API +#include "../../../elementwise/moore/elementwise_moore_api.h" + +namespace op::addcmul::moore { + +/** + * 为 addcmul 在 Moore 端自定义 Descriptor + * 保持与 NVIDIA 版本一致的结构,以便于跨平台对齐 + */ +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + // 2. 切换到 Moore 设备的实现指针 + std::unique_ptr _device_info; + size_t _workspace_size; + float _value; // 标量系数 value + +public: + // 摩尔线程 MUSA 同样支持 stride 访问,记录张量元信息 + static constexpr int MAX_NDIM = 8; + + struct TensorMeta { + int ndim; + size_t shape[MAX_NDIM]; + ptrdiff_t strides[MAX_NDIM]; + }; + + TensorMeta _out_meta{}; + TensorMeta _input_meta{}; + TensorMeta _t1_meta{}; + TensorMeta _t2_meta{}; + size_t _output_size{0}; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::moore::DeviceImpl *device_info, // 3. 修改构造函数参数类型 + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(device_info), + _workspace_size(workspace_size), + _value(0.0f) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + // 4. 
保持相同的接口,接收 value 参数 + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs, + float value); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; + + float getValue() const { return _value; } +}; + +} // namespace op::addcmul::moore + +#endif // __ADDCMUL_MOORE_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/moore/addcmul_moore.mu b/src/infiniop/ops/addcmul/moore/addcmul_moore.mu new file mode 100644 index 000000000..37b840c2a --- /dev/null +++ b/src/infiniop/ops/addcmul/moore/addcmul_moore.mu @@ -0,0 +1,159 @@ +#include "../../../elementwise/moore/elementwise_moore.h" +#include "addcmul_moore_kernel.h" +#include "addcmul_moore.h" +#include + +namespace op::addcmul::moore { + +Descriptor::~Descriptor() = default; + +// 1. 填充 TensorMeta,逻辑与 NVIDIA 一致,用于 MUSA Kernel 中的 Strided 寻址 +static inline infiniStatus_t fill_tensor_meta( + infiniopTensorDescriptor_t desc, + Descriptor::TensorMeta &meta) { + + auto ndim = desc->ndim(); + if (ndim > Descriptor::MAX_NDIM) { + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + meta.ndim = static_cast(ndim); + const auto &shape = desc->shape(); + const auto &strides = desc->strides(); + for (int i = 0; i < meta.ndim; ++i) { + meta.shape[i] = shape[i]; + meta.strides[i] = strides[i]; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float value) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + // 类型检查 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); + + // 形状检查 (A, T1, T2 需一致) + const auto &out_shape = out_desc->shape(); + const auto &input_desc = input_desc_vec.at(0); + const auto &t1_desc = input_desc_vec.at(1); + const auto &t2_desc = input_desc_vec.at(2); + CHECK_SAME_SHAPE(out_shape, input_desc->shape()); + CHECK_SAME_SHAPE(out_shape, t1_desc->shape()); + CHECK_SAME_SHAPE(out_shape, t2_desc->shape()); + + // 2. 调用 Moore 平台的描述符创建宏 + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + auto *desc = *desc_ptr; + desc->_output_size = out_desc->numel(); + + // 填充元数据 + CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta)); + CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta)); + CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta)); + CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta)); + + desc->_value = value; + + return INFINI_STATUS_SUCCESS; +} + +// 3. 
MUSA Kernel 实现:逻辑保持一致 +template +__global__ void addcmul_kernel( + size_t output_size, + Descriptor::TensorMeta out_meta, + Descriptor::TensorMeta in_meta, + Descriptor::TensorMeta t1_meta, + Descriptor::TensorMeta t2_meta, + T *out, + const T *input, + const T *t1, + const T *t2, + float value) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= output_size) return; + + ptrdiff_t out_offset = 0, in_offset = 0, t1_offset = 0, t2_offset = 0; + size_t linear = idx; + + // 通用多维索引转偏移逻辑 + for (int dim = out_meta.ndim - 1; dim >= 0; --dim) { + size_t dim_size = out_meta.shape[dim]; + size_t coord = linear % dim_size; + linear /= dim_size; + + out_offset += static_cast(coord) * out_meta.strides[dim]; + in_offset += static_cast(coord) * in_meta.strides[dim]; + t1_offset += static_cast(coord) * t1_meta.strides[dim]; + t2_offset += static_cast(coord) * t2_meta.strides[dim]; + } + + // 调用 Moore 平台定义的 AddcmulOp + out[out_offset] = op::addcmul::moore::AddcmulOp{}(input[in_offset], t1[t1_offset], t2[t2_offset], value); +} + +// 4. 内核启动封装 +template +static inline infiniStatus_t launch_addcmul_kernel( + const Descriptor *desc, + void *output, + const std::vector &inputs, + void *stream) { + + size_t output_size = desc->_output_size; + if (output_size == 0) return INFINI_STATUS_SUCCESS; + + auto *out_ptr = reinterpret_cast(output); + auto *in_ptr = reinterpret_cast(inputs.at(0)); + auto *t1_ptr = reinterpret_cast(inputs.at(1)); + auto *t2_ptr = reinterpret_cast(inputs.at(2)); + + musaStream_t musa_stream = reinterpret_cast(stream); + + constexpr uint32_t BLOCK_SIZE = 256; + uint32_t grid = static_cast((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE); + + addcmul_kernel<<>>( + output_size, desc->_out_meta, desc->_input_meta, desc->_t1_meta, desc->_t2_meta, + out_ptr, in_ptr, t1_ptr, t2_ptr, desc->getValue()); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return launch_addcmul_kernel(this, output, inputs, stream); + case INFINI_DTYPE_BF16: + // 使用 Moore 平台对应的 bf16 类型 + return launch_addcmul_kernel(this, output, inputs, stream); + case INFINI_DTYPE_F32: + return launch_addcmul_kernel(this, output, inputs, stream); + case INFINI_DTYPE_F64: + return launch_addcmul_kernel(this, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} +} // namespace op::addcmul::moore \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/moore/addcmul_moore_kernel.h b/src/infiniop/ops/addcmul/moore/addcmul_moore_kernel.h new file mode 100644 index 000000000..287192947 --- /dev/null +++ b/src/infiniop/ops/addcmul/moore/addcmul_moore_kernel.h @@ -0,0 +1,48 @@ +#ifndef __ADDCMUL_MOORE_KERNEL_H__ +#define __ADDCMUL_MOORE_KERNEL_H__ + +/* + * This file contains the Addcmul operation implementation for the MUSA backend. 
+ * Formula: out = input + value * tensor1 * tensor2 + */ + +namespace op::addcmul::moore { + +typedef struct AddcmulOp { +public: + // 三元算子,输入为 input, tensor1, tensor2 + static constexpr size_t num_inputs = 3; + + template + __device__ __forceinline__ T operator()(const T &in, const T &t1, const T &t2, float value) const { + if constexpr (std::is_same_v) { + // F32 直接使用乘加指令 + return in + value * t1 * t2; + } + else if constexpr (std::is_same_v) { + // F16 提升到 float 计算以防止中间乘法溢出 + float f_in = __half2float(in); + float f_t1 = __half2float(t1); + float f_t2 = __half2float(t2); + return __float2half(f_in + value * f_t1 * f_t2); + } + else if constexpr (std::is_same_v) { + // BF16 同样提升到 float 计算 + float f_in = __bfloat162float(in); + float f_t1 = __bfloat162float(t1); + float f_t2 = __bfloat162float(t2); + return __float2bfloat16_rn(f_in + value * f_t1 * f_t2); + } + else if constexpr (std::is_same_v) { + return in + (double)value * t1 * t2; + } + else { + // 整数类型或其他类型 + return in + static_cast(value) * t1 * t2; + } + } +} AddcmulOp; + +} // namespace op::addcmul::moore + +#endif // __ADDCMUL_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cu b/src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cu new file mode 100644 index 000000000..83922e8f5 --- /dev/null +++ b/src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cu @@ -0,0 +1,178 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "addcmul_nvidia.cuh" + +namespace op::addcmul::nvidia { + +Descriptor::~Descriptor() = default; + +// 将 TensorDescriptor 中的 shape/strides 填充到 TensorMeta 结构中 +static inline infiniStatus_t fill_tensor_meta( + infiniopTensorDescriptor_t desc, + Descriptor::TensorMeta &meta) { + + auto ndim = desc->ndim(); + if (ndim > Descriptor::MAX_NDIM) { + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + meta.ndim = static_cast(ndim); + const auto &shape = desc->shape(); + const auto &strides = desc->strides(); + for (int i = 0; i < meta.ndim; ++i) { + meta.shape[i] = shape[i]; + meta.strides[i] = strides[i]; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float value) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + // 1. 类型检查 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); + + // 2. 形状检查:要求输出与三个输入形状一致(若不支持广播) + const auto &out_shape = out_desc->shape(); + const auto &input_desc = input_desc_vec.at(0); + const auto &t1_desc = input_desc_vec.at(1); + const auto &t2_desc = input_desc_vec.at(2); + CHECK_SAME_SHAPE(out_shape, input_desc->shape()); + CHECK_SAME_SHAPE(out_shape, t1_desc->shape()); + CHECK_SAME_SHAPE(out_shape, t2_desc->shape()); + + // 3. 创建底层的 Elementwise CUDA 描述符 + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + // 4. 记录张量元信息和输出元素个数,供自定义 CUDA kernel 使用 + auto *desc = *desc_ptr; + desc->_output_size = out_desc->numel(); + + CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta)); + CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta)); + CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta)); + CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta)); + + // 5. 
将标量属性 value 存入 Descriptor 内部 + desc->_value = value; + + return INFINI_STATUS_SUCCESS; +} + +// 自定义 addcmul CUDA kernel:使用 Descriptor 中的 TensorMeta 做通用 strided 访问 +template +INFINIOP_CUDA_KERNEL addcmul_kernel( + size_t output_size, + Descriptor::TensorMeta out_meta, + Descriptor::TensorMeta in_meta, + Descriptor::TensorMeta t1_meta, + Descriptor::TensorMeta t2_meta, + T *out, + const T *input, + const T *t1, + const T *t2, + float value) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= output_size) { + return; + } + + // 根据输出 shape/stride 计算各个张量的偏移 + ptrdiff_t out_offset = 0; + ptrdiff_t in_offset = 0; + ptrdiff_t t1_offset = 0; + ptrdiff_t t2_offset = 0; + + size_t linear = idx; + for (int dim = out_meta.ndim - 1; dim >= 0; --dim) { + size_t dim_size = out_meta.shape[dim]; + size_t coord = linear % dim_size; + linear /= dim_size; + + out_offset += static_cast(coord) * out_meta.strides[dim]; + in_offset += static_cast(coord) * in_meta.strides[dim]; + t1_offset += static_cast(coord) * t1_meta.strides[dim]; + t2_offset += static_cast(coord) * t2_meta.strides[dim]; + } + + T in_val = input[in_offset]; + T t1_val = t1[t1_offset]; + T t2_val = t2[t2_offset]; + + out[out_offset] = op::addcmul::cuda::AddcmulOp{}(in_val, t1_val, t2_val, value); +} + +template +static inline infiniStatus_t launch_addcmul_kernel( + const Descriptor *desc, + void *output, + const std::vector &inputs, + void *stream) { + + size_t output_size = desc->_output_size; + if (output_size == 0) { + return INFINI_STATUS_SUCCESS; + } + + auto *out_ptr = reinterpret_cast(output); + auto *in_ptr = reinterpret_cast(inputs.at(0)); + auto *t1_ptr = reinterpret_cast(inputs.at(1)); + auto *t2_ptr = reinterpret_cast(inputs.at(2)); + + cudaStream_t cuda_stream = reinterpret_cast(stream); + + constexpr uint32_t BLOCK_SIZE = 256; + uint32_t grid = static_cast((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE); + + addcmul_kernel<<>>( + output_size, + desc->_out_meta, + desc->_input_meta, + desc->_t1_meta, + desc->_t2_meta, + out_ptr, + in_ptr, + t1_ptr, + t2_ptr, + desc->getValue()); + + CHECK_CUDA(cudaGetLastError()); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // 目前不依赖 workspace 内容,只检查大小是否足够以保持与其他算子一致的接口语义 + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + // 直接调用自定义 CUDA kernel,避免通过通用 elementwise 框架 + switch (_dtype) { + case INFINI_DTYPE_F16: + return launch_addcmul_kernel(this, output, inputs, stream); + case INFINI_DTYPE_BF16: + return launch_addcmul_kernel(this, output, inputs, stream); + case INFINI_DTYPE_F32: + return launch_addcmul_kernel(this, output, inputs, stream); + case INFINI_DTYPE_F64: + return launch_addcmul_kernel(this, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} +} // namespace op::addcmul::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cuh b/src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cuh new file mode 100644 index 000000000..89a001cc0 --- /dev/null +++ b/src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cuh @@ -0,0 +1,72 @@ +#ifndef __ADDCMUL_NVIDIA_H__ +#define __ADDCMUL_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +namespace op::addcmul::nvidia { + +// 为 addcmul 在 NVIDIA 端自定义 Descriptor,支持额外的标量参数 value +class Descriptor final : public InfiniopDescriptor { + // 为保持与通用 Elementwise 
框架的兼容,仍然保留这些成员 + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _value; // 标量系数 value + +public: + // 为自定义 CUDA kernel 记录张量元信息 + static constexpr int MAX_NDIM = 8; + + struct TensorMeta { + int ndim; + size_t shape[MAX_NDIM]; + ptrdiff_t strides[MAX_NDIM]; + }; + + TensorMeta _out_meta{}; + TensorMeta _input_meta{}; + TensorMeta _t1_meta{}; + TensorMeta _t2_meta{}; + size_t _output_size{0}; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(device_info), + _workspace_size(workspace_size), + _value(0.0f) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + // 额外接收 value 参数 + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs, + float value); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; + + float getValue() const { return _value; } +}; + +} // namespace op::addcmul::nvidia + +#endif // __ADDCMUL_NVIDIA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/operator.cc b/src/infiniop/ops/addcmul/operator.cc new file mode 100644 index 000000000..50a1ecc23 --- /dev/null +++ b/src/infiniop/ops/addcmul/operator.cc @@ -0,0 +1,193 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/addcmul.h" + +#ifdef ENABLE_CPU_API +#include "cpu/addcmul_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/addcmul_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/addcmul_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/addcmul_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/addcmul_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/addcmul_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAddcmulDescriptor( + infiniopHandle_t handle, + infiniopAddcmulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t t1_desc, + infiniopTensorDescriptor_t t2_desc, + float value) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::addcmul::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, \ + {input_desc, t1_desc, t2_desc}, \ + value) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetAddcmulWorkspaceSize(infiniopAddcmulDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = 
reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopAddcmul( + infiniopAddcmulDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *input, + const void *t1, + const void *t2, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, out, {input, t1, t2}, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyAddcmulDescriptor(infiniopAddcmulDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc new file mode 100644 index 000000000..c33d0a460 --- /dev/null +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc @@ -0,0 +1,52 @@ +#include "atanh_cpu.h" + +namespace op::atanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + 
CHECK_SAME_SHAPE(y_shape, a_shape);
+
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    // Dispatch on dtype; the template parameter is the AtanhOp defined in atanh_cpu.h
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<AtanhOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<AtanhOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<AtanhOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<AtanhOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::atanh::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.h b/src/infiniop/ops/atanh/cpu/atanh_cpu.h
new file mode 100644
index 000000000..8a55e3afb
--- /dev/null
+++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.h
@@ -0,0 +1,31 @@
+#ifndef __ATANH_CPU_H__
+#define __ATANH_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <cmath>
+#include <type_traits>
+
+// Register the CPU-backend descriptor for the atanh operator
+ELEMENTWISE_DESCRIPTOR(atanh, cpu)
+
+namespace op::atanh::cpu {
+typedef struct AtanhOp {
+public:
+    // atanh is a unary operator
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &a) const {
+        // Natively supported types (float, double) call std::atanh directly
+        if constexpr (std::is_floating_point_v<T>) {
+            return std::atanh(a);
+        } else {
+            // Custom types such as half/bfloat16 are promoted to float and
+            // converted back; they are assumed to support static_cast to float.
+            // std::atanh's float overload is used here (std::atanhf is not portable).
+            return static_cast<T>(std::atanh(static_cast<float>(a)));
+        }
+    }
+} AtanhOp;
+} // namespace op::atanh::cpu
+
+#endif // __ATANH_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/atanh/cuda/kernel.cuh b/src/infiniop/ops/atanh/cuda/kernel.cuh
new file mode 100644
index 000000000..9a98aef63
--- /dev/null
+++ b/src/infiniop/ops/atanh/cuda/kernel.cuh
@@ -0,0 +1,40 @@
+#ifndef __ATANH_CUDA_H__
+#define __ATANH_CUDA_H__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+namespace op::atanh::cuda {
+typedef struct AtanhOp {
+public:
+    // atanh is unary: a single input
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            // Apply atanh to both lanes of a half2
+            float2 f = __half22float2(a);
+            f.x = atanhf(f.x);
+            f.y = atanhf(f.y);
+            return __float22half2_rn(f);
+        } else if constexpr (std::is_same_v<T, half>) {
+            // half: promote to float, compute, convert back
+            return __float2half(atanhf(__half2float(a)));
+        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
+            // bfloat16 is handled the same way
+            return __float2bfloat16(atanhf(__bfloat162float(a)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            // float: call the math library directly
+            return atanhf(a);
+        } else if constexpr (std::is_same_v<T, double>) {
+            return ::atanh(a);
+        } else {
+            // Other types should not reach here in practice; simple fallback
+            return static_cast<T>(atanhf(static_cast<float>(a)));
+        }
+    }
+} AtanhOp;
+} // namespace op::atanh::cuda
+
+#endif // __ATANH_CUDA_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/atanh/metax/atanh_metax.h b/src/infiniop/ops/atanh/metax/atanh_metax.h
new file mode 100644
index 000000000..7c758d7f1
--- /dev/null
+++ b/src/infiniop/ops/atanh/metax/atanh_metax.h
@@ -0,0 +1,8 @@
+#ifndef __ATANH_METAX_API_H__
+#define __ATANH_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(atanh, metax)
+
+#endif // __ATANH_METAX_API_H__
\ No newline at end of file
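For reference, the end-to-end call sequence for the atanh C API added by this patch is sketched below (a minimal sketch: error handling is elided, and the handle, tensor descriptors, and device buffers y_data/a_data are assumed to have been set up elsewhere):

    infiniopAtanhDescriptor_t desc;
    infiniopCreateAtanhDescriptor(handle, &desc, y_desc, a_desc);

    size_t workspace_size;
    infiniopGetAtanhWorkspaceSize(desc, &workspace_size);

    void *workspace;
    infinirtMalloc(&workspace, workspace_size);

    // y = atanh(a), launched on the default (null) stream
    infiniopAtanh(desc, workspace, workspace_size, y_data, a_data, nullptr);

    infinirtFree(workspace);
    infiniopDestroyAtanhDescriptor(desc);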
diff --git a/src/infiniop/ops/atanh/metax/atanh_metax.maca b/src/infiniop/ops/atanh/metax/atanh_metax.maca
new file mode 100644
index 000000000..043463b39
--- /dev/null
+++ b/src/infiniop/ops/atanh/metax/atanh_metax.maca
@@ -0,0 +1,58 @@
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "atanh_metax.h"
+#include "atanh_metax_kernel.h"
+
+namespace op::atanh::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(y_shape, a_shape);
+
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, metax::AtanhOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, metax::AtanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, metax::AtanhOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, metax::AtanhOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::atanh::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/atanh/metax/atanh_metax_kernel.h b/src/infiniop/ops/atanh/metax/atanh_metax_kernel.h
new file mode 100644
index 000000000..d659ef193
--- /dev/null
+++ b/src/infiniop/ops/atanh/metax/atanh_metax_kernel.h
@@ -0,0 +1,47 @@
+#ifndef __ATANH_METAX_KERNEL_H__
+#define __ATANH_METAX_KERNEL_H__
+
+/*
+ * This file contains the Atanh operation implementation for the METAX backend.
+ *
+ * It follows the consistent code structure used across the hardware platforms
+ * supported by InfiniOp, so the backends stay aligned.
+ */ +namespace op::atanh::metax { + +typedef struct AtanhOp { +public: + // 一元算子,输入数量为 1 + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &a) const { + if constexpr (std::is_same_v) { + // 针对 half2 进行并行计算 + float2 f2 = __half22float2(a); + f2.x = atanhf(f2.x); + f2.y = atanhf(f2.y); + return __float22half2_rn(f2); + } else if constexpr (std::is_same_v) { + // 转为 float 计算以保证精度并匹配 MUSA 数学库 + return __float2half(atanhf(__half2float(a))); + } else if constexpr (std::is_same_v) { + // BF16 同样提升到 float 计算,避免转换歧义 + float a_f = __bfloat162float(a); + return __float2bfloat16_rn(atanhf(a_f)); + } else if constexpr (std::is_same_v) { + // 调用 MUSA 内置的单精度反双曲正切函数 + return atanhf(a); + } else if constexpr (std::is_same_v) { + // 调用双精度版本 + return ::atanh(a); + } else { + // 兜底实现(如果是整数类型,通常会隐式转为 float) + return static_cast(atanhf(static_cast(a))); + } + } +} AtanhOp; + +} // namespace op::atanh::metax + +#endif // __ATANH_METAX_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/atanh/moore/atanh_moore.h b/src/infiniop/ops/atanh/moore/atanh_moore.h new file mode 100644 index 000000000..9bfc1e978 --- /dev/null +++ b/src/infiniop/ops/atanh/moore/atanh_moore.h @@ -0,0 +1,11 @@ +#ifndef __ATANH_MOORE_API_H__ +#define __ATANH_MOORE_API_H__ + +// 1. 修改包含路径,指向 moore 平台的 elementwise API 定义 +#include "../../../elementwise/moore/elementwise_moore_api.h" + +// 2. 使用 ELEMENTWISE_DESCRIPTOR 宏,平台参数改为 moore +// 这将自动生成 op::atanh::moore::Descriptor 类的声明 +ELEMENTWISE_DESCRIPTOR(atanh, moore) + +#endif // __ATANH_MOORE_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/atanh/moore/atanh_moore.mu b/src/infiniop/ops/atanh/moore/atanh_moore.mu new file mode 100644 index 000000000..01f71219a --- /dev/null +++ b/src/infiniop/ops/atanh/moore/atanh_moore.mu @@ -0,0 +1,64 @@ +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "atanh_moore.h" +#include "atanh_moore_kernel.h" + +namespace op::atanh::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + // 1. 转换 Handle 为 Moore 平台类型 + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + + // 2. 检查数据类型支持情况 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); + + // 3. 校验 Shape 一致性 + CHECK_SAME_SHAPE(y_shape, a_shape); + + // 4. 创建 Moore 平台的 Elementwise 描述符 + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + // 5. 
根据数据类型分发到具体的 MUSA Kernel 逻辑 + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::AtanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + // 注意:这里将 nv_bfloat16 替换为 Moore 环境下的 bfloat16 类型名 + return _device_info->calculate<256, moore::AtanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::AtanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::AtanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atanh::moore \ No newline at end of file diff --git a/src/infiniop/ops/atanh/moore/atanh_moore_kernel.h b/src/infiniop/ops/atanh/moore/atanh_moore_kernel.h new file mode 100644 index 000000000..81cf13d96 --- /dev/null +++ b/src/infiniop/ops/atanh/moore/atanh_moore_kernel.h @@ -0,0 +1,47 @@ +#ifndef __ATANH_MOORE_KERNEL_H__ +#define __ATANH_MOORE_KERNEL_H__ + +/* + * This file contains the Atanh operation implementation for the MUSA backend. + * + * It follows the consistent code structure to ensure alignment across different + * hardware platforms within the Moore Threads (MUSA) ecosystem. + */ +namespace op::atanh::moore { + +typedef struct AtanhOp { +public: + // 一元算子,输入数量为 1 + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &a) const { + if constexpr (std::is_same_v) { + // 针对 half2 进行并行计算 + float2 f2 = __half22float2(a); + f2.x = atanhf(f2.x); + f2.y = atanhf(f2.y); + return __float22half2_rn(f2); + } else if constexpr (std::is_same_v) { + // 转为 float 计算以保证精度并匹配 MUSA 数学库 + return __float2half(atanhf(__half2float(a))); + } else if constexpr (std::is_same_v) { + // BF16 同样提升到 float 计算,避免转换歧义 + float a_f = __bfloat162float(a); + return __float2bfloat16_rn(atanhf(a_f)); + } else if constexpr (std::is_same_v) { + // 调用 MUSA 内置的单精度反双曲正切函数 + return atanhf(a); + } else if constexpr (std::is_same_v) { + // 调用双精度版本 + return ::atanh(a); + } else { + // 兜底实现(如果是整数类型,通常会隐式转为 float) + return static_cast(atanhf(static_cast(a))); + } + } +} AtanhOp; + +} // namespace op::atanh::moore + +#endif // __ATANH_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu new file mode 100644 index 000000000..4959b7abf --- /dev/null +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu @@ -0,0 +1,58 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "atanh_nvidia.cuh" + +namespace op::atanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(y_shape, a_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void 
*stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AtanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::AtanhOp, nv_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AtanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::AtanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atanh::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh new file mode 100644 index 000000000..473c3d47c --- /dev/null +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ATANH_CUDA_API_H__ +#define __ATANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(atanh, nvidia) + +#endif // __ATANH_CUDA_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/atanh/operator.cc b/src/infiniop/ops/atanh/operator.cc new file mode 100644 index 000000000..8aa59a36e --- /dev/null +++ b/src/infiniop/ops/atanh/operator.cc @@ -0,0 +1,197 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/atanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/atanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/atanh_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/atanh_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/atanh_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/atanh_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/atanh_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAtanhDescriptor( + infiniopHandle_t handle, + infiniopAtanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::atanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {a_desc}) // 一元算子只传入 a_desc + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + 
GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopAtanh( + infiniopAtanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *a, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {a}, stream) // 一元算子只传入 a + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits.h b/src/infiniop/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits.h new file mode 100644 index 000000000..5e3d8291e --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits.h @@ -0,0 +1,68 @@ +#ifndef __BINARY_CROSS_ENTROPY_WITH_LOGITS_H__ +#define __BINARY_CROSS_ENTROPY_WITH_LOGITS_H__ + +#include "../../operator.h" +#include "info.h" + +/** + * # 关于 `BCEWithLogits` 算子描述符的说明 + * * 采用 PImpl 设计模式,将不同硬件后端(如 CUDA 原生算子、CPU 循环、或是芯片厂商的专用库调用) + * 封装在 `Opaque` 结构中。 + * * 描述符在创建时会完成形状校验、步长分析,并确定最优的计算 Workspace 大小。 + */ + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::bce_with_logits::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + BCEWithLogitsInfo _info; /* 包含各输入输出张量的维度与步长 */ \ + size_t _workspace_size; \ + infiniopReduction_t _reduction; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + BCEWithLogitsInfo info, \ + infiniopReduction_t 
reduction, \
+            size_t workspace_size_, \
+            Opaque *opaque, \
+            infiniDevice_t device_type, \
+            int device_id) \
+            : InfiniopDescriptor{device_type, device_id}, \
+              _opaque(opaque), \
+              _dtype(dtype), \
+              _info(info), \
+              _workspace_size(workspace_size_), \
+              _reduction(reduction) {} \
+                                                                       \
+    public: \
+        ~Descriptor(); \
+                                                                       \
+        size_t workspaceSize() const { return _workspace_size; } \
+                                                                       \
+        static infiniStatus_t create( \
+            infiniopHandle_t handle, \
+            Descriptor **desc_ptr, \
+            infiniopTensorDescriptor_t out_desc, \
+            infiniopTensorDescriptor_t logits_desc, \
+            infiniopTensorDescriptor_t target_desc, \
+            infiniopTensorDescriptor_t weight_desc, \
+            infiniopTensorDescriptor_t pos_weight_desc, \
+            infiniopReduction_t reduction \
+        ); \
+                                                                       \
+        infiniStatus_t calculate( \
+            void *workspace, \
+            size_t workspace_size, \
+            void *out, \
+            const void *logits, \
+            const void *target, \
+            const void *weight,     /* optional, may be nullptr */ \
+            const void *pos_weight, /* optional, may be nullptr */ \
+            void *stream) const; \
+    }; \
+    }
+
+#endif // __BINARY_CROSS_ENTROPY_WITH_LOGITS_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/cpu/binary_cross_entropy_with_logits_cpu.cc b/src/infiniop/ops/binary_cross_entropy_with_logits/cpu/binary_cross_entropy_with_logits_cpu.cc
new file mode 100644
index 000000000..06ab0d1e8
--- /dev/null
+++ b/src/infiniop/ops/binary_cross_entropy_with_logits/cpu/binary_cross_entropy_with_logits_cpu.cc
@@ -0,0 +1,165 @@
+#include "binary_cross_entropy_with_logits_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <algorithm>
+#include <cmath>
+
+namespace op::bce_with_logits::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t logits_desc,
+    infiniopTensorDescriptor_t target_desc,
+    infiniopTensorDescriptor_t weight_desc,
+    infiniopTensorDescriptor_t pos_weight_desc,
+    infiniopReduction_t reduction) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = logits_desc->dtype();
+
+    // 1. dtype check
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    // 2. Parse the dimension info (via BCEWithLogitsInfo defined in info.h)
+    auto result = BCEWithLogitsInfo::create(out_desc, logits_desc, target_desc,
+                                            weight_desc, pos_weight_desc, reduction);
+    CHECK_RESULT(result);
+
+    // 3. Instantiate the descriptor
+    *desc_ptr = new Descriptor(
+        dtype, result.take(), reduction, 0,
+        nullptr,
+        handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+/**
+ * Numerically stable core: L = -w * [pw * y * log(sigmoid(x)) + (1-y) * log(1-sigmoid(x))]
+ * rewritten as:            L = w * [(1-y) * x + (1 + (pw-1) * y) * (log(1 + exp(-|x|)) + max(-x, 0))]
+ * For pw = 1 this reduces to: L = w * [max(x, 0) - x * y + log(1 + exp(-|x|))]
+ */
+template <typename Tdata>
+void calculate_bce(
+    const BCEWithLogitsInfo &info,
+    void *out,
+    const void *logits,
+    const void *target,
+    const void *weight,
+    const void *pos_weight) {
+
+    size_t n = info.num_elements;
+    float total_loss = 0.0f;
+
+    // Raw pointers for each tensor
+    const Tdata *l_ptr = reinterpret_cast<const Tdata *>(logits);
+    const Tdata *t_ptr = reinterpret_cast<const Tdata *>(target);
+    const Tdata *w_ptr = reinterpret_cast<const Tdata *>(weight);
+    const Tdata *pw_ptr = reinterpret_cast<const Tdata *>(pos_weight);
+    Tdata *o_ptr = reinterpret_cast<Tdata *>(out);
+
+    auto &logits_info = info.logits;
+    auto &target_info = info.target;
+    auto &weight_info = info.weight;
+    auto &out_info = info.out;
+
+#pragma omp parallel for reduction(+ : total_loss)
+    for (ptrdiff_t i = 0; i < (ptrdiff_t)n; ++i) {
+        size_t idx = static_cast<size_t>(i);
+
+        // Compute real memory offsets from strides, so any layout is supported
+        size_t logits_offset = op::common_cpu::indexToOffset(
+            idx,
+            logits_info.ndim,
+            logits_info.dims.data(),
+            logits_info.stride.data());
+        size_t target_offset = op::common_cpu::indexToOffset(
+            idx,
+            target_info.ndim,
+            target_info.dims.data(),
+            target_info.stride.data());
+
+        float x = utils::cast<float>(l_ptr[logits_offset]);
+        float y = utils::cast<float>(t_ptr[target_offset]);
+
+        // pos_weight broadcast (assumes logits shape [..., C] and a contiguous [C] pos_weight)
+        float pw = 1.0f;
+        if (pw_ptr && info.pos_weight.total_elements > 0) {
+            size_t c = idx % info.pos_weight.total_elements;
+            pw = utils::cast<float>(pw_ptr[c]);
+        }
+
+        // weight handling:
+        // - if it has exactly the logits shape, it is indexed precisely via strides;
+        // - if it is a vector [C], indexToOffset broadcasts it over the last dimension.
+        float w = 1.0f;
+        if (w_ptr && weight_info.ndim > 0) {
+            size_t weight_offset = op::common_cpu::indexToOffset(
+                idx,
+                weight_info.ndim,
+                weight_info.dims.data(),
+                weight_info.stride.data());
+            w = utils::cast<float>(w_ptr[weight_offset]);
+        }
+
+        // Numerically stable BCEWithLogits (aligned with the PyTorch implementation):
+        //   max_val    = max(-x, 0)
+        //   log_weight = 1 + (pos_weight - 1) * y
+        //   loss       = (1 - y) * x + log_weight * (log(1 + exp(-|x|)) + max_val)
+        float max_val = std::max(-x, 0.0f);
+        float log_weight = 1.0f + (pw - 1.0f) * y;
+        float loss = (1.0f - y) * x
+                   + log_weight * (std::log1p(std::exp(-std::abs(x))) + max_val);
+
+        loss *= w;
+
+        if (info.reduction == INFINIOP_REDUCTION_NONE) {
+            // Per-element write-back also honors the out strides
+            size_t out_offset = op::common_cpu::indexToOffset(
+                idx,
+                out_info.ndim,
+                out_info.dims.data(),
+                out_info.stride.data());
+            o_ptr[out_offset] = utils::cast<Tdata>(loss);
+        } else {
+            total_loss += loss;
+        }
+    }
+
+    // Reduction output
+    if (info.reduction == INFINIOP_REDUCTION_MEAN) {
+        o_ptr[0] = utils::cast<Tdata>(total_loss / n);
+    } else if (info.reduction == INFINIOP_REDUCTION_SUM) {
+        o_ptr[0] = utils::cast<Tdata>(total_loss);
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *logits,
+    const void *target,
+    const void *weight,
+    const void *pos_weight,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        cpu::calculate_bce<fp16_t>(_info, out, logits, target, weight, pos_weight);
+        return INFINI_STATUS_SUCCESS;
+    case INFINI_DTYPE_BF16:
+        cpu::calculate_bce<bf16_t>(_info, out, logits, target, weight, pos_weight);
+        return INFINI_STATUS_SUCCESS;
+    case INFINI_DTYPE_F32:
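+        // Sketch of why the stable rewrite in calculate_bce cannot overflow
+        // (using the identities below, which hold for all real x):
+        //     log(sigmoid(x))     = -(log(1 + exp(-|x|)) + max(-x, 0))
+        //     log(1 - sigmoid(x)) = -x - (log(1 + exp(-|x|)) + max(-x, 0))
+        // Substituting into -[pw*y*log(sigmoid(x)) + (1-y)*log(1-sigmoid(x))] gives
+        //     (1 - y)*x + (1 + (pw - 1)*y) * (log(1 + exp(-|x|)) + max(-x, 0)),
+        // so exp() is only ever evaluated on non-positive arguments.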
cpu::calculate_bce<float>(_info, out, logits, target, weight, pos_weight);
+        return INFINI_STATUS_SUCCESS;
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::bce_with_logits::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/cpu/binary_cross_entropy_with_logits_cpu.h b/src/infiniop/ops/binary_cross_entropy_with_logits/cpu/binary_cross_entropy_with_logits_cpu.h
new file mode 100644
index 000000000..2eecc839c
--- /dev/null
+++ b/src/infiniop/ops/binary_cross_entropy_with_logits/cpu/binary_cross_entropy_with_logits_cpu.h
@@ -0,0 +1,16 @@
+#ifndef __BINARY_CROSS_ENTROPY_WITH_LOGITS_CPU_H__
+#define __BINARY_CROSS_ENTROPY_WITH_LOGITS_CPU_H__
+
+#include "../binary_cross_entropy_with_logits.h"
+
+/**
+ * Uses the DESCRIPTOR macro defined in binary_cross_entropy_with_logits.h.
+ *
+ * This generates the Descriptor class in namespace op::bce_with_logits::cpu.
+ * The class derives from InfiniopDescriptor and contains:
+ *   - BCEWithLogitsInfo _info (validated dims and strides)
+ *   - a static create() method (instantiates the CPU descriptor)
+ *   - a calculate() method (runs the numerically stable CPU path)
+ */
+DESCRIPTOR(cpu)
+
+#endif // __BINARY_CROSS_ENTROPY_WITH_LOGITS_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/info.h b/src/infiniop/ops/binary_cross_entropy_with_logits/info.h
new file mode 100644
index 000000000..3d5b30f37
--- /dev/null
+++ b/src/infiniop/ops/binary_cross_entropy_with_logits/info.h
@@ -0,0 +1,130 @@
+#ifndef __BINARY_CROSS_ENTROPY_WITH_LOGITS_INFO_H__
+#define __BINARY_CROSS_ENTROPY_WITH_LOGITS_INFO_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include "infiniop/ops/binary_cross_entropy_with_logits.h"
+#include <cstddef>
+#include <vector>
+
+namespace op::bce_with_logits {
+
+/**
+ * Describes the memory layout of each tensor used by the BCE operator.
+ * dims and stride are allocated dynamically, so tensors of any rank are supported.
+ */
+struct BCETensorInfo {
+    size_t total_elements = 0;
+    size_t ndim = 0;
+    std::vector<size_t> dims;      // dimensions
+    std::vector<ptrdiff_t> stride; // strides
+
+    BCETensorInfo() = default;
+
+    static utils::Result<BCETensorInfo> create(infiniopTensorDescriptor_t desc) {
+        // A missing optional tensor (weight / pos_weight) yields an empty,
+        // zero-rank info record rather than a bare status value.
+        if (desc == nullptr) {
+            return utils::Result<BCETensorInfo>(BCETensorInfo{});
+        }
+
+        BCETensorInfo info;
+        info.ndim = desc->ndim();
+        info.total_elements = 1;
+
+        // Reserve capacity up front
+        info.dims.reserve(info.ndim);
+        info.stride.reserve(info.ndim);
+
+        for (size_t i = 0; i < info.ndim; ++i) {
+            size_t d = desc->dim(i);
+            info.dims.push_back(d);
+            info.stride.push_back(desc->stride(i));
+            info.total_elements *= d;
+        }
+        return utils::Result<BCETensorInfo>(std::move(info));
+    }
+
+    // Helper: size of the last dimension (used for pos_weight validation)
+    size_t last_dim() const {
+        return ndim > 0 ? 
dims.back() : 0; + } +}; + +class BCEWithLogitsInfo { +public: + BCETensorInfo logits; + BCETensorInfo target; + BCETensorInfo weight; + BCETensorInfo pos_weight; + BCETensorInfo out; + + size_t num_elements; + infiniopReduction_t reduction; + + // 由于 BCETensorInfo 内部使用了 vector,BCEWithLogitsInfo 现在是可移动且安全的 + BCEWithLogitsInfo() = default; + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t pos_weight_desc, + infiniopReduction_t reduction) { + + auto logits_res = BCETensorInfo::create(logits_desc); + CHECK_RESULT(logits_res); + auto target_res = BCETensorInfo::create(target_desc); + CHECK_RESULT(target_res); + auto out_res = BCETensorInfo::create(out_desc); + CHECK_RESULT(out_res); + + BCEWithLogitsInfo info; + info.logits = logits_res.take(); + info.target = target_res.take(); + info.out = out_res.take(); + info.reduction = reduction; + info.num_elements = info.logits.total_elements; + + // 1. 基本形状一致性校验 + if (info.logits.ndim != info.target.ndim) return INFINI_STATUS_BAD_TENSOR_SHAPE; + for (size_t i = 0; i < info.logits.ndim; ++i) { + if (info.logits.dims[i] != info.target.dims[i]) return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 2. 校验 weight (需完全一致) + if (weight_desc) { + auto w_res = BCETensorInfo::create(weight_desc); + CHECK_RESULT(w_res); + info.weight = w_res.take(); + + // 允许两种情况: + // 1. 完全一致 + // 2. weight 是一个向量,且长度等于 logits 的最后一维 (常见广播场景) + bool is_full_match = (info.weight.total_elements == info.logits.total_elements); + bool is_last_dim_match = (info.weight.total_elements == info.logits.last_dim()); + + if (!is_full_match && !is_last_dim_match) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + // 3. 记录 pos_weight 信息 + // 广播行为由计算 Kernel 通过长度进行处理,这里不过度限制形状, + // 只要能够提供有效的长度即可,避免误报 Bad Tensor Shape。 + if (pos_weight_desc) { + auto pw_res = BCETensorInfo::create(pos_weight_desc); + CHECK_RESULT(pw_res); + info.pos_weight = pw_res.take(); + } + + // 4. 
输出形状 + // 这里不再强制校验 out 与 logits/标量的元素数量完全一致, + // 由高层 API 负责创建合理的输出张量;底层实现只依赖 + // `_info.out` 的 stride 在 reduction==NONE 且逐元素写回时使用。 + + return utils::Result(std::move(info)); + } +}; + +} // namespace op::bce_with_logits + +#endif // __BINARY_CROSS_ENTROPY_WITH_LOGITS_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.h b/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.h new file mode 100644 index 000000000..a5086f882 --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.h @@ -0,0 +1,17 @@ +#ifndef __BINARY_CROSS_ENTROPY_WITH_LOGITS_METAX_CUH__ +#define __BINARY_CROSS_ENTROPY_WITH_LOGITS_METAX_CUH__ + +#include "../binary_cross_entropy_with_logits.h" + +/** + * 使用 bce_with_logits.h 中定义的 DESCRIPTOR 宏。 + * 这将在命名空间 op::bce_with_logits::metax 中生成针对 METAX 设备的 Descriptor 类。 + * * * 在 METAX 端的实现(.cu 文件)中,Opaque 结构体通常包含: + * - cudnnHandle_t: 如果使用 cuDNN 的算子实现。 + * - cudnnTensorDescriptor_t: 用于描述各输入输出张量的 cuDNN 格式。 + * - KernelConfig: 用于自定义 METAX Kernel 的网格(Grid)和线程块(Block)配置。 + * - dataType: 对应的 METAX 数据类型(如 METAX_R_32F)。 + */ +DESCRIPTOR(metax) + +#endif // __BINARY_CROSS_ENTROPY_WITH_LOGITS_METAX_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca b/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca new file mode 100644 index 000000000..f98bb53ad --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca @@ -0,0 +1,335 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "binary_cross_entropy_with_logits_metax.h" +#include +#include + +namespace op::bce_with_logits::metax { + +using device::metax::indexToOffset; + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() = default; + +// 在 GPU 侧使用的简化张量信息(固定上限维度,支持 stride) +constexpr int BCE_MAX_DIMS = 8; + +struct BCETensorInfoDevice { + size_t ndim; + size_t shape[BCE_MAX_DIMS]; + ptrdiff_t strides[BCE_MAX_DIMS]; +}; + +static inline BCETensorInfoDevice make_device_info(const BCETensorInfo &info) { + BCETensorInfoDevice dev{}; + dev.ndim = info.ndim; + for (size_t i = 0; i < info.ndim && i < static_cast(BCE_MAX_DIMS); ++i) { + dev.shape[i] = info.dims[i]; + dev.strides[i] = info.stride[i]; + } + return dev; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t pos_weight_desc, + infiniopReduction_t reduction) { + + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + + // METAX 实现支持 F16 / F32 / BF16 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = BCEWithLogitsInfo::create(out_desc, logits_desc, target_desc, + weight_desc, pos_weight_desc, reduction); + CHECK_RESULT(result); + + auto info = result.take(); + + // F16/BF16 在做归约时需要一个 float 标量 workspace 来累加 + size_t workspace_size = 0; + if (reduction != INFINIOP_REDUCTION_NONE && + (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16)) { + workspace_size = sizeof(float); + } + 
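+    // Workspace design note: for F32 the kernel accumulates the reduction directly
+    // into the f32 output scalar via atomicAdd, so no extra buffer is needed;
+    // F16/BF16 sums live in this single float slot and a small finalize kernel
+    // converts the result back to the output dtype, avoiding 16-bit atomics.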
+    *desc_ptr = new Descriptor(
+        dtype, std::move(info), reduction, workspace_size,
+        nullptr,
+        handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Promote any scalar type to float
+template <typename T>
+__device__ __forceinline__ float to_float(T x) {
+    if constexpr (std::is_same_v<T, float>) {
+        return x;
+    } else if constexpr (std::is_same_v<T, half>) {
+        return __half2float(x);
+    } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { // bfloat16 name from the MACA CUDA-compat headers
+        return __bfloat162float(x);
+    } else {
+        return static_cast<float>(x);
+    }
+}
+
+// Convert from float back to the target scalar type
+template <typename T>
+__device__ __forceinline__ T from_float(float x) {
+    if constexpr (std::is_same_v<T, float>) {
+        return x;
+    } else if constexpr (std::is_same_v<T, half>) {
+        return __float2half(x);
+    } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+        return __float2bfloat16(x);
+    } else {
+        return static_cast<T>(x);
+    }
+}
+
+// --- METAX kernel: numerically stable BCE with full stride support ---
+template <typename Tdata>
+__global__ void bce_logits_kernel(
+    void *out_raw,
+    const Tdata *logits,
+    const Tdata *target,
+    const Tdata *weight,
+    const Tdata *pos_weight,
+    BCETensorInfoDevice logits_info,
+    BCETensorInfoDevice target_info,
+    BCETensorInfoDevice weight_info,
+    BCETensorInfoDevice out_info,
+    size_t n,
+    size_t pos_weight_len,
+    infiniopReduction_t reduction) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    // Map the logical index to each tensor's offset (arbitrary strides supported)
+    size_t logits_offset = indexToOffset(idx, logits_info.ndim,
+                                         logits_info.shape, logits_info.strides);
+    size_t target_offset = indexToOffset(idx, target_info.ndim,
+                                         target_info.shape, target_info.strides);
+
+    float x = to_float(logits[logits_offset]);
+    float y = to_float(target[target_offset]);
+
+    float pw = 1.0f;
+    if (pos_weight && pos_weight_len > 0) {
+        // Broadcast over the last dimension: pos_weight is assumed contiguous and 1-D
+        size_t c = idx % pos_weight_len;
+        pw = to_float(pos_weight[c]);
+    }
+
+    float w = 1.0f;
+    if (weight && weight_info.ndim > 0) {
+        size_t weight_offset = indexToOffset(idx, weight_info.ndim,
+                                             weight_info.shape, weight_info.strides);
+        w = to_float(weight[weight_offset]);
+    }
+
+    // Numerically stable BCEWithLogits (aligned with the PyTorch formulation,
+    // matching the CPU and Moore backends):
+    //   max_val    = max(-x, 0)
+    //   log_weight = 1 + (pos_weight - 1) * y
+    //   loss       = (1 - y) * x + log_weight * (log(1 + exp(-|x|)) + max_val)
+    float max_val = fmaxf(-x, 0.0f);
+    float log_weight = 1.0f + (pw - 1.0f) * y;
+    float loss = (1.0f - y) * x
+               + log_weight * (logf(1.0f + expf(-fabsf(x))) + max_val);
+
+    loss *= w;
+
+    if (reduction == INFINIOP_REDUCTION_NONE) {
+        // Write back the per-element loss (honoring out strides)
+        size_t out_offset = indexToOffset(idx, out_info.ndim,
+                                          out_info.shape, out_info.strides);
+        auto *out_ptr = static_cast<Tdata *>(out_raw);
+        out_ptr[out_offset] = from_float<Tdata>(loss);
+    } else {
+        // For mean or sum, accumulate in float into the scalar slot
+        auto *out_accum = static_cast<float *>(out_raw);
+        atomicAdd(out_accum, static_cast<float>(loss));
+    }
+}
+
+// F32 mean reduction: divide the output scalar in place
+__global__ void bce_logits_mean_scale_kernel_f32(float *val, size_t count) {
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        *val /= static_cast<float>(count);
+    }
+}
+
+// F16/BF16 reduction: convert the float workspace value back to the target dtype
+template <typename Tdata>
+__global__ void bce_logits_reduce_finalize_kernel(
+    Tdata *out,
+    float *workspace,
+    size_t count,
+    int is_mean) {
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        float v = *workspace;
+        if (is_mean) {
+            v /= static_cast<float>(count);
+        }
+        *out = from_float<Tdata>(v);
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *logits,
+    const void *target,
+    const void *weight,
+    const void *pos_weight,
+    void *stream) const {
+
+    mcStream_t custream = (mcStream_t)stream;
+    size_t n = _info.num_elements;
+
+    // F16/BF16 with a reduction needs the float workspace
+    if (_reduction != INFINIOP_REDUCTION_NONE &&
+        (_dtype == 
INFINI_DTYPE_BF16)) { + if (workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + } + + int block = 256; + int grid = static_cast((n + block - 1) / block); + + // 构造 GPU 侧的张量信息(含 stride) + BCETensorInfoDevice logits_info = make_device_info(_info.logits); + BCETensorInfoDevice target_info = make_device_info(_info.target); + BCETensorInfoDevice out_info = make_device_info(_info.out); + BCETensorInfoDevice weight_info = {}; + if (_info.weight.total_elements != 0) { + weight_info = make_device_info(_info.weight); + } + + size_t pos_weight_len = _info.pos_weight.total_elements; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + // 如果是规约操作,计算前需将输出位置清零 + if (_reduction != INFINIOP_REDUCTION_NONE) { + mcMemsetAsync(out, 0, sizeof(float), custream); + } + + bce_logits_kernel<<>>( + out, + static_cast(logits), + static_cast(target), + static_cast(weight), + static_cast(pos_weight), + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction == INFINIOP_REDUCTION_MEAN) { + bce_logits_mean_scale_kernel_f32<<<1, 1, 0, custream>>>( + static_cast(out), n); + } + break; + } + case INFINI_DTYPE_F16: { + auto *logits_h = static_cast(logits); + auto *target_h = static_cast(target); + auto *weight_h = static_cast(weight); + auto *pos_weight_h = static_cast(pos_weight); + + void *out_raw = nullptr; + float *workspace_f = nullptr; + + if (_reduction == INFINIOP_REDUCTION_NONE) { + out_raw = out; + } else { + workspace_f = static_cast(workspace); + mcMemsetAsync(workspace_f, 0, sizeof(float), custream); + out_raw = workspace_f; + } + + bce_logits_kernel<<>>( + out_raw, + logits_h, + target_h, + weight_h, + pos_weight_h, + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction != INFINIOP_REDUCTION_NONE) { + int is_mean = (_reduction == INFINIOP_REDUCTION_MEAN) ? 1 : 0; + bce_logits_reduce_finalize_kernel<<<1, 1, 0, custream>>>( + static_cast(out), workspace_f, n, is_mean); + } + + break; + } + case INFINI_DTYPE_BF16: { + auto *logits_b = static_cast(logits); + auto *target_b = static_cast(target); + auto *weight_b = static_cast(weight); + auto *pos_weight_b = static_cast(pos_weight); + + void *out_raw = nullptr; + float *workspace_f = nullptr; + + if (_reduction == INFINIOP_REDUCTION_NONE) { + out_raw = out; + } else { + workspace_f = static_cast(workspace); + mcMemsetAsync(workspace_f, 0, sizeof(float), custream); + out_raw = workspace_f; + } + + bce_logits_kernel<<>>( + out_raw, + logits_b, + target_b, + weight_b, + pos_weight_b, + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction != INFINIOP_REDUCTION_NONE) { + int is_mean = (_reduction == INFINIOP_REDUCTION_MEAN) ? 
1 : 0; + bce_logits_reduce_finalize_kernel<<<1, 1, 0, custream>>>( + static_cast(out), workspace_f, n, is_mean); + } + + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + mcError_t err = mcGetLastError(); + if (err != mcSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::bce_with_logits::metax \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/moore/binary_cross_entropy_with_logits_moore.h b/src/infiniop/ops/binary_cross_entropy_with_logits/moore/binary_cross_entropy_with_logits_moore.h new file mode 100644 index 000000000..44997d088 --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/moore/binary_cross_entropy_with_logits_moore.h @@ -0,0 +1,16 @@ +#ifndef __BINARY_CROSS_ENTROPY_WITH_LOGITS_MOORE_H__ +#define __BINARY_CROSS_ENTROPY_WITH_LOGITS_MOORE_H__ + +#include "../binary_cross_entropy_with_logits.h" + +/** + * 使用 bce_with_logits.h 中定义的 DESCRIPTOR 宏。 + * 这将在命名空间 op::bce_with_logits::moore 中生成针对 Moore 设备的 Descriptor 类。 + * * 在 Moore 端的实现(.mu 文件)中,Opaque 结构体通常包含: + * - musaHandle_t: 如果使用 MUSA 库的算子实现。 + * - KernelConfig: 用于 MUSA Kernel 的网格(Grid)和线程块(Block)配置。 + * - dataType: 对应的 MUSA 数据类型(如 MUSA_R_32F)。 + */ +DESCRIPTOR(moore) + +#endif // __BINARY_CROSS_ENTROPY_WITH_LOGITS_MOORE_H__ \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/moore/binary_cross_entropy_with_logits_moore.mu b/src/infiniop/ops/binary_cross_entropy_with_logits/moore/binary_cross_entropy_with_logits_moore.mu new file mode 100644 index 000000000..3654fdb49 --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/moore/binary_cross_entropy_with_logits_moore.mu @@ -0,0 +1,321 @@ +#include "../../../devices/moore/moore_handle.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "binary_cross_entropy_with_logits_moore.h" +#include + +namespace op::bce_with_logits::moore { + +using device::moore::indexToOffset; + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() = default; + +// 摩尔线程平台通常与 CUDA 保持一致的维度上限 +constexpr int BCE_MAX_DIMS = 8; + +struct BCETensorInfoDevice { + size_t ndim; + size_t shape[BCE_MAX_DIMS]; + ptrdiff_t strides[BCE_MAX_DIMS]; +}; + +static inline BCETensorInfoDevice make_device_info(const BCETensorInfo &info) { + BCETensorInfoDevice dev{}; + dev.ndim = info.ndim; + for (size_t i = 0; i < info.ndim && i < static_cast(BCE_MAX_DIMS); ++i) { + dev.shape[i] = info.dims[i]; + dev.strides[i] = info.stride[i]; + } + return dev; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t pos_weight_desc, + infiniopReduction_t reduction) { + + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + + // Moore 实现支持 F16 / F32 / BF16 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = BCEWithLogitsInfo::create(out_desc, logits_desc, target_desc, + weight_desc, pos_weight_desc, reduction); + CHECK_RESULT(result); + + auto info = result.take(); + + size_t workspace_size = 0; + if (reduction != INFINIOP_REDUCTION_NONE && + (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16)) { + workspace_size = sizeof(float); + } + + *desc_ptr = new Descriptor( + dtype, std::move(info), reduction, workspace_size, + nullptr, + 
handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +// 针对 Moore 平台的类型提升逻辑 +template +__device__ __forceinline__ float to_float(T x) { + if constexpr (std::is_same_v) { + return x; + } else if constexpr (std::is_same_v) { + return __half2float(x); + } else if constexpr (std::is_same_v) { // MUSA 兼容 cuda_bfloat16 名称或使用内部 bf16 + return __bfloat162float(x); + } else { + return static_cast(x); + } +} + +template +__device__ __forceinline__ T from_float(float x) { + if constexpr (std::is_same_v) { + return x; + } else if constexpr (std::is_same_v) { + return __float2half(x); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(x); // Moore 平台推荐显式使用 _rn + } else { + return static_cast(x); + } +} + +// --- MUSA Kernel --- +template +__global__ void bce_logits_kernel( + void *out_raw, + const Tdata *logits, + const Tdata *target, + const Tdata *weight, + const Tdata *pos_weight, + BCETensorInfoDevice logits_info, + BCETensorInfoDevice target_info, + BCETensorInfoDevice weight_info, + BCETensorInfoDevice out_info, + size_t n, + size_t pos_weight_len, + infiniopReduction_t reduction) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + + size_t logits_offset = indexToOffset(idx, logits_info.ndim, + logits_info.shape, logits_info.strides); + size_t target_offset = indexToOffset(idx, target_info.ndim, + target_info.shape, target_info.strides); + + float x = to_float(logits[logits_offset]); + float y = to_float(target[target_offset]); + + float pw = 1.0f; + if (pos_weight && pos_weight_len > 0) { + size_t c = idx % pos_weight_len; + pw = to_float(pos_weight[c]); + } + + float w = 1.0f; + if (weight && weight_info.ndim > 0) { + size_t weight_offset = indexToOffset(idx, weight_info.ndim, + weight_info.shape, weight_info.strides); + w = to_float(weight[weight_offset]); + } + + // 数值稳定的 BCEWithLogits 计算(对齐 PyTorch 实现): + // max_val = max(-x, 0) + // log_weight = 1 + (pos_weight - 1) * y + // loss = (1 - y) * x + log_weight * (log(1 + exp(-|x|)) + max_val) + float max_val = fmaxf(-x, 0.0f); + float log_weight = 1.0f + (pw - 1.0f) * y; + float loss = (1.0f - y) * x + + log_weight * (logf(1.0f + expf(-fabsf(x))) + max_val); + + loss *= w; + + if (reduction == INFINIOP_REDUCTION_NONE) { + size_t out_offset = indexToOffset(idx, out_info.ndim, + out_info.shape, out_info.strides); + auto *out_ptr = static_cast(out_raw); + out_ptr[out_offset] = from_float(loss); + } else { + auto *out_accum = static_cast(out_raw); + atomicAdd(out_accum, static_cast(loss)); + } +} + +__global__ void bce_logits_mean_scale_kernel_f32(float *val, size_t count) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + *val /= static_cast(count); + } +} + +template +__global__ void bce_logits_reduce_finalize_kernel( + Tdata *out, + float *workspace, + size_t count, + int is_mean) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + float v = *workspace; + if (is_mean) { + v /= static_cast(count); + } + *out = from_float(v); + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *out, + const void *logits, + const void *target, + const void *weight, + const void *pos_weight, + void *stream) const { + + musaStream_t mustream = (musaStream_t)stream; + size_t n = _info.num_elements; + + if (_reduction != INFINIOP_REDUCTION_NONE && + (_dtype == INFINI_DTYPE_F16 || _dtype == INFINI_DTYPE_BF16)) { + if (workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + } + + int block = 256; + int grid = static_cast((n + 
block - 1) / block); + + BCETensorInfoDevice logits_info = make_device_info(_info.logits); + BCETensorInfoDevice target_info = make_device_info(_info.target); + BCETensorInfoDevice out_info = make_device_info(_info.out); + BCETensorInfoDevice weight_info = {}; + if (_info.weight.total_elements != 0) { + weight_info = make_device_info(_info.weight); + } + + size_t pos_weight_len = _info.pos_weight.total_elements; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + if (_reduction != INFINIOP_REDUCTION_NONE) { + musaMemsetAsync(out, 0, sizeof(float), mustream); + } + + bce_logits_kernel<<>>( + out, + static_cast(logits), + static_cast(target), + static_cast(weight), + static_cast(pos_weight), + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction == INFINIOP_REDUCTION_MEAN) { + bce_logits_mean_scale_kernel_f32<<<1, 1, 0, mustream>>>( + static_cast(out), n); + } + break; + } + case INFINI_DTYPE_F16: { + auto *logits_h = static_cast(logits); + auto *target_h = static_cast(target); + auto *weight_h = static_cast(weight); + auto *pos_weight_h = static_cast(pos_weight); + + void *out_raw = nullptr; + float *workspace_f = nullptr; + + if (_reduction == INFINIOP_REDUCTION_NONE) { + out_raw = out; + } else { + workspace_f = static_cast(workspace); + musaMemsetAsync(workspace_f, 0, sizeof(float), mustream); + out_raw = workspace_f; + } + + bce_logits_kernel<<>>( + out_raw, + logits_h, + target_h, + weight_h, + pos_weight_h, + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction != INFINIOP_REDUCTION_NONE) { + int is_mean = (_reduction == INFINIOP_REDUCTION_MEAN) ? 1 : 0; + bce_logits_reduce_finalize_kernel<<<1, 1, 0, mustream>>>( + static_cast(out), workspace_f, n, is_mean); + } + break; + } + case INFINI_DTYPE_BF16: { + auto *logits_b = static_cast(logits); + auto *target_b = static_cast(target); + auto *weight_b = static_cast(weight); + auto *pos_weight_b = static_cast(pos_weight); + + void *out_raw = nullptr; + float *workspace_f = nullptr; + + if (_reduction == INFINIOP_REDUCTION_NONE) { + out_raw = out; + } else { + workspace_f = static_cast(workspace); + musaMemsetAsync(workspace_f, 0, sizeof(float), mustream); + out_raw = workspace_f; + } + + bce_logits_kernel<<>>( + out_raw, + logits_b, + target_b, + weight_b, + pos_weight_b, + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction != INFINIOP_REDUCTION_NONE) { + int is_mean = (_reduction == INFINIOP_REDUCTION_MEAN) ? 
1 : 0; + bce_logits_reduce_finalize_kernel<<<1, 1, 0, mustream>>>( + static_cast(out), workspace_f, n, is_mean); + } + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::bce_with_logits::moore \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/nvidia/binary_cross_entropy_with_logits_nvidia.cu b/src/infiniop/ops/binary_cross_entropy_with_logits/nvidia/binary_cross_entropy_with_logits_nvidia.cu new file mode 100644 index 000000000..276bab553 --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/nvidia/binary_cross_entropy_with_logits_nvidia.cu @@ -0,0 +1,332 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "binary_cross_entropy_with_logits_nvidia.cuh" +#include +#include +#include + +namespace op::bce_with_logits::nvidia { + +using device::nvidia::indexToOffset; + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() = default; + +// 在 GPU 侧使用的简化张量信息(固定上限维度,支持 stride) +constexpr int BCE_MAX_DIMS = 8; + +struct BCETensorInfoDevice { + size_t ndim; + size_t shape[BCE_MAX_DIMS]; + ptrdiff_t strides[BCE_MAX_DIMS]; +}; + +static inline BCETensorInfoDevice make_device_info(const BCETensorInfo &info) { + BCETensorInfoDevice dev{}; + dev.ndim = info.ndim; + for (size_t i = 0; i < info.ndim && i < static_cast(BCE_MAX_DIMS); ++i) { + dev.shape[i] = info.dims[i]; + dev.strides[i] = info.stride[i]; + } + return dev; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t pos_weight_desc, + infiniopReduction_t reduction) { + + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + + // NVIDIA 实现支持 F16 / F32 / BF16 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = BCEWithLogitsInfo::create(out_desc, logits_desc, target_desc, + weight_desc, pos_weight_desc, reduction); + CHECK_RESULT(result); + + auto info = result.take(); + + // F16/BF16 在做归约时需要一个 float 标量 workspace 来累加 + size_t workspace_size = 0; + if (reduction != INFINIOP_REDUCTION_NONE && + (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16)) { + workspace_size = sizeof(float); + } + + *desc_ptr = new Descriptor( + dtype, std::move(info), reduction, workspace_size, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +// 将任意标量类型提升为 float +template +__device__ __forceinline__ float to_float(T x) { + if constexpr (std::is_same_v) { + return x; + } else if constexpr (std::is_same_v) { + return __half2float(x); + } else if constexpr (std::is_same_v) { + return __bfloat162float(x); + } else { + return static_cast(x); + } +} + +// 从 float 转回目标标量类型 +template +__device__ __forceinline__ T from_float(float x) { + if constexpr (std::is_same_v) { + return x; + } else if constexpr (std::is_same_v) { + return __float2half(x); + } else if constexpr (std::is_same_v) { + return __float2bfloat16(x); + } else { + return static_cast(x); + } +} + +// --- CUDA Kernel: 支持 stride 的数值稳定 BCE 计算 --- +template +__global__ void bce_logits_kernel( + void *out_raw, + const Tdata *logits, + const Tdata *target, + const Tdata *weight, + const Tdata *pos_weight, + BCETensorInfoDevice logits_info, + BCETensorInfoDevice target_info, + 
BCETensorInfoDevice weight_info,
+    BCETensorInfoDevice out_info,
+    size_t n,
+    size_t pos_weight_len,
+    infiniopReduction_t reduction) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    // Map the logical index to each tensor's offset (arbitrary strides supported)
+    size_t logits_offset = indexToOffset(idx, logits_info.ndim,
+                                         logits_info.shape, logits_info.strides);
+    size_t target_offset = indexToOffset(idx, target_info.ndim,
+                                         target_info.shape, target_info.strides);
+
+    float x = to_float(logits[logits_offset]);
+    float y = to_float(target[target_offset]);
+
+    float pw = 1.0f;
+    if (pos_weight && pos_weight_len > 0) {
+        // Broadcast over the last dimension: pos_weight is assumed contiguous and 1-D
+        size_t c = idx % pos_weight_len;
+        pw = to_float(pos_weight[c]);
+    }
+
+    float w = 1.0f;
+    if (weight && weight_info.ndim > 0) {
+        size_t weight_offset = indexToOffset(idx, weight_info.ndim,
+                                             weight_info.shape, weight_info.strides);
+        w = to_float(weight[weight_offset]);
+    }
+
+    // Numerically stable BCEWithLogits (aligned with the PyTorch formulation,
+    // matching the CPU and Moore backends):
+    //   max_val    = max(-x, 0)
+    //   log_weight = 1 + (pos_weight - 1) * y
+    //   loss       = (1 - y) * x + log_weight * (log(1 + exp(-|x|)) + max_val)
+    float max_val = fmaxf(-x, 0.0f);
+    float log_weight = 1.0f + (pw - 1.0f) * y;
+    float loss = (1.0f - y) * x
+               + log_weight * (logf(1.0f + expf(-fabsf(x))) + max_val);
+
+    loss *= w;
+
+    if (reduction == INFINIOP_REDUCTION_NONE) {
+        // Write back the per-element loss (honoring out strides)
+        size_t out_offset = indexToOffset(idx, out_info.ndim,
+                                          out_info.shape, out_info.strides);
+        auto *out_ptr = static_cast<Tdata *>(out_raw);
+        out_ptr[out_offset] = from_float<Tdata>(loss);
+    } else {
+        // For mean or sum, accumulate in float into the scalar slot
+        auto *out_accum = static_cast<float *>(out_raw);
+        atomicAdd(out_accum, static_cast<float>(loss));
+    }
+}
+
+// F32 mean reduction: divide the output scalar in place
+__global__ void bce_logits_mean_scale_kernel_f32(float *val, size_t count) {
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        *val /= static_cast<float>(count);
+    }
+}
+
+// F16/BF16 reduction: convert the float workspace value back to the target dtype
+template <typename Tdata>
+__global__ void bce_logits_reduce_finalize_kernel(
+    Tdata *out,
+    float *workspace,
+    size_t count,
+    int is_mean) {
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        float v = *workspace;
+        if (is_mean) {
+            v /= static_cast<float>(count);
+        }
+        *out = from_float<Tdata>(v);
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *logits,
+    const void *target,
+    const void *weight,
+    const void *pos_weight,
+    void *stream) const {
+
+    cudaStream_t custream = (cudaStream_t)stream;
+    size_t n = _info.num_elements;
+
+    // F16/BF16 with a reduction needs the float workspace
+    if (_reduction != INFINIOP_REDUCTION_NONE &&
+        (_dtype == INFINI_DTYPE_F16 || _dtype == INFINI_DTYPE_BF16)) {
+        if (workspace_size < sizeof(float)) {
+            return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+        }
+    }
+
+    int block = 256;
+    int grid = static_cast<int>((n + block - 1) / block);
+
+    // Build the device-side tensor info (including strides)
+    BCETensorInfoDevice logits_info = make_device_info(_info.logits);
+    BCETensorInfoDevice target_info = make_device_info(_info.target);
+    BCETensorInfoDevice out_info = make_device_info(_info.out);
+    BCETensorInfoDevice weight_info = {};
+    if (_info.weight.total_elements != 0) {
+        weight_info = make_device_info(_info.weight);
+    }
+
+    size_t pos_weight_len = _info.pos_weight.total_elements;
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F32: {
+        // For reductions, zero the output scalar before accumulation
+        if (_reduction != INFINIOP_REDUCTION_NONE) {
+            cudaMemsetAsync(out, 0, sizeof(float), custream);
+        }
+
+        bce_logits_kernel<<<grid, block, 0, custream>>>(
+            out,
+            static_cast<const float *>(logits),
+            static_cast<const float *>(target),
+            static_cast<const float *>(weight),
+            static_cast<const float *>(pos_weight),
+            logits_info,
+            target_info,
+            weight_info,
+            out_info,
+            n,
+            pos_weight_len,
+            _reduction);
+
+        if (_reduction == INFINIOP_REDUCTION_MEAN) {
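+            // The finalize step runs on a single thread (<<<1, 1>>>); only one
+            // scalar is touched, so a wider launch would buy nothing here.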
bce_logits_mean_scale_kernel_f32<<<1, 1, 0, custream>>>( + static_cast(out), n); + } + break; + } + case INFINI_DTYPE_F16: { + auto *logits_h = static_cast(logits); + auto *target_h = static_cast(target); + auto *weight_h = static_cast(weight); + auto *pos_weight_h = static_cast(pos_weight); + + void *out_raw = nullptr; + float *workspace_f = nullptr; + + if (_reduction == INFINIOP_REDUCTION_NONE) { + out_raw = out; + } else { + workspace_f = static_cast(workspace); + cudaMemsetAsync(workspace_f, 0, sizeof(float), custream); + out_raw = workspace_f; + } + + bce_logits_kernel<<>>( + out_raw, + logits_h, + target_h, + weight_h, + pos_weight_h, + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction != INFINIOP_REDUCTION_NONE) { + int is_mean = (_reduction == INFINIOP_REDUCTION_MEAN) ? 1 : 0; + bce_logits_reduce_finalize_kernel<<<1, 1, 0, custream>>>( + static_cast(out), workspace_f, n, is_mean); + } + + break; + } + case INFINI_DTYPE_BF16: { + auto *logits_b = static_cast(logits); + auto *target_b = static_cast(target); + auto *weight_b = static_cast(weight); + auto *pos_weight_b = static_cast(pos_weight); + + void *out_raw = nullptr; + float *workspace_f = nullptr; + + if (_reduction == INFINIOP_REDUCTION_NONE) { + out_raw = out; + } else { + workspace_f = static_cast(workspace); + cudaMemsetAsync(workspace_f, 0, sizeof(float), custream); + out_raw = workspace_f; + } + + bce_logits_kernel<<>>( + out_raw, + logits_b, + target_b, + weight_b, + pos_weight_b, + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction != INFINIOP_REDUCTION_NONE) { + int is_mean = (_reduction == INFINIOP_REDUCTION_MEAN) ? 1 : 0; + bce_logits_reduce_finalize_kernel<<<1, 1, 0, custream>>>( + static_cast(out), workspace_f, n, is_mean); + } + + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto err = cudaGetLastError(); + return (err == cudaSuccess) ? 
INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; +} + +} // namespace op::bce_with_logits::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/nvidia/binary_cross_entropy_with_logits_nvidia.cuh b/src/infiniop/ops/binary_cross_entropy_with_logits/nvidia/binary_cross_entropy_with_logits_nvidia.cuh new file mode 100644 index 000000000..9493937c8 --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/nvidia/binary_cross_entropy_with_logits_nvidia.cuh @@ -0,0 +1,17 @@ +#ifndef __BINARY_CROSS_ENTROPY_WITH_LOGITS_NVIDIA_CUH__ +#define __BINARY_CROSS_ENTROPY_WITH_LOGITS_NVIDIA_CUH__ + +#include "../binary_cross_entropy_with_logits.h" + +/** + * 使用 bce_with_logits.h 中定义的 DESCRIPTOR 宏。 + * 这将在命名空间 op::bce_with_logits::nvidia 中生成针对 NVIDIA 设备的 Descriptor 类。 + * * * 在 NVIDIA 端的实现(.cu 文件)中,Opaque 结构体通常包含: + * - cudnnHandle_t: 如果使用 cuDNN 的算子实现。 + * - cudnnTensorDescriptor_t: 用于描述各输入输出张量的 cuDNN 格式。 + * - KernelConfig: 用于自定义 CUDA Kernel 的网格(Grid)和线程块(Block)配置。 + * - dataType: 对应的 CUDA 数据类型(如 CUDA_R_32F)。 + */ +DESCRIPTOR(nvidia) + +#endif // __BINARY_CROSS_ENTROPY_WITH_LOGITS_NVIDIA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/operator.cc b/src/infiniop/ops/binary_cross_entropy_with_logits/operator.cc new file mode 100644 index 000000000..77c97d98b --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/operator.cc @@ -0,0 +1,233 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/binary_cross_entropy_with_logits.h" + +// 引入各硬件后端的 Descriptor 定义 +#ifdef ENABLE_CPU_API +#include "cpu/binary_cross_entropy_with_logits_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/binary_cross_entropy_with_logits_nvidia.cuh" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/binary_cross_entropy_with_logits_bang.h" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/binary_cross_entropy_with_logits_ascend.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/binary_cross_entropy_with_logits_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/binary_cross_entropy_with_logits_moore.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/binary_cross_entropy_with_logits_kunlun.h" +#endif + +// ----------------------------------------------------------------------------- +// 1. 
创建描述符 +// ----------------------------------------------------------------------------- +__C infiniStatus_t infiniopCreateBCEWithLogitsDescriptor( + infiniopHandle_t handle, + infiniopBCEWithLogitsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t pos_weight_desc, + infiniopReduction_t reduction) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::bce_with_logits::NAMESPACE::Descriptor::create(handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, logits_desc, target_desc, weight_desc, pos_weight_desc, reduction) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + CREATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +// ----------------------------------------------------------------------------- +// 2. 获取 Workspace 大小 +// ----------------------------------------------------------------------------- +__C infiniStatus_t infiniopGetBCEWithLogitsWorkspaceSize( + infiniopBCEWithLogitsDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +// ----------------------------------------------------------------------------- +// 3. 
执行计算 +// ----------------------------------------------------------------------------- +__C infiniStatus_t infiniopBCEWithLogits( + infiniopBCEWithLogitsDescriptor_t desc, + void *workspace, size_t workspace_size, + void *out, + const void *logits, + const void *target, + const void *weight, + const void *pos_weight, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, out, logits, target, weight, pos_weight, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + CALCULATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +// ----------------------------------------------------------------------------- +// 4. 销毁描述符 +// ----------------------------------------------------------------------------- +__C infiniStatus_t infiniopDestroyBCEWithLogitsDescriptor(infiniopBCEWithLogitsDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + DELETE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/cdist/cdist.h b/src/infiniop/ops/cdist/cdist.h new file mode 100644 index 000000000..3d5e532a5 --- /dev/null +++ b/src/infiniop/ops/cdist/cdist.h @@ -0,0 +1,63 @@ +#ifndef __CDIST_H__ +#define __CDIST_H__ + +#include "../../operator.h" +#include "info.h" + +/** + * # 关于 `cdist` 算子描述符的说明 + * * 仿照 GEMM 的 PImpl (Opaque) 设计模式,将硬件相关的执行上下文(如 CUDA Handle、计算流等) + * 隐藏在 `Opaque` 结构体中,确保头文件在不同后端(CPU/NVIDIA/Ascend)间的一致性。 + */ + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::cdist::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + CdistInfo _info; /* 包含 M, N, D 维度信息 */ \ + size_t _workspace_size; \ + double _p; /* 范数阶数,创建时固定 */ \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + CdistInfo info, \ + double p, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t 
device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_), \ + _p(p) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, /* 输出 (M, N) */ \ + infiniopTensorDescriptor_t x1_desc, /* 输入 (M, D) */ \ + infiniopTensorDescriptor_t x2_desc, /* 输入 (N, D) */ \ + double p \ + ); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *y, /* 结果矩阵 */ \ + const void *x1, \ + const void *x2, \ + void *stream) const; \ + }; \ + } + +#endif // __CDIST_H__ \ No newline at end of file diff --git a/src/infiniop/ops/cdist/cpu/cdist_cpu.cc b/src/infiniop/ops/cdist/cpu/cdist_cpu.cc new file mode 100644 index 000000000..9df32a04d --- /dev/null +++ b/src/infiniop/ops/cdist/cpu/cdist_cpu.cc @@ -0,0 +1,126 @@ +#include "cdist_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include + +namespace op::cdist::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x1_desc, + infiniopTensorDescriptor_t x2_desc, + double p) { + + auto handle = reinterpret_cast(handle_); + auto dtype = y_desc->dtype(); + + // 1. 类型检查 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + // 2. 解析维度信息 (CdistInfo 逻辑已在之前定义) + auto result = CdistInfo::create(y_desc, x1_desc, x2_desc); + CHECK_RESULT(result); + + // 3. 实例化描述符,CPU 版通常不需要 workspace + *desc_ptr = new Descriptor( + dtype, result.take(), p, 0, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +/** + * 核心计算模板:处理不同数据类型 + */ +template +void calculate_dist( + const CdistInfo &info, + void *y, + const void *x1, + const void *x2, + double p) { + + // [Image of the coordinate-wise subtraction and p-norm distance formula for cdist] + +#pragma omp parallel for collapse(2) + for (ptrdiff_t b = 0; b < ptrdiff_t(info.batch); ++b) { + for (ptrdiff_t i = 0; i < ptrdiff_t(info.m); ++i) { + for (ptrdiff_t j = 0; j < ptrdiff_t(info.n); ++j) { + + // 定位输出位置 y[b, i, j] + auto y_ptr = reinterpret_cast(y) + + b * info.y_matrix.stride + + i * info.y_matrix.row_stride + + j * info.y_matrix.col_stride; + + // 定位向量位置 x1[b, i, :] 和 x2[b, j, :] + auto x1_vec = reinterpret_cast(x1) + + b * info.x1_matrix.stride + + i * info.x1_matrix.row_stride; + auto x2_vec = reinterpret_cast(x2) + + b * info.x2_matrix.stride + + j * info.x2_matrix.row_stride; + + double dist = 0.0; + + // 遍历特征维度 D + for (size_t k = 0; k < info.d; ++k) { + float v1 = utils::cast(*(x1_vec + k * info.x1_matrix.col_stride)); + float v2 = utils::cast(*(x2_vec + k * info.x2_matrix.col_stride)); + float diff = std::abs(v1 - v2); + + if (p == 1.0) { + dist += diff; + } else if (p == 2.0) { + dist += diff * diff; + } else if (std::isinf(p)) { + dist = std::max((double)dist, (double)diff); + } else { + dist += std::pow((double)diff, p); + } + } + + // 最终距离处理 + if (p == 2.0) { + dist = std::sqrt(dist); + } else if (!std::isinf(p) && p != 1.0) { + dist = std::pow(dist, 1.0 / p); + } + + *y_ptr = utils::cast(static_cast(dist)); + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x1, + const void *x2, + void *stream) const { + + switch 
(_dtype) { + case INFINI_DTYPE_F16: + cpu::calculate_dist(_info, y, x1, x2, _p); + return INFINI_STATUS_SUCCESS; + + case INFINI_DTYPE_BF16: + cpu::calculate_dist(_info, y, x1, x2, _p); + return INFINI_STATUS_SUCCESS; + + case INFINI_DTYPE_F32: + cpu::calculate_dist(_info, y, x1, x2, _p); + return INFINI_STATUS_SUCCESS; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::cdist::cpu \ No newline at end of file diff --git a/src/infiniop/ops/cdist/cpu/cdist_cpu.h b/src/infiniop/ops/cdist/cpu/cdist_cpu.h new file mode 100644 index 000000000..07d182678 --- /dev/null +++ b/src/infiniop/ops/cdist/cpu/cdist_cpu.h @@ -0,0 +1,11 @@ +#ifndef __CDIST_CPU_H__ +#define __CDIST_CPU_H__ + +#include "../cdist.h" + +// 使用 cdist.h 中定义的 DESCRIPTOR 宏 +// 这将在命名空间 op::cdist::cpu 中生成 Descriptor 类 +// 该类包含对 CdistInfo 的引用以及 create/calculate 等接口 +DESCRIPTOR(cpu) + +#endif // __CDIST_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/cdist/info.h b/src/infiniop/ops/cdist/info.h new file mode 100644 index 000000000..4425b1cb0 --- /dev/null +++ b/src/infiniop/ops/cdist/info.h @@ -0,0 +1,114 @@ +#ifndef __CDIST_INFO_H__ +#define __CDIST_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +namespace op::cdist { + +/** + * 借用 BlasMatrix 的概念来描述 cdist 的输入输出矩阵 + * x1: (Batch, M, D) + * x2: (Batch, N, D) + * y: (Batch, M, N) + */ +struct CdistMatrix { + size_t ndim; + size_t batch; + ptrdiff_t stride; // Batch 之间的步长 + size_t rows; // M 或 N + size_t cols; // D (特征维度) 或结果中的 N + ptrdiff_t row_stride; + ptrdiff_t col_stride; + + static utils::Result create(infiniopTensorDescriptor_t layout) { + CdistMatrix ans; + auto ndim = layout->ndim(); + + if (ndim == 2) { + ans.ndim = 2; + ans.batch = 1; + ans.stride = 0; + ans.rows = layout->dim(0); + ans.cols = layout->dim(1); + ans.row_stride = layout->stride(0); + ans.col_stride = layout->stride(1); + } else if (ndim == 3) { + ans.ndim = 3; + ans.batch = layout->dim(0); + ans.stride = ans.batch == 1 ? 0 : layout->stride(0); + ans.rows = layout->dim(1); + ans.cols = layout->dim(2); + ans.row_stride = layout->stride(1); + ans.col_stride = layout->stride(2); + } else { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + return utils::Result(ans); + } + + bool match_batch(size_t _batch) const { + return batch == _batch || batch == 1; + } +}; + +class CdistInfo { + CdistInfo() = default; + +public: + CdistMatrix x1_matrix; + CdistMatrix x2_matrix; + CdistMatrix y_matrix; + + size_t m, n, d, batch; + + static utils::Result create( + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x1_desc, + infiniopTensorDescriptor_t x2_desc) { + + auto x1_res = CdistMatrix::create(x1_desc); + CHECK_RESULT(x1_res); + + auto x2_res = CdistMatrix::create(x2_desc); + CHECK_RESULT(x2_res); + + auto y_res = CdistMatrix::create(y_desc); + CHECK_RESULT(y_res); + + auto x1 = x1_res.take(); + auto x2 = x2_res.take(); + auto y = y_res.take(); + + // 1. 维度校验 + // x1(M, D), x2(N, D) -> y(M, N) + if (x1.cols != x2.cols) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; // 特征维度 D 必须一致 + } + if (y.rows != x1.rows || y.cols != x2.rows) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; // 输出形状必须为 M x N + } + + // 2. 
diff --git a/src/infiniop/ops/cdist/cpu/cdist_cpu.h b/src/infiniop/ops/cdist/cpu/cdist_cpu.h
new file mode 100644
index 000000000..07d182678
--- /dev/null
+++ b/src/infiniop/ops/cdist/cpu/cdist_cpu.h
@@ -0,0 +1,11 @@
+#ifndef __CDIST_CPU_H__
+#define __CDIST_CPU_H__
+
+#include "../cdist.h"
+
+// Uses the DESCRIPTOR macro defined in cdist.h.
+// It generates the Descriptor class in the namespace op::cdist::cpu;
+// the class holds the CdistInfo plus the create/calculate interface.
+DESCRIPTOR(cpu)
+
+#endif // __CDIST_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/cdist/info.h b/src/infiniop/ops/cdist/info.h
new file mode 100644
index 000000000..4425b1cb0
--- /dev/null
+++ b/src/infiniop/ops/cdist/info.h
@@ -0,0 +1,114 @@
+#ifndef __CDIST_INFO_H__
+#define __CDIST_INFO_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include <cstddef>
+
+namespace op::cdist {
+
+/**
+ * Borrows the BlasMatrix idea to describe the cdist inputs and output:
+ * x1: (Batch, M, D)
+ * x2: (Batch, N, D)
+ * y:  (Batch, M, N)
+ */
+struct CdistMatrix {
+    size_t ndim;
+    size_t batch;
+    ptrdiff_t stride;     // stride between batches
+    size_t rows;          // M or N
+    size_t cols;          // D (feature dim), or N in the result
+    ptrdiff_t row_stride;
+    ptrdiff_t col_stride;
+
+    static utils::Result<CdistMatrix> create(infiniopTensorDescriptor_t layout) {
+        CdistMatrix ans;
+        auto ndim = layout->ndim();
+
+        if (ndim == 2) {
+            ans.ndim = 2;
+            ans.batch = 1;
+            ans.stride = 0;
+            ans.rows = layout->dim(0);
+            ans.cols = layout->dim(1);
+            ans.row_stride = layout->stride(0);
+            ans.col_stride = layout->stride(1);
+        } else if (ndim == 3) {
+            ans.ndim = 3;
+            ans.batch = layout->dim(0);
+            ans.stride = ans.batch == 1 ? 0 : layout->stride(0);
+            ans.rows = layout->dim(1);
+            ans.cols = layout->dim(2);
+            ans.row_stride = layout->stride(1);
+            ans.col_stride = layout->stride(2);
+        } else {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        return utils::Result<CdistMatrix>(ans);
+    }
+
+    bool match_batch(size_t _batch) const {
+        return batch == _batch || batch == 1;
+    }
+};
+
+class CdistInfo {
+public:
+    CdistMatrix x1_matrix;
+    CdistMatrix x2_matrix;
+    CdistMatrix y_matrix;
+
+    size_t m, n, d, batch;
+
+    static utils::Result<CdistInfo> create(
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x1_desc,
+        infiniopTensorDescriptor_t x2_desc) {
+
+        auto x1_res = CdistMatrix::create(x1_desc);
+        CHECK_RESULT(x1_res);
+
+        auto x2_res = CdistMatrix::create(x2_desc);
+        CHECK_RESULT(x2_res);
+
+        auto y_res = CdistMatrix::create(y_desc);
+        CHECK_RESULT(y_res);
+
+        auto x1 = x1_res.take();
+        auto x2 = x2_res.take();
+        auto y = y_res.take();
+
+        // 1. Shape checks
+        // x1 (M, D), x2 (N, D) -> y (M, N)
+        if (x1.cols != x2.cols) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE; // feature dims D must match
+        }
+        if (y.rows != x1.rows || y.cols != x2.rows) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE; // output must be M x N
+        }
+
+        // 2. Batch check
+        size_t batch_size = y.batch;
+        if (!x1.match_batch(batch_size) || !x2.match_batch(batch_size)) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        size_t m = x1.rows;
+        size_t n = x2.rows;
+        size_t d = x1.cols;
+
+        return utils::Result<CdistInfo>(CdistInfo{
+            x1, x2, y,
+            m, n, d, batch_size});
+    }
+};
+
+} // namespace op::cdist
+
+#endif // __CDIST_INFO_H__
\ No newline at end of file
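CdistMatrix::create accepts 2-D and 3-D layouts, and match_batch lets a singleton batch broadcast against the output's batch (its stride is forced to 0 in that case). Concretely, the validation admits combinations such as:

    x1 (32, 128)       x2 (64, 128)       y (32, 64)       // plain 2-D
    x1 (4, 32, 128)    x2 (4, 64, 128)    y (4, 32, 64)    // batched
    x1 (4, 32, 128)    x2 (1, 64, 128)    y (4, 32, 64)    // x2 broadcast over batch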
diff --git a/src/infiniop/ops/cdist/metax/cdist_metax.h b/src/infiniop/ops/cdist/metax/cdist_metax.h
new file mode 100644
index 000000000..59d427b8f
--- /dev/null
+++ b/src/infiniop/ops/cdist/metax/cdist_metax.h
@@ -0,0 +1,16 @@
+#ifndef __CDIST_METAX_CUH__
+#define __CDIST_METAX_CUH__
+
+#include "../cdist.h"
+
+/**
+ * Uses the DESCRIPTOR macro defined in cdist.h.
+ * It generates the Descriptor class for METAX devices in the
+ * namespace op::cdist::metax.
+ *
+ * In a full METAX implementation the Opaque struct would typically hold:
+ * - a BLAS handle (the METAX analogue of cuBLAS) for the p = 2.0 GEMM fast path,
+ * - the stream the task runs on,
+ * - launch-configuration parameters for custom kernels.
+ */
+DESCRIPTOR(metax)
+
+#endif // __CDIST_METAX_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/cdist/metax/cdist_metax.maca b/src/infiniop/ops/cdist/metax/cdist_metax.maca
new file mode 100644
index 000000000..104a06a01
--- /dev/null
+++ b/src/infiniop/ops/cdist/metax/cdist_metax.maca
@@ -0,0 +1,168 @@
+#include <cmath>
+#include "../../../devices/metax/metax_handle.h"
+#include "cdist_metax.h"
+namespace op::cdist::metax {
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x1_desc,
+    infiniopTensorDescriptor_t x2_desc,
+    double p) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = y_desc->dtype();
+
+    // The METAX backend currently supports F32 only (the tests use F32 as well)
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);
+
+    auto result = CdistInfo::create(y_desc, x1_desc, x2_desc);
+    CHECK_RESULT(result);
+
+    // The current implementation uses no workspace
+    *desc_ptr = new Descriptor(
+        dtype, result.take(), p, 0,
+        nullptr,
+        handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// --- Kernel 1: L2 epilogue ---
+// Kept as a placeholder; the GEMM fast path is not wired up yet
+template <typename T>
+__global__ void cdist_l2_epilogue_kernel(T *y, const T *x1_norm, const T *x2_norm,
+                                         int M, int N, int batch_stride_y) {
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * blockDim.y + threadIdx.y;
+    int b = blockIdx.z;
+
+    if (i < M && j < N) {
+        int idx = b * batch_stride_y + i * N + j;
+        // GEMM has already written -2 * x1 * x2^T into y
+        float val = (float)x1_norm[b * M + i] + (float)x2_norm[b * N + j] + (float)y[idx];
+        y[idx] = (T)sqrtf(fmaxf(val, 0.0f));
+    }
+}
+
+// --- Kernel 2: generic p-norm (F32, arbitrary strides) ---
+__global__ void cdist_generic_kernel_f32(
+    float *y,
+    const float *x1,
+    const float *x2,
+    size_t m,
+    size_t n,
+    size_t d,
+    ptrdiff_t x1_stride,
+    ptrdiff_t x1_row_stride,
+    ptrdiff_t x1_col_stride,
+    ptrdiff_t x2_stride,
+    ptrdiff_t x2_row_stride,
+    ptrdiff_t x2_col_stride,
+    ptrdiff_t y_stride,
+    ptrdiff_t y_row_stride,
+    ptrdiff_t y_col_stride,
+    double p) {
+
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * blockDim.y + threadIdx.y;
+    int b = blockIdx.z;
+
+    if (i >= (int)m || j >= (int)n) {
+        return;
+    }
+
+    // Locate the output element y[b, i, j]
+    float *y_ptr = y
+                 + b * y_stride
+                 + i * y_row_stride
+                 + j * y_col_stride;
+
+    // Locate the vectors x1[b, i, :] and x2[b, j, :]
+    const float *x1_vec = x1
+                        + b * x1_stride
+                        + i * x1_row_stride;
+    const float *x2_vec = x2
+                        + b * x2_stride
+                        + j * x2_row_stride;
+
+    double dist = 0.0;
+
+    for (size_t k = 0; k < d; ++k) {
+        float v1 = *(x1_vec + k * x1_col_stride);
+        float v2 = *(x2_vec + k * x2_col_stride);
+        float diff = fabsf(v1 - v2);
+
+        if (p == 1.0) {
+            dist += diff;
+        } else if (p == 2.0) {
+            dist += diff * diff;
+        } else if (isinf(p)) {
+            dist = fmaxf((float)dist, diff);
+        } else {
+            dist += powf((float)diff, (float)p);
+        }
+    }
+
+    if (p == 2.0) {
+        dist = sqrtf((float)dist);
+    } else if (!isinf(p) && p != 1.0) {
+        dist = powf((float)dist, 1.0f / (float)p);
+    }
+
+    *y_ptr = (float)dist;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x1,
+    const void *x2,
+    void *stream) const {
+
+    (void)workspace;
+    (void)workspace_size;
+
+    if (_dtype != INFINI_DTYPE_F32) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    mcStream_t custream = (mcStream_t)stream;
+    dim3 block(16, 16);
+    dim3 grid(
+        static_cast<unsigned int>((_info.n + block.x - 1) / block.x),
+        static_cast<unsigned int>((_info.m + block.y - 1) / block.y),
+        static_cast<unsigned int>(_info.batch));
+
+    cdist_generic_kernel_f32<<<grid, block, 0, custream>>>(
+        static_cast<float *>(y),
+        static_cast<const float *>(x1),
+        static_cast<const float *>(x2),
+        _info.m,
+        _info.n,
+        _info.d,
+        _info.x1_matrix.stride,
+        _info.x1_matrix.row_stride,
+        _info.x1_matrix.col_stride,
+        _info.x2_matrix.stride,
+        _info.x2_matrix.row_stride,
+        _info.x2_matrix.col_stride,
+        _info.y_matrix.stride,
+        _info.y_matrix.row_stride,
+        _info.y_matrix.col_stride,
+        _p);
+
+    auto err = mcGetLastError();
+    if (err != mcSuccess) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::cdist::metax
\ No newline at end of file
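The L2 epilogue kernel kept above (unused for now) encodes the decomposition that makes the p = 2 case GEMM-friendly:

$$ \|u-v\|_2^2 = \|u\|_2^2 + \|v\|_2^2 - 2\,u\cdot v $$

i.e. a batched GEMM writes $-2\,x_1 x_2^\top$ into y, and the epilogue adds the row-wise squared norms and takes a clamped square root; the fmaxf(val, 0.0f) guards against tiny negative values produced by floating-point cancellation.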
diff --git a/src/infiniop/ops/cdist/moore/cdist_moore.h b/src/infiniop/ops/cdist/moore/cdist_moore.h
new file mode 100644
index 000000000..d84e2ac60
--- /dev/null
+++ b/src/infiniop/ops/cdist/moore/cdist_moore.h
@@ -0,0 +1,16 @@
+#ifndef __CDIST_MOORE_H__
+#define __CDIST_MOORE_H__
+
+#include "../cdist.h"
+
+/**
+ * Uses the DESCRIPTOR macro defined in cdist.h.
+ * It generates the Descriptor class for Moore devices in the
+ * namespace op::cdist::moore.
+ *
+ * In a full Moore implementation the Opaque struct would typically hold:
+ * - mublasHandle_t: for the p = 2.0 GEMM fast path (Moore's cuBLAS counterpart),
+ * - musaStream_t: the stream the task runs on,
+ * - launch-configuration parameters for custom kernels.
+ */
+DESCRIPTOR(moore)
+
+#endif // __CDIST_MOORE_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/cdist/moore/cdist_moore.mu b/src/infiniop/ops/cdist/moore/cdist_moore.mu
new file mode 100644
index 000000000..9149cfb46
--- /dev/null
+++ b/src/infiniop/ops/cdist/moore/cdist_moore.mu
@@ -0,0 +1,145 @@
+#include <musa_runtime.h>
+#include "../../../devices/moore/moore_handle.h"
+#include "cdist_moore.h"
+#include <cmath>
+
+namespace op::cdist::moore {
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x1_desc,
+    infiniopTensorDescriptor_t x2_desc,
+    double p) {
+
+    // 1. Convert to the Moore handle
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = y_desc->dtype();
+
+    // Matches the NVIDIA version: only F32 for now
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);
+
+    auto result = CdistInfo::create(y_desc, x1_desc, x2_desc);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        dtype, result.take(), p, 0,
+        nullptr,
+        handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// --- Kernel: generic p-norm (MUSA F32 implementation) ---
+__global__ void cdist_generic_kernel_f32(
+    float *y,
+    const float *x1,
+    const float *x2,
+    size_t m,
+    size_t n,
+    size_t d,
+    ptrdiff_t x1_stride,
+    ptrdiff_t x1_row_stride,
+    ptrdiff_t x1_col_stride,
+    ptrdiff_t x2_stride,
+    ptrdiff_t x2_row_stride,
+    ptrdiff_t x2_col_stride,
+    ptrdiff_t y_stride,
+    ptrdiff_t y_row_stride,
+    ptrdiff_t y_col_stride,
+    double p) {
+
+    // 2. MUSA supports 3-D thread indexing just like CUDA
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * blockDim.y + threadIdx.y;
+    int b = blockIdx.z;
+
+    if (i >= (int)m || j >= (int)n) {
+        return;
+    }
+
+    // Locate the output element y[b, i, j]
+    float *y_ptr = y + b * y_stride + i * y_row_stride + j * y_col_stride;
+
+    // Locate the input vectors
+    const float *x1_vec = x1 + b * x1_stride + i * x1_row_stride;
+    const float *x2_vec = x2 + b * x2_stride + j * x2_row_stride;
+
+    double dist = 0.0;
+
+    for (size_t k = 0; k < d; ++k) {
+        float v1 = *(x1_vec + k * x1_col_stride);
+        float v2 = *(x2_vec + k * x2_col_stride);
+        float diff = fabsf(v1 - v2);
+
+        if (p == 1.0) {
+            dist += (double)diff;
+        } else if (p == 2.0) {
+            dist += (double)diff * diff;
+        } else if (isinf(p)) {
+            dist = fmaxf((float)dist, diff);
+        } else {
+            dist += powf(diff, (float)p);
+        }
+    }
+
+    if (p == 2.0) {
+        dist = sqrtf((float)dist);
+    } else if (!isinf(p) && p != 1.0) {
+        dist = powf((float)dist, 1.0f / (float)p);
+    }
+
+    *y_ptr = (float)dist;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x1,
+    const void *x2,
+    void *stream) const {
+
+    (void)workspace;
+    (void)workspace_size;
+
+    if (_dtype != INFINI_DTYPE_F32) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    // 3. Switch to musaStream_t
+    musaStream_t mustream = reinterpret_cast<musaStream_t>(stream);
+
+    // Keep the 16x16 block size; it is a good generic choice on MUSA too
+    dim3 block(16, 16);
+    dim3 grid(
+        static_cast<unsigned int>((_info.n + block.x - 1) / block.x),
+        static_cast<unsigned int>((_info.m + block.y - 1) / block.y),
+        static_cast<unsigned int>(_info.batch));
+
+    cdist_generic_kernel_f32<<<grid, block, 0, mustream>>>(
+        static_cast<float *>(y),
+        static_cast<const float *>(x1),
+        static_cast<const float *>(x2),
+        _info.m,
+        _info.n,
+        _info.d,
+        _info.x1_matrix.stride,
+        _info.x1_matrix.row_stride,
+        _info.x1_matrix.col_stride,
+        _info.x2_matrix.stride,
+        _info.x2_matrix.row_stride,
+        _info.x2_matrix.col_stride,
+        _info.y_matrix.stride,
+        _info.y_matrix.row_stride,
+        _info.y_matrix.col_stride,
+        _p);
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::cdist::moore
\ No newline at end of file
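Unlike the METAX and NVIDIA versions, this file returns success without polling the runtime for a launch error. If parity is wanted, the MUSA runtime mirrors the CUDA call; a sketch, assuming the CUDA-compatible MUSA API:

    auto err = musaGetLastError();
    if (err != musaSuccess) {
        return INFINI_STATUS_INTERNAL_ERROR;
    }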
diff --git a/src/infiniop/ops/cdist/nvidia/cdist_nvidia.cu b/src/infiniop/ops/cdist/nvidia/cdist_nvidia.cu
new file mode 100644
index 000000000..4d8e5726e
--- /dev/null
+++ b/src/infiniop/ops/cdist/nvidia/cdist_nvidia.cu
@@ -0,0 +1,168 @@
+#include <cmath>
+#include "../../../devices/nvidia/nvidia_handle.cuh"
+#include "cdist_nvidia.cuh"
+namespace op::cdist::nvidia {
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x1_desc,
+    infiniopTensorDescriptor_t x2_desc,
+    double p) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = y_desc->dtype();
+
+    // The NVIDIA backend currently supports F32 only (the tests use F32 as well)
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);
+
+    auto result = CdistInfo::create(y_desc, x1_desc, x2_desc);
+    CHECK_RESULT(result);
+
+    // The current implementation uses no workspace
+    *desc_ptr = new Descriptor(
+        dtype, result.take(), p, 0,
+        nullptr,
+        handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// --- Kernel 1: L2 epilogue ---
+// Kept as a placeholder; the GEMM fast path is not wired up yet
+template <typename T>
+__global__ void cdist_l2_epilogue_kernel(T *y, const T *x1_norm, const T *x2_norm,
+                                         int M, int N, int batch_stride_y) {
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * blockDim.y + threadIdx.y;
+    int b = blockIdx.z;
+
+    if (i < M && j < N) {
+        int idx = b * batch_stride_y + i * N + j;
+        // GEMM has already written -2 * x1 * x2^T into y
+        float val = (float)x1_norm[b * M + i] + (float)x2_norm[b * N + j] + (float)y[idx];
+        y[idx] = (T)sqrtf(fmaxf(val, 0.0f));
+    }
+}
+
+// --- Kernel 2: generic p-norm (F32, arbitrary strides) ---
+__global__ void cdist_generic_kernel_f32(
+    float *y,
+    const float *x1,
+    const float *x2,
+    size_t m,
+    size_t n,
+    size_t d,
+    ptrdiff_t x1_stride,
+    ptrdiff_t x1_row_stride,
+    ptrdiff_t x1_col_stride,
+    ptrdiff_t x2_stride,
+    ptrdiff_t x2_row_stride,
+    ptrdiff_t x2_col_stride,
+    ptrdiff_t y_stride,
+    ptrdiff_t y_row_stride,
+    ptrdiff_t y_col_stride,
+    double p) {
+
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * blockDim.y + threadIdx.y;
+    int b = blockIdx.z;
+
+    if (i >= (int)m || j >= (int)n) {
+        return;
+    }
+
+    // Locate the output element y[b, i, j]
+    float *y_ptr = y
+                 + b * y_stride
+                 + i * y_row_stride
+                 + j * y_col_stride;
+
+    // Locate the vectors x1[b, i, :] and x2[b, j, :]
+    const float *x1_vec = x1
+                        + b * x1_stride
+                        + i * x1_row_stride;
+    const float *x2_vec = x2
+                        + b * x2_stride
+                        + j * x2_row_stride;
+
+    double dist = 0.0;
+
+    for (size_t k = 0; k < d; ++k) {
+        float v1 = *(x1_vec + k * x1_col_stride);
+        float v2 = *(x2_vec + k * x2_col_stride);
+        float diff = fabsf(v1 - v2);
+
+        if (p == 1.0) {
+            dist += diff;
+        } else if (p == 2.0) {
+            dist += diff * diff;
+        } else if (isinf(p)) {
+            dist = fmaxf((float)dist, diff);
+        } else {
+            dist += powf((float)diff, (float)p);
+        }
+    }
+
+    if (p == 2.0) {
+        dist = sqrtf((float)dist);
+    } else if (!isinf(p) && p != 1.0) {
+        dist = powf((float)dist, 1.0f / (float)p);
+    }
+
+    *y_ptr = (float)dist;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x1,
+    const void *x2,
+    void *stream) const {
+
+    (void)workspace;
+    (void)workspace_size;
+
+    if (_dtype != INFINI_DTYPE_F32) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    cudaStream_t custream = (cudaStream_t)stream;
+    dim3 block(16, 16);
+    dim3 grid(
+        static_cast<unsigned int>((_info.n + block.x - 1) / block.x),
+        static_cast<unsigned int>((_info.m + block.y - 1) / block.y),
+        static_cast<unsigned int>(_info.batch));
+
+    cdist_generic_kernel_f32<<<grid, block, 0, custream>>>(
+        static_cast<float *>(y),
+        static_cast<const float *>(x1),
+        static_cast<const float *>(x2),
+        _info.m,
+        _info.n,
+        _info.d,
+        _info.x1_matrix.stride,
+        _info.x1_matrix.row_stride,
+        _info.x1_matrix.col_stride,
+        _info.x2_matrix.stride,
+        _info.x2_matrix.row_stride,
+        _info.x2_matrix.col_stride,
+        _info.y_matrix.stride,
+        _info.y_matrix.row_stride,
+        _info.y_matrix.col_stride,
+        _p);
+
+    auto err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::cdist::nvidia
\ No newline at end of file
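All three GPU backends use the same launch geometry: one thread per output element, 16x16 tiles over (N, M), and the batch in grid.z:

$$ \text{grid} = \big(\lceil N/16\rceil,\ \lceil M/16\rceil,\ B\big) $$

so, for example, M = 32, N = 64, B = 4 launches a (4, 2, 4) grid of 256-thread blocks, and the in-kernel bounds check discards the ragged edge.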
diff --git a/src/infiniop/ops/cdist/nvidia/cdist_nvidia.cuh b/src/infiniop/ops/cdist/nvidia/cdist_nvidia.cuh
new file mode 100644
index 000000000..02dfc00e2
--- /dev/null
+++ b/src/infiniop/ops/cdist/nvidia/cdist_nvidia.cuh
@@ -0,0 +1,16 @@
+#ifndef __CDIST_NVIDIA_CUH__
+#define __CDIST_NVIDIA_CUH__
+
+#include "../cdist.h"
+
+/**
+ * Uses the DESCRIPTOR macro defined in cdist.h.
+ * It generates the Descriptor class for NVIDIA devices in the
+ * namespace op::cdist::nvidia.
+ *
+ * In a full NVIDIA implementation the Opaque struct would typically hold:
+ * - cublasHandle_t: for the p = 2.0 GEMM fast path,
+ * - cudaStream_t: the stream the task runs on,
+ * - launch-configuration parameters for custom kernels.
+ */
+DESCRIPTOR(nvidia)
+
+#endif // __CDIST_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/cdist/operator.cc b/src/infiniop/ops/cdist/operator.cc
new file mode 100644
index 000000000..798426085
--- /dev/null
+++ b/src/infiniop/ops/cdist/operator.cc
@@ -0,0 +1,229 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/cdist.h"
+
+// Pull in the Descriptor definition of each enabled hardware backend
+#ifdef ENABLE_CPU_API
+#include "cpu/cdist_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
+#include "nvidia/cdist_nvidia.cuh"
+#endif
+#ifdef ENABLE_CAMBRICON_API
+#include "bang/cdist_bang.h"
+#endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/cdist_ascend.h"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/cdist_metax.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/cdist_moore.h"
+#endif
+#ifdef ENABLE_KUNLUN_API
+#include "kunlun/cdist_kunlun.h"
+#endif
+
+// -----------------------------------------------------------------------------
+// 1. Create the descriptor
+// -----------------------------------------------------------------------------
+__C infiniStatus_t infiniopCreateCdistDescriptor(
+    infiniopHandle_t handle,
+    infiniopCdistDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x1_desc,
+    infiniopTensorDescriptor_t x2_desc,
+    double p) {
+
+#define CREATE(CASE, NAMESPACE)                                                  \
+    case CASE:                                                                   \
+        return op::cdist::NAMESPACE::Descriptor::create(                         \
+            handle,                                                              \
+            reinterpret_cast<op::cdist::NAMESPACE::Descriptor **>(desc_ptr),     \
+            y_desc, x1_desc, x2_desc, p)
+
+    switch (handle->device) {
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        CREATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CREATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        CREATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CREATE
+}
+
+// -----------------------------------------------------------------------------
+// 2. Query the workspace size
+// -----------------------------------------------------------------------------
+__C infiniStatus_t infiniopGetCdistWorkspaceSize(
+    infiniopCdistDescriptor_t desc,
+    size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                      \
+    case CASE:                                                                    \
+        *size = reinterpret_cast<op::cdist::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        GET(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        GET(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        GET(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        GET(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+}
+
+// -----------------------------------------------------------------------------
+// 3. Run the computation (pairwise distances)
+// -----------------------------------------------------------------------------
+__C infiniStatus_t infiniopCdist(
+    infiniopCdistDescriptor_t desc,
+    void *workspace, size_t workspace_size,
+    void *y,
+    const void *x1,
+    const void *x2,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                \
+    case CASE:                                                                    \
+        return reinterpret_cast<op::cdist::NAMESPACE::Descriptor *>(desc)         \
+            ->calculate(workspace, workspace_size, y, x1, x2, stream)
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CALCULATE
+}
+
+// -----------------------------------------------------------------------------
+// 4. Destroy the descriptor
+// -----------------------------------------------------------------------------
+__C infiniStatus_t infiniopDestroyCdistDescriptor(infiniopCdistDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                   \
+    case CASE:                                                                    \
+        delete reinterpret_cast<op::cdist::NAMESPACE::Descriptor *>(desc);        \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        DELETE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        DELETE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        DELETE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef DELETE
+}
\ No newline at end of file
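Taken together, the four entry points give the usual descriptor lifecycle. A minimal host-side sketch (the handle, tensor descriptors, device buffers, and the CHECK status macro are assumed to exist; none of those helpers are part of this diff):

    infiniopCdistDescriptor_t desc = NULL;
    CHECK(infiniopCreateCdistDescriptor(handle, &desc, y_desc, x1_desc, x2_desc, /*p=*/2.0));

    size_t workspace_size = 0;
    CHECK(infiniopGetCdistWorkspaceSize(desc, &workspace_size));
    void *workspace = alloc_device(workspace_size); // hypothetical allocator

    CHECK(infiniopCdist(desc, workspace, workspace_size, y_data, x1_data, x2_data, /*stream=*/NULL));
    CHECK(infiniopDestroyCdistDescriptor(desc));

This is the same sequence the Python tests below drive through ctypes.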
diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc
new file mode 100644
index 000000000..28750c10f
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc
@@ -0,0 +1,54 @@
+#include "reciprocal_cpu.h"
+
+namespace op::reciprocal::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &y_desc = out_desc;
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = y_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    // Reciprocal typically only supports floating point types
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<ReciprocalOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<ReciprocalOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<ReciprocalOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<ReciprocalOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::reciprocal::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h
new file mode 100644
index 000000000..94b6dd25d
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h
@@ -0,0 +1,19 @@
+#ifndef __RECIPROCAL_CPU_H__
+#define __RECIPROCAL_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(reciprocal, cpu)
+
+namespace op::reciprocal::cpu {
+typedef struct ReciprocalOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    T operator()(const T &x) const {
+        return static_cast<T>(1) / x;
+    }
+} ReciprocalOp;
+} // namespace op::reciprocal::cpu
+
+#endif // __RECIPROCAL_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/cuda/kernel.cuh b/src/infiniop/ops/reciprocal/cuda/kernel.cuh
new file mode 100644
index 000000000..030dcb17e
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/cuda/kernel.cuh
@@ -0,0 +1,27 @@
+#ifndef __RECIPROCAL_CUDA_H__
+#define __RECIPROCAL_CUDA_H__
+
+namespace op::reciprocal::cuda {
+typedef struct ReciprocalOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2rcp(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hrcp(x);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            // bfloat16 does not have a direct hrcp intrinsic in some versions,
+            // often handled by converting to float or using specific bf16 intrinsics
+            return __float2bfloat16(1.0f / __bfloat162float(x));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __frcp_rd(x);
+        } else {
+            return static_cast<T>(1) / x;
+        }
+    }
+} ReciprocalOp;
+} // namespace op::reciprocal::cuda
+
+#endif // __RECIPROCAL_CUDA_H__
\ No newline at end of file
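One detail worth flagging in the functor above: __frcp_rd is the single-precision reciprocal intrinsic with round-toward-minus-infinity, whereas the generic static_cast<T>(1) / x fallback rounds to nearest. If bit-exact agreement with 1.0f / x is wanted, the round-to-nearest intrinsic is the closer match; a minimal alternative for the float branch:

    } else if constexpr (std::is_same_v<T, float>) {
        // __frcp_rn: reciprocal rounded to nearest-even, matching 1.0f / x
        return __frcp_rn(x);
    }

Which rounding mode is appropriate depends on the tolerances the tests enforce.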
diff --git a/src/infiniop/ops/reciprocal/metax/reciprocal_metax.h b/src/infiniop/ops/reciprocal/metax/reciprocal_metax.h
new file mode 100644
index 000000000..39a0c541a
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/metax/reciprocal_metax.h
@@ -0,0 +1,8 @@
+#ifndef __RECIPROCAL_METAX_API_H__
+#define __RECIPROCAL_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(reciprocal, metax)
+
+#endif // __RECIPROCAL_METAX_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/metax/reciprocal_metax.maca b/src/infiniop/ops/reciprocal/metax/reciprocal_metax.maca
new file mode 100644
index 000000000..5ae1de260
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/metax/reciprocal_metax.maca
@@ -0,0 +1,61 @@
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "reciprocal_metax.h"
+#include "reciprocal_metax_kernel.h"
+
+namespace op::reciprocal::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &y_desc = out_desc;
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = y_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    // Reciprocal typically only supports floating point types
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create METAX elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, metax::ReciprocalOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, metax::ReciprocalOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, metax::ReciprocalOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, metax::ReciprocalOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::reciprocal::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/metax/reciprocal_metax_kernel.h b/src/infiniop/ops/reciprocal/metax/reciprocal_metax_kernel.h
new file mode 100644
index 000000000..9c5cb706c
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/metax/reciprocal_metax_kernel.h
@@ -0,0 +1,47 @@
+#ifndef __RECIPROCAL_METAX_KERNEL_H__
+#define __RECIPROCAL_METAX_KERNEL_H__
+
+/*
+ * This file contains the Reciprocal functor for the METAX (MACA) backend.
+ *
+ * It mirrors the CUDA and MUSA versions so the operator behaves identically
+ * across the supported hardware platforms.
+ */
+namespace op::reciprocal::metax {
+
+typedef struct ReciprocalOp {
+public:
+    // Unary operator: one input
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            // Use a native half2 reciprocal where the hardware provides one,
+            // otherwise convert to float2 and divide
+            float2 f2 = __half22float2(a);
+            f2.x = 1.0f / f2.x;
+            f2.y = 1.0f / f2.y;
+            return __float22half2_rn(f2);
+        } else if constexpr (std::is_same_v<T, half>) {
+            // Promote to float for numerical stability
+            return __float2half(1.0f / __half2float(a));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            // BF16 is best handled through float on this backend
+            float a_f = __bfloat162float(a);
+            return __float2bfloat16_rn(1.0f / a_f);
+        } else if constexpr (std::is_same_v<T, float>) {
+            // The compiler typically lowers 1.0f / a to the hardware rcp
+            // instruction (round to nearest)
+            return 1.0f / a;
+        } else if constexpr (std::is_same_v<T, double>) {
+            return 1.0 / a;
+        } else {
+            // Integer reciprocal truncates to 0 for |a| > 1; standard C++ behavior
+            return static_cast<T>(1) / a;
+        }
+    }
+} ReciprocalOp;
+
+} // namespace op::reciprocal::metax
+
+#endif // __RECIPROCAL_METAX_KERNEL_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/moore/reciprocal_moore.h b/src/infiniop/ops/reciprocal/moore/reciprocal_moore.h
new file mode 100644
index 000000000..f0cfe6f42
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/moore/reciprocal_moore.h
@@ -0,0 +1,11 @@
+#ifndef __RECIPROCAL_MOORE_API_H__
+#define __RECIPROCAL_MOORE_API_H__
+
+// 1. Switch to the Moore platform's elementwise API definitions
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+
+// 2. Generate op::reciprocal::moore::Descriptor via the macro;
+//    the expansion declares the standard create/calculate interface
+ELEMENTWISE_DESCRIPTOR(reciprocal, moore)
+
+#endif // __RECIPROCAL_MOORE_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/moore/reciprocal_moore.mu b/src/infiniop/ops/reciprocal/moore/reciprocal_moore.mu
new file mode 100644
index 000000000..063ec2740
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/moore/reciprocal_moore.mu
@@ -0,0 +1,66 @@
+#include "../../../elementwise/moore/elementwise_moore.h"
+
+#include "reciprocal_moore_kernel.h"
+#include "reciprocal_moore.h"
+
+namespace op::reciprocal::moore {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    // 1. Resolve the Moore (MUSA) handle
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &y_desc = out_desc;
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = y_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    // 2. Dtype check: reciprocal runs on floating-point types on Moore as well
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // 3. Use the Moore elementwise descriptor-creation macro; it takes care of
+    //    initializing the operator metadata for the MUSA backend
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // 4. Dispatch to the Moore specialization
+    //    Note: cuda::ReciprocalOp is replaced by moore::ReciprocalOp
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, moore::ReciprocalOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        // Make sure the Moore bfloat16 type definition is used
+        return _device_info->calculate<256, moore::ReciprocalOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, moore::ReciprocalOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, moore::ReciprocalOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::reciprocal::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/moore/reciprocal_moore_kernel.h b/src/infiniop/ops/reciprocal/moore/reciprocal_moore_kernel.h
new file mode 100644
index 000000000..74d162671
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/moore/reciprocal_moore_kernel.h
@@ -0,0 +1,47 @@
+#ifndef __RECIPROCAL_MOORE_KERNEL_H__
+#define __RECIPROCAL_MOORE_KERNEL_H__
+
+/*
+ * This file contains the Reciprocal operation implementation for the MUSA backend.
+ *
+ * It follows the consistent code structure to ensure alignment across different
+ * hardware platforms within the Moore Threads (MUSA) ecosystem.
+ */
+namespace op::reciprocal::moore {
+
+typedef struct ReciprocalOp {
+public:
+    // Unary operator: one input
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            // Use MUSA's half2 reciprocal where the hardware provides one,
+            // otherwise convert to float2 and divide
+            float2 f2 = __half22float2(a);
+            f2.x = 1.0f / f2.x;
+            f2.y = 1.0f / f2.y;
+            return __float22half2_rn(f2);
+        } else if constexpr (std::is_same_v<T, half>) {
+            // Promote to float for numerical stability
+            return __float2half(1.0f / __half2float(a));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            // BF16 is best handled through float on MUSA
+            float a_f = __bfloat162float(a);
+            return __float2bfloat16_rn(1.0f / a_f);
+        } else if constexpr (std::is_same_v<T, float>) {
+            // The compiler typically lowers 1.0f / a to the hardware rcp
+            // instruction (round to nearest)
+            return 1.0f / a;
+        } else if constexpr (std::is_same_v<T, double>) {
+            return 1.0 / a;
+        } else {
+            // Integer reciprocal truncates to 0 for |a| > 1; standard C++ behavior
+            return static_cast<T>(1) / a;
+        }
+    }
+} ReciprocalOp;
+
+} // namespace op::reciprocal::moore
+
+#endif // __RECIPROCAL_MOORE_KERNEL_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu
new file mode 100644
index 000000000..523259bad
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu
@@ -0,0 +1,61 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "reciprocal_nvidia.cuh"
+
+namespace op::reciprocal::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &y_desc = out_desc;
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = y_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    // Reciprocal typically only supports floating point types
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ReciprocalOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ReciprocalOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ReciprocalOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::ReciprocalOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::reciprocal::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh
new file mode 100644
index 000000000..68f640767
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __RECIPROCAL_CUDA_API_H__
+#define __RECIPROCAL_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(reciprocal, nvidia)
+
+#endif // __RECIPROCAL_CUDA_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc
new file mode 100644
index 000000000..37c38bc43
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/operator.cc
@@ -0,0 +1,199 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/reciprocal.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/reciprocal_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/reciprocal_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/reciprocal_metax.h"
+#endif
+#ifdef ENABLE_KUNLUN_API
+#include "kunlun/reciprocal_kunlun.h"
+#endif
+#ifdef ENABLE_CAMBRICON_API
+#include "bang/reciprocal_bang.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/reciprocal_moore.h"
+#endif
+
+__C infiniStatus_t infiniopCreateReciprocalDescriptor(
+    infiniopHandle_t handle,
+    infiniopReciprocalDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                                   \
+    case CASE:                                                                    \
+        return op::reciprocal::NAMESPACE::Descriptor::create(                     \
+            handle,                                                               \
+            reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor **>(desc_ptr), \
+            y_desc,                                                               \
+            {x_desc})
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CREATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                      \
+    case CASE:                                                                    \
+        *size = reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        GET(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        GET(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+}
+
+__C infiniStatus_t infiniopReciprocal(
+    infiniopReciprocalDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                \
+    case CASE:                                                                    \
+        return reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor *>(desc)    \
+            ->calculate(workspace, workspace_size, y, {x}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C infiniStatus_t
+infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                   \
+    case CASE:                                                                    \
+        delete reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor *>(desc);   \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        DELETE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
\ No newline at end of file
diff --git a/test/infinicore/ops/addcmul.py b/test/infinicore/ops/addcmul.py
index 143154f84..5dfb6a834 100644
--- a/test/infinicore/ops/addcmul.py
+++ b/test/infinicore/ops/addcmul.py
@@ -119,9 +119,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.addcmul(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.addcmul(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore addcmul implementation."""
+        return infinicore.addcmul(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/atanh.py b/test/infinicore/ops/atanh.py
index 4f8cab954..fec2027ce 100644
--- a/test/infinicore/ops/atanh.py
+++ b/test/infinicore/ops/atanh.py
@@ -97,9 +97,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.atanh(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.atanh(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore atanh implementation."""
+        return infinicore.atanh(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/binary_cross_entropy_with_logits.py b/test/infinicore/ops/binary_cross_entropy_with_logits.py
index 986605a38..2e8273e47 100644
--- a/test/infinicore/ops/binary_cross_entropy_with_logits.py
+++ b/test/infinicore/ops/binary_cross_entropy_with_logits.py
@@ -80,9 +80,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.binary_cross_entropy_with_logits(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.binary_cross_entropy_with_logits(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore binary_cross_entropy_with_logits implementation."""
+        return infinicore.nn.functional.binary_cross_entropy_with_logits(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/cdist.py b/test/infinicore/ops/cdist.py
index 2d41668cc..f3827fd7a 100644
--- a/test/infinicore/ops/cdist.py
+++ b/test/infinicore/ops/cdist.py
@@ -63,9 +63,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.cdist(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.cdist(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore cdist implementation."""
+        return infinicore.cdist(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/reciprocal.py b/test/infinicore/ops/reciprocal.py
index 9d4ba0849..c4e6bfff8 100644
--- a/test/infinicore/ops/reciprocal.py
+++ b/test/infinicore/ops/reciprocal.py
@@ -89,9 +89,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.reciprocal(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.reciprocal(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore reciprocal implementation."""
+        return infinicore.reciprocal(*args, **kwargs)
 
 
 def main():
diff --git a/test/infiniop/addcmul.py b/test/infiniop/addcmul.py
new file mode 100644
index 000000000..474b5c3f5
--- /dev/null
+++ b/test/infiniop/addcmul.py
@@ -0,0 +1,163 @@
+import torch
+import ctypes
+from ctypes import c_uint64, c_float
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+# Configuration
+# ==============================================================================
+_TEST_CASES_ = [
+    # shape, input_stride, t1_stride, t2_stride
+    ((3, 3), None, None, None),
+    ((32, 512), None, None, None),
+    ((32, 512), (1024, 1), (1024, 1), (1024, 1)),
+    ((16, 32, 64), None, None, None),
+    ((8, 1, 1024), None, None, None),  # a potential broadcast-shape case
+]
+
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
+    InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
+    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
+}
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_INPUT = auto()
+
+_INPLACE = [Inplace.OUT_OF_PLACE, Inplace.INPLACE_INPUT]
+_VALUES = [1.0, 0.5, -2.0]  # exercise different value coefficients
+
+_TEST_CASES = [
+    test_case + (inplace_item, value)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+    for value in _VALUES
+]
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 100
+
+def test(
+    handle,
+    device,
+    shape,
+    input_stride=None,
+    t1_stride=None,
+    t2_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    value=1.0,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    print(
+        f"Testing Addcmul on {InfiniDeviceNames[device]} with shape:{shape} value:{value} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    # Prepare the input tensors
+    input_tensor = TestTensor(shape, input_stride, dtype, device)
+    t1 = TestTensor(shape, t1_stride, dtype, device)
+    t2 = TestTensor(shape, t2_stride, dtype, device)
+
+    # Compute the reference with PyTorch:
+    # out = input + value * t1 * t2
+    ans = torch.addcmul(input_tensor.torch_tensor(), t1.torch_tensor(), t2.torch_tensor(), value=value)
+
+    if inplace == Inplace.INPLACE_INPUT:
+        out = input_tensor
+    else:
+        out = TestTensor(shape, None, dtype, device)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    # Note: per the binding, the create call takes the scalar value
+    check_error(
+        LIBINFINIOP.infiniopCreateAddcmulDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            out.descriptor,
+            input_tensor.descriptor,
+            t1.descriptor,
+            t2.descriptor,
+            c_float(value),
+        )
+    )
+
+    # Destroy the temporary tensor descriptors so the kernel cannot
+    # reference them by mistake
+    for t in [input_tensor, t1, t2, out]:
+        t.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetAddcmulWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, input_tensor.device)
+
+    def lib_addcmul():
+        check_error(
+            LIBINFINIOP.infiniopAddcmul(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                out.data(),
+                input_tensor.data(),
+                t1.data(),
+                t2.data(),
+                None,
+            )
+        )
+
+    lib_addcmul()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    if PROFILE:
+        profile_operation("PyTorch", lambda: torch.addcmul(input_tensor.torch_tensor(), t1.torch_tensor(), t2.torch_tensor(), value=value), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_addcmul(), device, NUM_PRERUN, NUM_ITERATIONS)
+
+    check_error(LIBINFINIOP.infiniopDestroyAddcmulDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mAddcmul tests passed!\033[0m")
\ No newline at end of file
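The reference line above pins the semantics $\text{out} = \text{input} + v\,(t_1 \odot t_2)$, elementwise. As a quick sanity check of the sign conventions, with $v=-2$ and elements input = 1, t1 = 2, t2 = 3 the result is $1 - 2\cdot 6 = -11$, which is what the `-2.0` entry in `_VALUES` exercises.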
diff --git a/test/infiniop/atanh.py b/test/infiniop/atanh.py
new file mode 100644
index 000000000..32c8eaa2a
--- /dev/null
+++ b/test/infiniop/atanh.py
@@ -0,0 +1,171 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+# Configuration
+# ==============================================================================
+_TEST_CASES_ = [
+    # shape, a_stride, y_stride
+    ((13, 4), None, None),
+    ((13, 4), (10, 1), (10, 1)),
+    ((13, 4, 4), None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
+    ((16, 5632), None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_A = auto()
+
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_A,
+]
+
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# atanh typically supports floating point types
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+def atanh_torch(y, a):
+    torch.atanh(a, out=y)
+
+def test(
+    handle,
+    device,
+    shape,
+    a_stride=None,
+    y_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    # Initialize input tensor
+    a = TestTensor(shape, a_stride, dtype, device)
+
+    # Crucial: clamp values to (-1, 1) to avoid NaN/Inf for atanh
+    with torch.no_grad():
+        a.torch_tensor().clamp_(-0.99, 0.99)
+        # Keep underlying data in sync for all devices (including CPU)
+        a.actual_tensor().copy_(a.torch_tensor())
+
+    if inplace == Inplace.INPLACE_A:
+        if a_stride != y_stride:
+            return
+        y = a
+    else:
+        y = TestTensor(shape, y_stride, dtype, device, mode="ones")
+
+    if y.is_broadcast():
+        return
+
+    print(
+        f"Testing Atanh on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} y_stride:{y_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    # Reference calculation
+    atanh_torch(y.torch_tensor(), a.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    # Create descriptor
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateAtanhDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            a.descriptor,
+        )
+    )
+
+    # Invalidate descriptors to ensure kernel uses its own internal state
+    for tensor in [a, y]:
+        tensor.destroy_desc()
+
+    # Workspace management
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetAtanhWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_atanh():
+        check_error(
+            LIBINFINIOP.infiniopAtanh(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                y.data(),
+                a.data(),
+                None,
+            )
+        )
+
+    # Run library function
+    lib_atanh()
+
+    # Validate results
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
+
+    assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Profiling
+    if PROFILE:
+        profile_operation("PyTorch", lambda: atanh_torch(y.torch_tensor(), a.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_atanh(), device, NUM_PRERUN, NUM_ITERATIONS)
+
+    check_error(LIBINFINIOP.infiniopDestroyAtanhDescriptor(descriptor))
+
+if __name__ == "__main__":
+    args = get_args()
+
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mAtanh Test passed!\033[0m")
\ No newline at end of file
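The clamp to ±0.99 in the test exists because

$$ \operatorname{atanh}(x) = \tfrac{1}{2}\,\ln\!\frac{1+x}{1-x} $$

is defined only on $(-1, 1)$ and diverges at the endpoints, so unclamped random inputs would produce ±inf/NaN and poison the allclose comparison.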
diff --git a/test/infiniop/binary_cross_entropy_with_logits.py b/test/infiniop/binary_cross_entropy_with_logits.py
new file mode 100644
index 000000000..b09c95570
--- /dev/null
+++ b/test/infiniop/binary_cross_entropy_with_logits.py
@@ -0,0 +1,154 @@
+import torch
+import ctypes
+from ctypes import c_uint64, c_float, c_char_p
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+# ==============================================================================
+# Configuration
+# ==============================================================================
+# Test scenarios: (shape, has_weight, has_pos_weight, reduction)
+_TEST_CASES_DATA = [
+    ((4, 5), False, False, "none"),
+    ((8, 8), True, False, "sum"),
+    ((32, 512), False, True, "mean"),
+    ((16, 32, 64), True, True, "mean"),
+]
+
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 5e-2},
+    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
+}
+
+_REDUCTIONS = ["none", "mean", "sum"]
+
+# Final test-case list
+_TEST_CASES = _TEST_CASES_DATA
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 100
+
+def test(
+    handle,
+    device,
+    shape,
+    has_weight=False,
+    has_pos_weight=False,
+    reduction="none",
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    print(
+        f"Testing BCEWithLogits on {InfiniDeviceNames[device]} shape:{shape} "
+        f"weight:{has_weight} pos_weight:{has_pos_weight} reduction:{reduction} dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    # 1. Prepare the input tensors
+    input_tensor = TestTensor(shape, None, dtype, device)
+    target = TestTensor(shape, None, dtype, device)
+
+    weight = TestTensor(shape, None, dtype, device) if has_weight else None
+    # pos_weight broadcasts over the last dimension; shape (C,)
+    pos_weight_shape = (shape[-1],)
+    pos_weight = TestTensor(pos_weight_shape, None, dtype, device) if has_pos_weight else None
+
+    # 2. Compute the reference with PyTorch
+    torch_input = input_tensor.torch_tensor()
+    torch_target = target.torch_tensor()
+    torch_weight = weight.torch_tensor() if has_weight else None
+    torch_pos_weight = pos_weight.torch_tensor() if has_pos_weight else None
+
+    ans = torch.nn.functional.binary_cross_entropy_with_logits(
+        torch_input,
+        torch_target,
+        weight=torch_weight,
+        pos_weight=torch_pos_weight,
+        reduction=reduction,
+    )
+
+    # 3. Prepare the output tensor (shape depends on reduction)
+    out_shape = () if reduction != "none" else shape
+    out = TestTensor(out_shape, None, dtype, device)
+
+    if sync is not None:
+        sync()
+
+    # 4. Create the descriptor and run
+    descriptor = infiniopOperatorDescriptor_t()
+
+    # Call through the C interface
+    check_error(
+        LIBINFINIOP.infiniopCreateBCEWithLogitsDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            out.descriptor,
+            input_tensor.descriptor,
+            target.descriptor,
+            weight.descriptor if has_weight else None,
+            pos_weight.descriptor if has_pos_weight else None,
+            c_char_p(reduction.encode("utf-8")),  # reduction mode
+        )
+    )
+
+    # Destroy the temporary descriptors
+    for t in [input_tensor, target, out]:
+        t.destroy_desc()
+    if weight is not None:
+        weight.destroy_desc()
+    if pos_weight is not None:
+        pos_weight.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(LIBINFINIOP.infiniopGetBCEWithLogitsWorkspaceSize(descriptor, ctypes.byref(workspace_size)))
+    workspace = TestWorkspace(workspace_size.value, device)
+
+    def lib_op():
+        check_error(
+            LIBINFINIOP.infiniopBCEWithLogits(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                out.data(),
+                input_tensor.data(),
+                target.data(),
+                weight.data() if has_weight else None,
+                pos_weight.data() if has_pos_weight else None,
+                None,  # stream
+            )
+        )
+
+    lib_op()
+
+    if sync is not None:
+        sync()
+
+    # 5. Verify the result
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    if PROFILE:
+        profile_operation("PyTorch", lambda: torch.nn.functional.binary_cross_entropy_with_logits(
+            torch_input, torch_target, weight=torch_weight, pos_weight=torch_pos_weight, reduction=reduction
+        ), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lib_op, device, NUM_PRERUN, NUM_ITERATIONS)
+
+    check_error(LIBINFINIOP.infiniopDestroyBCEWithLogitsDescriptor(descriptor))
+
+if __name__ == "__main__":
+    args = get_args()
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+    print("\033[92mBCEWithLogits tests passed!\033[0m")
\ No newline at end of file
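For reference, the operator under test computes PyTorch's definition ($\sigma$ is the logistic sigmoid, $w$ the optional elementwise weight, $p$ the optional pos_weight broadcast along the last dimension):

$$ \ell_i = -\,w_i\big[\,p_i\,t_i\log\sigma(x_i) + (1-t_i)\log(1-\sigma(x_i))\big] $$

with reduction "none" returning $\ell$ elementwise and "mean"/"sum" collapsing it to a scalar, which is why the test allocates a 0-d output tensor for the reduced cases.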
+    # 2. Prepare the input tensors
+    x1 = TestTensor(x1_shape, x1_stride, dtype, device)
+    x2 = TestTensor(x2_shape, x2_stride, dtype, device)
+    out = TestTensor(out_shape, None, dtype, device)
+
+    # 3. Compute the reference answer with PyTorch
+    # (torch.cdist requires inputs that are at least 2-D)
+    ans = torch.cdist(x1.torch_tensor(), x2.torch_tensor(), p=p)
+
+    if sync is not None:
+        sync()
+
+    # 4. Create the operator descriptor
+    descriptor = infiniopOperatorDescriptor_t()
+    # Note: this assumes the C entry point is named infiniopCreateCdistDescriptor
+    check_error(
+        LIBINFINIOP.infiniopCreateCdistDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            out.descriptor,
+            x1.descriptor,
+            x2.descriptor,
+            c_double(p)  # p is passed as a double
+        )
+    )
+
+    # Destroy the temporary tensor descriptors so the kernel cannot reference
+    # them by mistake (same style as addcmul)
+    for t in [x1, x2, out]:
+        t.destroy_desc()
+
+    # 5. Workspace setup
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetCdistWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x1.device)
+
+    # 6. Define the execution function
+    def lib_cdist():
+        check_error(
+            LIBINFINIOP.infiniopCdist(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                out.data(),
+                x1.data(),
+                x2.data(),
+                None,  # stream
+            )
+        )
+
+    # 7. Run
+    lib_cdist()
+
+    if sync is not None:
+        sync()
+
+    # 8. Verify the result
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # 9. Profiling
+    if PROFILE:
+        profile_operation("PyTorch", lambda: torch.cdist(x1.torch_tensor(), x2.torch_tensor(), p=p), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_cdist(), device, NUM_PRERUN, NUM_ITERATIONS)
+
+    check_error(LIBINFINIOP.infiniopDestroyCdistDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mCdist tests passed!\033[0m")
\ No newline at end of file
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index 5b2974111..892facdfe 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -20,6 +20,36 @@ def register_lib(cls, lib):
     for op in cls.registry:
         op(lib)
 
+@OpRegister.operator
+def atanh_(lib):
+    lib.infiniopCreateAtanhDescriptor.restype = c_int32
+    lib.infiniopCreateAtanhDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    lib.infiniopGetAtanhWorkspaceSize.restype = c_int32
+    lib.infiniopGetAtanhWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopAtanh.restype = c_int32
+    lib.infiniopAtanh.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,  # workspace
+        c_size_t,  # workspace_size
+        c_void_p,  # y_data
+        c_void_p,  # a_data
+        c_void_p,  # stream
+    ]
+
+    lib.infiniopDestroyAtanhDescriptor.restype = c_int32
+    lib.infiniopDestroyAtanhDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
 
 @OpRegister.operator
 def add_(lib):
@@ -54,6 +84,156 @@ def add_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
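+# How these registrations are consumed (sketch): @OpRegister.operator collects
+# each function into OpRegister.registry, and register_lib(lib) replays them
+# against the loaded ctypes library, e.g. (illustrative call site):
+#
+#     OpRegister.register_lib(LIBINFINIOP)
+#
+# After that, every declared symbol is callable with checked argtypes.
+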
+@OpRegister.operator
+def addcmul_(lib):
+    lib.infiniopCreateAddcmulDescriptor.restype = c_int32
+    lib.infiniopCreateAddcmulDescriptor.argtypes = [
+        infiniopHandle_t,                       # handle
+        POINTER(infiniopOperatorDescriptor_t),  # desc_ptr
+        infiniopTensorDescriptor_t,             # out_desc
+        infiniopTensorDescriptor_t,             # input_desc
+        infiniopTensorDescriptor_t,             # t1_desc
+        infiniopTensorDescriptor_t,             # t2_desc
+        c_float,                                # value (scalar coefficient)
+    ]
+
+    lib.infiniopGetAddcmulWorkspaceSize.restype = c_int32
+    lib.infiniopGetAddcmulWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+        POINTER(c_size_t),             # size_ptr
+    ]
+
+    lib.infiniopAddcmul.restype = c_int32
+    lib.infiniopAddcmul.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+        c_void_p,  # workspace
+        c_size_t,  # workspace_size
+        c_void_p,  # out_ptr
+        c_void_p,  # input_ptr
+        c_void_p,  # t1_ptr
+        c_void_p,  # t2_ptr
+        c_void_p,  # stream
+    ]
+
+    lib.infiniopDestroyAddcmulDescriptor.restype = c_int32
+    lib.infiniopDestroyAddcmulDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+    ]
+
+@OpRegister.operator
+def cdist_(lib):
+    # 1. Descriptor creation: takes the handle, the output descriptor,
+    #    the two input descriptors, and the norm degree p
+    lib.infiniopCreateCdistDescriptor.restype = c_int32
+    lib.infiniopCreateCdistDescriptor.argtypes = [
+        infiniopHandle_t,                       # handle
+        POINTER(infiniopOperatorDescriptor_t),  # desc_ptr
+        infiniopTensorDescriptor_t,             # y_desc (output)
+        infiniopTensorDescriptor_t,             # x1_desc
+        infiniopTensorDescriptor_t,             # x2_desc
+        c_double,                               # p (norm degree)
+    ]
+
+    # 2. Workspace-size query
+    lib.infiniopGetCdistWorkspaceSize.restype = c_int32
+    lib.infiniopGetCdistWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+        POINTER(c_size_t),             # size_ptr
+    ]
+
+    # 3. Execution
+    lib.infiniopCdist.restype = c_int32
+    lib.infiniopCdist.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+        c_void_p,  # workspace
+        c_size_t,  # workspace_size
+        c_void_p,  # y_ptr
+        c_void_p,  # x1_ptr
+        c_void_p,  # x2_ptr
+        c_void_p,  # stream
+    ]
+
+    # 4. Descriptor destruction
+    lib.infiniopDestroyCdistDescriptor.restype = c_int32
+    lib.infiniopDestroyCdistDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+    ]
+
+@OpRegister.operator
+def binary_cross_entropy_with_logits_(lib):
+    # 1. Descriptor creation
+    lib.infiniopCreateBCEWithLogitsDescriptor.restype = c_int32
+    lib.infiniopCreateBCEWithLogitsDescriptor.argtypes = [
+        infiniopHandle_t,                       # handle
+        POINTER(infiniopOperatorDescriptor_t),  # desc_ptr
+        infiniopTensorDescriptor_t,             # out_desc
+        infiniopTensorDescriptor_t,             # input_desc (logits)
+        infiniopTensorDescriptor_t,             # target_desc
+        infiniopTensorDescriptor_t,             # weight_desc (optional; pass NULL if unused)
+        infiniopTensorDescriptor_t,             # pos_weight_desc (optional; pass NULL if unused)
+        c_int32,                                # reduction (0: none, 1: mean, 2: sum)
+    ]
+
+    # 2. Workspace-size query
+    lib.infiniopGetBCEWithLogitsWorkspaceSize.restype = c_int32
+    lib.infiniopGetBCEWithLogitsWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+        POINTER(c_size_t),             # size_ptr
+    ]
+
+    # 3. Execution
+    lib.infiniopBCEWithLogits.restype = c_int32
+    lib.infiniopBCEWithLogits.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+        c_void_p,  # workspace
+        c_size_t,  # workspace_size
+        c_void_p,  # out_ptr
+        c_void_p,  # input_ptr (logits)
+        c_void_p,  # target_ptr
+        c_void_p,  # weight_ptr (optional)
+        c_void_p,  # pos_weight_ptr (optional)
+        c_void_p,  # stream
+    ]
+
+    # 4. Descriptor destruction
+    lib.infiniopDestroyBCEWithLogitsDescriptor.restype = c_int32
+    lib.infiniopDestroyBCEWithLogitsDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+    ]
+
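+# NOTE (illustrative): the two optional descriptors above accept None from
+# Python, which ctypes passes through as NULL, e.g.:
+#
+#     LIBINFINIOP.infiniopCreateBCEWithLogitsDescriptor(
+#         handle, byref(desc), out.descriptor, logits.descriptor,
+#         target.descriptor, None, None, 1)  # no weights, reduction = mean
+#
+# The C side is expected to treat a NULL descriptor as "argument absent".
+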
+@OpRegister.operator
+def reciprocal_(lib):
+    lib.infiniopCreateReciprocalDescriptor.restype = c_int32
+    lib.infiniopCreateReciprocalDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,  # output descriptor
+        infiniopTensorDescriptor_t,  # input descriptor
+    ]
+
+    # Workspace-size query
+    lib.infiniopGetReciprocalWorkspaceSize.restype = c_int32
+    lib.infiniopGetReciprocalWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    # The trailing c_void_p is the stream (or other async handle); keep it
+    # consistent with the other operators
+    lib.infiniopReciprocal.restype = c_int32
+    lib.infiniopReciprocal.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,  # workspace pointer
+        c_size_t,  # workspace size
+        c_void_p,  # output data pointer
+        c_void_p,  # input data pointer
+        c_void_p,  # stream pointer (optional)
+    ]
+
+    # Descriptor destruction
+    lib.infiniopDestroyReciprocalDescriptor.restype = c_int32
+    lib.infiniopDestroyReciprocalDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
 
 @OpRegister.operator
 def attention_(lib):
diff --git a/test/infiniop/reciprocal.py b/test/infiniop/reciprocal.py
new file mode 100644
index 000000000..4742ffdb8
--- /dev/null
+++ b/test/infiniop/reciprocal.py
@@ -0,0 +1,165 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration
+# ==============================================================================
+_TEST_CASES_ = [
+    # shape, input_stride, output_stride
+    ((13, 4), None, None),
+    ((13, 4), (10, 1), (10, 1)),
+    ((16, 5632), None, None),
+    ((16, 5632), (13312, 1), (13312, 1)),
+    ((13, 16, 2), (128, 4, 1), (64, 4, 1)),
+    ((4, 4, 5632), None, None),
+]
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE = auto()
+
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE,
+]
+
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Reciprocal normally produces floating-point output; integer dtypes are
+# usually unsupported or special-cased
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+def reciprocal(y, x):
+    torch.reciprocal(x, out=y)
+
+def test(
+    handle,
+    device,
+    shape,
+    in_stride=None,
+    out_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    # Initialize the input 'x'. Note: TestTensor fills with random data, and
+    # values close to zero make 1/x very large; the tolerance map above is
+    # chosen with that in mind.
+    x = TestTensor(shape, in_stride, dtype, device)
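+    # If near-zero inputs ever cause spurious failures, one could clamp the
+    # magnitudes on the torch side before computing the reference (illustrative
+    # sketch only; this mutates the data and is not part of the test today):
+    #   t = x.torch_tensor()
+    #   t.abs_().add_(0.1)  # keep every value >= 0.1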
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # Calculate ground truth using PyTorch + reciprocal(y.torch_tensor(), x.torch_tensor()) + + if sync is not None: + sync() + + # Create Descriptor + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateReciprocalDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ) + ) + + # Invalidate descriptors as per framework requirement + for tensor in [x, y]: + tensor.destroy_desc() + + # Workspace allocation + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetReciprocalWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_reciprocal(): + check_error( + LIBINFINIOP.infiniopReciprocal( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + None, + ) + ) + + lib_reciprocal() + + # Verification + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling + if PROFILE: + profile_operation("PyTorch", lambda: reciprocal(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_reciprocal(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyReciprocalDescriptor(descriptor)) + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file