diff --git a/include/infinicore/ops/logcumsumexp.hpp b/include/infinicore/ops/logcumsumexp.hpp
new file mode 100644
index 000000000..7aaaf510a
--- /dev/null
+++ b/include/infinicore/ops/logcumsumexp.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class LogCumSumExp {
+public:
+    using schema = void (*)(Tensor, Tensor, int, bool, bool);
+
+    static void execute(Tensor y, Tensor x, int axis, bool exclusive, bool reverse);
+
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor logcumsumexp(Tensor x, int axis, bool exclusive = false, bool reverse = false);
+
+void logcumsumexp_(Tensor y, Tensor x, int axis, bool exclusive = false, bool reverse = false);
+
+} // namespace infinicore::op
\ No newline at end of file
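Reviewer note: for reference while reading the kernels further down, the semantics this header promises match the following NumPy sketch. This is not part of the patch; `logcumsumexp_ref` is a made-up name.

```python
import numpy as np

def logcumsumexp_ref(x, axis, exclusive=False, reverse=False):
    """y[k] = log(sum(exp(x[0..k]))) along `axis`; the prefix excludes k if exclusive."""
    x = np.asarray(x, dtype=np.float64)
    if reverse:
        x = np.flip(x, axis)
    y = np.logaddexp.accumulate(x, axis=axis)
    if exclusive:
        # shift by one along `axis`; the empty prefix is log(0) = -inf
        y = np.roll(y, 1, axis=axis)
        head = [slice(None)] * x.ndim
        head[axis] = 0
        y[tuple(head)] = -np.inf
    if reverse:
        y = np.flip(y, axis)
    return y
```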
diff --git a/include/infinicore/ops/logical_and.hpp b/include/infinicore/ops/logical_and.hpp
new file mode 100644
index 000000000..6cb543151
--- /dev/null
+++ b/include/infinicore/ops/logical_and.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class LogicalAnd {
+public:
+    // LogicalAnd is a binary op; the schema is (Output, Input1, Input2).
+    using schema = void (*)(Tensor, Tensor, Tensor);
+
+    static void execute(Tensor output, Tensor input1, Tensor input2);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor logical_and(Tensor input1, Tensor input2);
+void logical_and_(Tensor output, Tensor input1, Tensor input2);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/logical_not.hpp b/include/infinicore/ops/logical_not.hpp
new file mode 100644
index 000000000..68c947ce8
--- /dev/null
+++ b/include/infinicore/ops/logical_not.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class LogicalNot {
+public:
+    // LogicalNot is a unary op; the schema is (Output, Input).
+    using schema = void (*)(Tensor, Tensor);
+
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+// Returns the result in a newly allocated Tensor.
+Tensor logical_not(Tensor input);
+
+// Writes the result into the given output Tensor.
+void logical_not_(Tensor output, Tensor input);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/unfold.hpp b/include/infinicore/ops/unfold.hpp
new file mode 100644
index 000000000..6dea85562
--- /dev/null
+++ b/include/infinicore/ops/unfold.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <vector>
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Unfold {
+public:
+    // schema: output, input, kernel_sizes, dilations, paddings, strides
+    using schema = void (*)(Tensor, Tensor, const std::vector<int64_t> &, const std::vector<int64_t> &, const std::vector<int64_t> &, const std::vector<int64_t> &);
+
+    static void execute(Tensor output, Tensor input,
+                        const std::vector<int64_t> &kernel_sizes,
+                        const std::vector<int64_t> &dilations,
+                        const std::vector<int64_t> &paddings,
+                        const std::vector<int64_t> &strides);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+// Functional API
+Tensor unfold(Tensor input,
+              std::vector<int64_t> kernel_sizes,
+              std::vector<int64_t> dilations,
+              std::vector<int64_t> paddings,
+              std::vector<int64_t> strides);
+
+void unfold_(Tensor output, Tensor input,
+             std::vector<int64_t> kernel_sizes,
+             std::vector<int64_t> dilations,
+             std::vector<int64_t> paddings,
+             std::vector<int64_t> strides);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/vander.hpp b/include/infinicore/ops/vander.hpp
new file mode 100644
index 000000000..1ee4e112e
--- /dev/null
+++ b/include/infinicore/ops/vander.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Vander {
+public:
+    // schema: output, input, N, increasing
+    using schema = void (*)(Tensor, Tensor, int64_t, bool);
+
+    static void execute(Tensor output, Tensor input, int64_t N, bool increasing);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+// N defaults to 0 (meaning N = input.size(0), i.e. a square matrix)
+Tensor vander(Tensor input, int64_t N = 0, bool increasing = false);
+void vander_(Tensor output, Tensor input, int64_t N, bool increasing);
+
+} // namespace infinicore::op
\ No newline at end of file
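Reviewer note: `vander` mirrors `numpy.vander`, with `N = 0` playing the role of NumPy's `N = None` (square output). A quick sketch of the expected values:

```python
import numpy as np

x = np.array([1.0, 2.0, 3.0])

# increasing=False (the default): columns are x^(N-1), ..., x^1, x^0.
print(np.vander(x))                   # [[1. 1. 1.]
                                      #  [4. 2. 1.]
                                      #  [9. 3. 1.]]
print(np.vander(x, increasing=True))  # [[1. 1. 1.]
                                      #  [1. 2. 4.]
                                      #  [1. 3. 9.]]
```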
diff --git a/include/infiniop.h b/include/infiniop.h
index c0a09fcb4..301cae708 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -34,6 +34,9 @@
 #include "infiniop/ops/topkrouter.h"
 #include "infiniop/ops/topksoftmax.h"
 #include "infiniop/ops/zeros.h"
+#include "infiniop/ops/logcumsumexp.h"
+#include "infiniop/ops/unfold.h"
+#include "infiniop/ops/vander.h"
 #include "infiniop/tensor_descriptor.h"
 
 #endif // __INFINIOP_API_H__
diff --git a/include/infiniop/ops/logcumsumexp.h b/include/infiniop/ops/logcumsumexp.h
new file mode 100644
index 000000000..a1fdc1376
--- /dev/null
+++ b/include/infiniop/ops/logcumsumexp.h
@@ -0,0 +1,30 @@
+#ifndef __INFINIOP_LOGCUMSUMEXP_API_H__
+#define __INFINIOP_LOGCUMSUMEXP_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopLogCumSumExpDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateLogCumSumExpDescriptor(infiniopHandle_t handle,
+                                                                 infiniopLogCumSumExpDescriptor_t *desc_ptr,
+                                                                 infiniopTensorDescriptor_t y,
+                                                                 infiniopTensorDescriptor_t x,
+                                                                 int axis,
+                                                                 int exclusive,
+                                                                 int reverse);
+
+/* Query the workspace size required to run LogCumSumExp. */
+__C __export infiniStatus_t infiniopGetLogCumSumExpWorkspaceSize(infiniopLogCumSumExpDescriptor_t desc,
+                                                                 size_t *size);
+
+__C __export infiniStatus_t infiniopLogCumSumExp(infiniopLogCumSumExpDescriptor_t desc,
+                                                 void *workspace,
+                                                 size_t workspace_size,
+                                                 void *y,
+                                                 const void *x,
+                                                 void *stream);
+
+/* Destroy the descriptor. */
+__C __export infiniStatus_t infiniopDestroyLogCumSumExpDescriptor(infiniopLogCumSumExpDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/unfold.h b/include/infiniop/ops/unfold.h
new file mode 100644
index 000000000..013cba214
--- /dev/null
+++ b/include/infiniop/ops/unfold.h
@@ -0,0 +1,28 @@
+#ifndef __INFINIOP_UNFOLD_API_H__
+#define __INFINIOP_UNFOLD_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopUnfoldDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateUnfoldDescriptor(infiniopHandle_t handle,
+                                                           infiniopUnfoldDescriptor_t *desc_ptr,
+                                                           infiniopTensorDescriptor_t output,
+                                                           infiniopTensorDescriptor_t input,
+                                                           const int *kernel_sizes,
+                                                           const int *strides,
+                                                           const int *paddings,
+                                                           const int *dilations);
+
+// Query the workspace size required to run Unfold.
+__C __export infiniStatus_t infiniopGetUnfoldWorkspaceSize(infiniopUnfoldDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopUnfold(infiniopUnfoldDescriptor_t desc,
+                                           void *workspace,
+                                           size_t workspace_size,
+                                           void *output,
+                                           const void *input,
+                                           void *stream);
+
+__C __export infiniStatus_t infiniopDestroyUnfoldDescriptor(infiniopUnfoldDescriptor_t desc);
+
+#endif // __INFINIOP_UNFOLD_API_H__
\ No newline at end of file
diff --git a/include/infiniop/ops/vander.h b/include/infiniop/ops/vander.h
new file mode 100644
index 000000000..43e560434
--- /dev/null
+++ b/include/infiniop/ops/vander.h
@@ -0,0 +1,25 @@
+#ifndef __INFINIOP_VANDER_API_H__
+#define __INFINIOP_VANDER_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopVanderDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateVanderDescriptor(infiniopHandle_t handle,
+                                                           infiniopVanderDescriptor_t *desc_ptr,
+                                                           infiniopTensorDescriptor_t output,
+                                                           infiniopTensorDescriptor_t input,
+                                                           int N,
+                                                           int increasing);
+
+__C __export infiniStatus_t infiniopGetVanderWorkspaceSize(infiniopVanderDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopVander(infiniopVanderDescriptor_t desc,
+                                           void *workspace,
+                                           size_t workspace_size,
+                                           void *output,
+                                           const void *input,
+                                           void *stream);
+
+__C __export infiniStatus_t infiniopDestroyVanderDescriptor(infiniopVanderDescriptor_t desc);
+
+#endif // __INFINIOP_VANDER_API_H__
\ No newline at end of file
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index b7288f3ac..e9869f879 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -51,6 +51,10 @@
 from infinicore.ops.rearrange import rearrange
 from infinicore.ops.squeeze import squeeze
 from infinicore.ops.unsqueeze import unsqueeze
+from infinicore.ops.logcumsumexp import logcumsumexp
+from infinicore.ops.logical_and import logical_and
+from infinicore.ops.logical_not import logical_not
+from infinicore.ops.vander import vander
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -121,6 +125,10 @@
     "from_list",
     "from_numpy",
     "from_torch",
+    "logcumsumexp",
+    "logical_and",
+    "logical_not",
+    "vander",
     "paged_caching",
     "paged_attention",
     "paged_attention_prefill",
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..8b58fb727 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -6,6 +6,7 @@
 from .rope import RopeAlgo, rope
 from .silu import silu
 from .swiglu import swiglu
+from .unfold import unfold
 
 __all__ = [
     "causal_softmax",
@@ -16,5 +17,6 @@
     "linear",
     "embedding",
     "rope",
+    "unfold",
     "RopeAlgo",
 ]
diff --git a/python/infinicore/nn/functional/unfold.py b/python/infinicore/nn/functional/unfold.py
new file mode 100644
index 000000000..dc7f69528
--- /dev/null
+++ b/python/infinicore/nn/functional/unfold.py
@@ -0,0 +1,58 @@
+from typing import List, Optional, Tuple, Union
+
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def unfold(
+    input: Tensor,
+    kernel_size: Union[int, Tuple[int, ...], List[int]],
+    dilation: Union[int, Tuple[int, ...], List[int]] = 1,
+    padding: Union[int, Tuple[int, ...], List[int]] = 0,
+    stride: Union[int, Tuple[int, ...], List[int]] = 1,
+    *,
+    out: Optional[Tensor] = None,
+) -> Tensor:
+    r"""Extracts sliding local blocks from a batched input tensor.
+
+    Also known as im2col. The output tensor contains the flattened blocks.
+
+    Args:
+        input (Tensor): The input tensor.
+        kernel_size (int or tuple): The size of the sliding blocks.
+        dilation (int or tuple, optional): The stride of elements within the neighborhood. Default: 1.
+        padding (int or tuple, optional): Implicit zero padding added on both sides of the input. Default: 0.
+        stride (int or tuple, optional): The stride of the sliding blocks. Default: 1.
+        out (Tensor, optional): The output tensor.
+    """
+
+    if not input.is_contiguous():
+        input = input.contiguous()
+
+    # Expand a bare int to one value per spatial dim (assuming 2-D spatial input).
+    def _pair(x):
+        return (x, x) if isinstance(x, int) else x
+
+    k_val = _pair(kernel_size)
+    d_val = _pair(dilation)
+    p_val = _pair(padding)
+    s_val = _pair(stride)
+
+    if out is not None:
+        _infinicore.unfold_(
+            out._underlying,
+            input._underlying,
+            k_val,
+            d_val,
+            p_val,
+            s_val,
+        )
+        return out
+
+    return Tensor(
+        _infinicore.unfold(
+            input._underlying,
+            k_val,
+            d_val,
+            p_val,
+            s_val,
+        )
+    )
\ No newline at end of file
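Reviewer note: a minimal usage sketch of the new functional API (assuming `from_torch` converts a torch tensor, as exported by `infinicore/__init__.py`); the shape convention follows `torch.nn.functional.unfold`:

```python
import torch
import infinicore
from infinicore.nn import functional as F

x = infinicore.from_torch(torch.randn(1, 3, 4, 4))
y = F.unfold(x, kernel_size=2, stride=2)

# Expected shape: (N, C * kH * kW, L) = (1, 3 * 2 * 2, 4),
# the same as torch.nn.functional.unfold(torch_x, kernel_size=2, stride=2).
```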
+ """ + + if not input.is_contiguous(): + input = input.contiguous() + + # Helper to ensure parameters are iterable (assuming 2D spatial dims for single int) + def _pair(x): + return (x, x) if isinstance(x, int) else x + + k_val = _pair(kernel_size) + d_val = _pair(dilation) + p_val = _pair(padding) + s_val = _pair(stride) + + if out is not None: + _infinicore.unfold_( + out._underlying, + input._underlying, + k_val, + d_val, + p_val, + s_val + ) + return out + + return Tensor( + _infinicore.unfold( + input._underlying, + k_val, + d_val, + p_val, + s_val + ) + ) \ No newline at end of file diff --git a/python/infinicore/ops/logcumsumexp.py b/python/infinicore/ops/logcumsumexp.py new file mode 100644 index 000000000..52aa5fd09 --- /dev/null +++ b/python/infinicore/ops/logcumsumexp.py @@ -0,0 +1,23 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def logcumsumexp(input, dim, exclusive=False, reverse=False, *, out=None): + if out is None: + # 调用 C++ 绑定的非原地操作,返回新创建的 _underlying 句柄 + return Tensor(_infinicore.logcumsumexp( + input._underlying, + dim, + exclusive, + reverse + )) + + # 调用 C++ 绑定的原地/指定输出操作 + _infinicore.logcumsumexp_( + out._underlying, + input._underlying, + dim, + exclusive, + reverse + ) + + return out \ No newline at end of file diff --git a/python/infinicore/ops/logical_and.py b/python/infinicore/ops/logical_and.py new file mode 100644 index 000000000..da0e98c72 --- /dev/null +++ b/python/infinicore/ops/logical_and.py @@ -0,0 +1,14 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def logical_and(input: Tensor, other: Tensor, *, out=None) -> Tensor: + r"""Computes the element-wise logical AND of the given input tensors.""" + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.logical_and(input, other, out=out) + + if out is None: + return Tensor(_infinicore.logical_and(input._underlying, other._underlying)) + + _infinicore.logical_and_(out._underlying, input._underlying, other._underlying) + return out \ No newline at end of file diff --git a/python/infinicore/ops/logical_not.py b/python/infinicore/ops/logical_not.py new file mode 100644 index 000000000..d6ff4a3b7 --- /dev/null +++ b/python/infinicore/ops/logical_not.py @@ -0,0 +1,17 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def logical_not(input: Tensor, *, out=None) -> Tensor: + r"""Computes the element-wise logical NOT of the given input tensors.""" + # 1. 如果启用了 ntops 且设备支持,调用 ntops 实现 + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.logical_not(input, out=out) + + # 2. 如果没有提供 out,创建一个新的 Tensor 并返回 + if out is None: + return Tensor(_infinicore.logical_not(input._underlying)) + + # 3. 如果提供了 out,进行原地操作 (In-place operation) + _infinicore.logical_not_(out._underlying, input._underlying) + return out \ No newline at end of file diff --git a/python/infinicore/ops/vander.py b/python/infinicore/ops/vander.py new file mode 100644 index 000000000..907f5f35a --- /dev/null +++ b/python/infinicore/ops/vander.py @@ -0,0 +1,45 @@ +from typing import Optional +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def vander( + x: Tensor, + N: Optional[int] = None, + increasing: bool = False, + *, + out: Optional[Tensor] = None +) -> Tensor: + r"""Generates a Vandermonde matrix. + + The columns of the output matrix are powers of the input vector. 
diff --git a/src/infinicore/ops/logcumsumexp/logcumsumexp.cc b/src/infinicore/ops/logcumsumexp/logcumsumexp.cc
new file mode 100644
index 000000000..274fc6a79
--- /dev/null
+++ b/src/infinicore/ops/logcumsumexp/logcumsumexp.cc
@@ -0,0 +1,35 @@
+#include "infinicore/ops/logcumsumexp.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+// Dispatcher singleton.
+common::OpDispatcher<LogCumSumExp::schema> &LogCumSumExp::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+// Validate devices, then dispatch to the registered implementation.
+void LogCumSumExp::execute(Tensor y, Tensor x, int axis, bool exclusive, bool reverse) {
+    // Input and output must live on the same device.
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
+
+    // Switch to the target device's context.
+    infinicore::context::setDevice(y->device());
+    dispatcher().lookup(y->device().getType())(y, x, axis, exclusive, reverse);
+}
+
+// Functional interface: allocates the output tensor and returns it.
+Tensor logcumsumexp(Tensor x, int axis, bool exclusive, bool reverse) {
+    // The output matches the input's shape, dtype and device.
+    auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
+    logcumsumexp_(y, x, axis, exclusive, reverse);
+    return y;
+}
+
+// Explicit-output interface.
+void logcumsumexp_(Tensor y, Tensor x, int axis, bool exclusive, bool reverse) {
+    LogCumSumExp::execute(y, x, axis, exclusive, reverse);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/logcumsumexp/logcumsumexp_infiniop.cc b/src/infinicore/ops/logcumsumexp/logcumsumexp_infiniop.cc
new file mode 100644
index 000000000..a102f81b6
--- /dev/null
+++ b/src/infinicore/ops/logcumsumexp/logcumsumexp_infiniop.cc
@@ -0,0 +1,69 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/logcumsumexp.hpp"
+#include <cstddef>
+
+namespace infinicore::op::logcumsumexp_impl::infiniop {
+
+// Descriptor cache: owns the lifetime of the infiniop operator descriptors.
+thread_local common::OpCache<size_t, infiniopLogCumSumExpDescriptor_t> caches(
+    100, // capacity
+    [](infiniopLogCumSumExpDescriptor_t &desc) {
+        if (desc != nullptr) {
+            // Destroy via the C API declared in the public header.
+            INFINICORE_CHECK_ERROR(infiniopDestroyLogCumSumExpDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor y, Tensor x, int axis, bool exclusive, bool reverse) {
+    // 1. Build the hash key. It must cover the tensor metadata as well as
+    //    the op-specific parameters (axis, exclusive, reverse).
+    size_t seed = hash_combine(y, x, axis, exclusive, reverse);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    // 2. Fetch the cache instance for the current device.
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopLogCumSumExpDescriptor_t desc = nullptr;
+
+    // 3. On a cache miss, create a new descriptor.
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateLogCumSumExpDescriptor(
+            context::getInfiniopHandle(y->device()),
+            &desc,
+            y->desc(), // output tensor descriptor
+            x->desc(), // input tensor descriptor
+            axis,
+            exclusive,
+            reverse));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // 4. Query the required workspace size and allocate it.
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetLogCumSumExpWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    // 5. Run the operator.
+    INFINICORE_CHECK_ERROR(infiniopLogCumSumExp(
+        desc,
+        workspace->data(),
+        workspace_size,
+        y->data(),
+        x->data(),
+        context::getStream()));
+}
+
+// 6. Self-registration with the LogCumSumExp dispatcher.
+static bool registered = []() {
+    LogCumSumExp::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::logcumsumexp_impl::infiniop
\ No newline at end of file
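Reviewer note: the thread-local `OpCache` above amounts to an LRU map keyed by a hash of tensor metadata plus the op's scalar parameters. In Python terms (a sketch; `make_descriptor` stands in for `infiniopCreateLogCumSumExpDescriptor`):

```python
from functools import lru_cache

@lru_cache(maxsize=100)  # mirrors the cache capacity of 100 above
def make_descriptor(shape, strides, dtype, axis, exclusive, reverse):
    return ("descriptor", shape, strides, dtype, axis, exclusive, reverse)

d1 = make_descriptor((2, 3), (3, 1), "f32", 1, False, False)
d2 = make_descriptor((2, 3), (3, 1), "f32", 1, False, False)
assert d1 is d2  # second call is a cache hit; the descriptor is reused
```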
diff --git a/src/infinicore/ops/logical_and/logical_and.cc b/src/infinicore/ops/logical_and/logical_and.cc
new file mode 100644
index 000000000..39618ae89
--- /dev/null
+++ b/src/infinicore/ops/logical_and/logical_and.cc
@@ -0,0 +1,37 @@
+#include "infinicore/ops/logical_and.hpp"
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<LogicalAnd::schema> &LogicalAnd::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+void LogicalAnd::execute(Tensor output, Tensor input1, Tensor input2) {
+    infinicore::context::setDevice(input1->device());
+
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No LogicalAnd implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input1, input2);
+}
+
+Tensor logical_and(Tensor input1, Tensor input2) {
+    Shape shape = input1->shape();
+    auto output = Tensor::empty(shape, input1->dtype(), input1->device());
+
+    logical_and_(output, input1, input2);
+    return output;
+}
+
+void logical_and_(Tensor output, Tensor input1, Tensor input2) {
+    LogicalAnd::execute(output, input1, input2);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/logical_and/logical_and_cpu.cc b/src/infinicore/ops/logical_and/logical_and_cpu.cc
new file mode 100644
index 000000000..c305173a0
--- /dev/null
+++ b/src/infinicore/ops/logical_and/logical_and_cpu.cc
@@ -0,0 +1,128 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/logical_and.hpp"
+#include "infinicore/tensor.hpp"
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <vector>
+
+namespace infinicore::op::logical_and_impl::cpu {
+
+void calculate(Tensor output, Tensor input1, Tensor input2) {
+    auto ndim = output->ndim();
+    auto numel = output->numel();
+    auto shapes = output->shape();
+
+    auto strides1 = input1->strides();
+    auto strides2 = input2->strides();
+    auto out_strides = output->strides();
+
+    auto dtype = input1->dtype();
+    auto dtype_size = input1->element_size();
+
+    auto out_dtype = output->dtype();
+    auto out_dtype_size = output->element_size();
+
+    // data() is treated as a byte pointer; cast explicitly so that the
+    // per-element offset arithmetic below is well defined.
+    auto input1_base = reinterpret_cast<const uint8_t *>(input1->data());
+    auto input2_base = reinterpret_cast<const uint8_t *>(input2->data());
+    auto output_base = reinterpret_cast<uint8_t *>(output->data());
+
+    std::vector<size_t> indices(ndim, 0);
+
+    for (size_t idx = 0; idx < numel; ++idx) {
+        // Strides are element strides and may be negative, so accumulate
+        // the offsets in a signed type.
+        ptrdiff_t offset1 = 0;
+        ptrdiff_t offset2 = 0;
+        ptrdiff_t out_offset = 0;
+
+        for (size_t dim = 0; dim < ndim; ++dim) {
+            offset1 += static_cast<ptrdiff_t>(indices[dim]) * strides1[dim];
+            offset2 += static_cast<ptrdiff_t>(indices[dim]) * strides2[dim];
+            out_offset += static_cast<ptrdiff_t>(indices[dim]) * out_strides[dim];
+        }
+
+        const uint8_t *in1 = input1_base + offset1 * static_cast<ptrdiff_t>(dtype_size);
+        const uint8_t *in2 = input2_base + offset2 * static_cast<ptrdiff_t>(dtype_size);
+
+        bool result = false;
+
+        // ==========================================
+        // INPUT TYPE DISPATCH
+        // ==========================================
+
+        // 1. Floating-point types
+        if (dtype == DataType::F32) {
+            auto *p1 = reinterpret_cast<const float *>(in1);
+            auto *p2 = reinterpret_cast<const float *>(in2);
+            result = (*p1 != 0.0f) && (*p2 != 0.0f);
+        } else if (dtype == DataType::F64) {
+            auto *p1 = reinterpret_cast<const double *>(in1);
+            auto *p2 = reinterpret_cast<const double *>(in2);
+            result = (*p1 != 0.0) && (*p2 != 0.0);
+        } else if (dtype == DataType::F16) {
+            auto *p1 = reinterpret_cast<const fp16_t *>(in1);
+            auto *p2 = reinterpret_cast<const fp16_t *>(in2);
+            float v1 = utils::cast<float>(*p1);
+            float v2 = utils::cast<float>(*p2);
+            result = (v1 != 0.0f) && (v2 != 0.0f);
+
+            // 2. Bool and 8-bit integers
+        } else if (dtype == DataType::BOOL || dtype == DataType::U8) {
+            auto *p1 = reinterpret_cast<const uint8_t *>(in1);
+            auto *p2 = reinterpret_cast<const uint8_t *>(in2);
+            result = (*p1 != 0) && (*p2 != 0);
+
+            // 3. 32-bit integers (fixes the failing int32 tests)
+        } else if (dtype == DataType::I32 || dtype == DataType::U32) {
+            // The nonzero test is identical for I32 and U32, so reading
+            // through int32_t is fine for both.
+            auto *p1 = reinterpret_cast<const int32_t *>(in1);
+            auto *p2 = reinterpret_cast<const int32_t *>(in2);
+            result = (*p1 != 0) && (*p2 != 0);
+
+            // 4. 64-bit integers
+        } else if (dtype == DataType::I64 || dtype == DataType::U64) {
+            auto *p1 = reinterpret_cast<const int64_t *>(in1);
+            auto *p2 = reinterpret_cast<const int64_t *>(in2);
+            result = (*p1 != 0) && (*p2 != 0);
+
+            // 5. 16-bit integers
+        } else if (dtype == DataType::I16 || dtype == DataType::U16) {
+            auto *p1 = reinterpret_cast<const int16_t *>(in1);
+            auto *p2 = reinterpret_cast<const int16_t *>(in2);
+            result = (*p1 != 0) && (*p2 != 0);
+
+        } else {
+            // I8 could be folded into the U8 path if needed; anything else
+            // is genuinely unsupported.
+            throw std::runtime_error("Unsupported data type for logical_and operation.");
+        }
+
+        // ==========================================
+        // OUTPUT TYPE DISPATCH
+        // ==========================================
+        uint8_t *out = output_base + out_offset * static_cast<ptrdiff_t>(out_dtype_size);
+        if (out_dtype == DataType::BOOL || out_dtype == DataType::U8) {
+            *reinterpret_cast<uint8_t *>(out) = result ? 1 : 0;
+        } else if (out_dtype == DataType::F32) {
+            *reinterpret_cast<float *>(out) = result ? 1.0f : 0.0f;
+        } else if (out_dtype == DataType::I32) {
+            *reinterpret_cast<int32_t *>(out) = result ? 1 : 0;
+        } else {
+            // Failing loudly beats silently leaving the output untouched.
+            throw std::runtime_error("Unsupported output data type for logical_and operation.");
+        }
+
+        // --- advance the multi-dimensional index ---
+        for (ptrdiff_t dim = static_cast<ptrdiff_t>(ndim) - 1; dim >= 0; --dim) {
+            indices[dim]++;
+            if (indices[dim] < shapes[dim]) {
+                break;
+            } else {
+                indices[dim] = 0;
+            }
+        }
+    }
+}
+
+static bool registered = []() {
+    LogicalAnd::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::logical_and_impl::cpu
\ No newline at end of file
diff --git a/src/infinicore/ops/logical_not/logical_not.cc b/src/infinicore/ops/logical_not/logical_not.cc
new file mode 100644
index 000000000..c75899739
--- /dev/null
+++ b/src/infinicore/ops/logical_not/logical_not.cc
@@ -0,0 +1,38 @@
+#include "infinicore/ops/logical_not.hpp"
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<LogicalNot::schema> &
+LogicalNot::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+void LogicalNot::execute(Tensor output, Tensor input) {
+    infinicore::context::setDevice(input->device());
+
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+    if (func == nullptr) {
+        throw std::runtime_error(
+            "No LogicalNot implementation found for device type: "
+            + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input);
+}
+
+Tensor logical_not(Tensor input) {
+    auto output =
+        Tensor::empty(input->shape(), input->dtype(), input->device());
+    logical_not_(output, input);
+    return output;
+}
+
+void logical_not_(Tensor output, Tensor input) {
+    LogicalNot::execute(output, input);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/logical_not/logical_not_cpu.cc b/src/infinicore/ops/logical_not/logical_not_cpu.cc
new file mode 100644
index 000000000..0b1e5e5cf
--- /dev/null
+++ b/src/infinicore/ops/logical_not/logical_not_cpu.cc
@@ -0,0 +1,169 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/logical_not.hpp"
+#include "infinicore/tensor.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring> // memcpy
+#include <stdexcept>
+#include <vector>
+
+namespace infinicore::op::logical_not_impl::cpu {
+
+// ---------- safe load/store (avoid strict-aliasing & alignment UB) ----------
+template <typename T>
+static inline T load_scalar(const uint8_t *p) {
+    T v;
+    std::memcpy(&v, p, sizeof(T));
+    return v;
+}
+
+template <typename T>
+static inline void store_scalar(uint8_t *p, T v) {
+    std::memcpy(p, &v, sizeof(T));
+}
+
+// ---------- read "truthiness": treat nonzero as true ----------
+static inline bool read_truth(const uint8_t *base, ptrdiff_t elem_off, DataType dtype, size_t elem_size) {
+    const uint8_t *p = base + elem_off * static_cast<ptrdiff_t>(elem_size);
+
+    switch (dtype) {
+    case DataType::BOOL:
+    case DataType::U8:
+        return load_scalar<uint8_t>(p) != 0;
+
+    case DataType::I16:
+        return load_scalar<int16_t>(p) != 0;
+    case DataType::U16:
+        return load_scalar<uint16_t>(p) != 0;
+
+    case DataType::I32:
+        return load_scalar<int32_t>(p) != 0;
+    case DataType::U32:
+        return load_scalar<uint32_t>(p) != 0;
+
+    case DataType::I64:
+        return load_scalar<int64_t>(p) != 0;
+    case DataType::U64:
+        return load_scalar<uint64_t>(p) != 0;
+
+    case DataType::F32:
+        return load_scalar<float>(p) != 0.0f;
+    case DataType::F64:
+        return load_scalar<double>(p) != 0.0;
+
+    case DataType::F16: {
+        fp16_t h = load_scalar<fp16_t>(p);
+        float fv = utils::cast<float>(h);
+        return fv != 0.0f;
+    }
+
+    default:
+        throw std::runtime_error("logical_not(cpu): unsupported input dtype.");
+    }
+}
+
+// ---------- write result (bool -> out_dtype as 0/1) ----------
+static inline void write_bool(uint8_t *base, ptrdiff_t elem_off, DataType out_dtype, size_t out_elem_size, bool b) {
+    uint8_t *p = base + elem_off * static_cast<ptrdiff_t>(out_elem_size);
+
+    switch (out_dtype) {
+    case DataType::BOOL:
+    case DataType::U8:
+        store_scalar<uint8_t>(p, b ? 1 : 0);
+        return;
+
+    case DataType::I16:
+        store_scalar<int16_t>(p, b ? 1 : 0);
+        return;
+    case DataType::U16:
+        store_scalar<uint16_t>(p, b ? 1 : 0);
+        return;
+
+    case DataType::I32:
+        store_scalar<int32_t>(p, b ? 1 : 0);
+        return;
+    case DataType::U32:
+        store_scalar<uint32_t>(p, b ? 1u : 0u);
+        return;
+
+    case DataType::I64:
+        store_scalar<int64_t>(p, b ? 1 : 0);
+        return;
+    case DataType::U64:
+        store_scalar<uint64_t>(p, b ? 1ull : 0ull);
+        return;
+
+    case DataType::F32:
+        store_scalar<float>(p, b ? 1.0f : 0.0f);
+        return;
+    case DataType::F64:
+        store_scalar<double>(p, b ? 1.0 : 0.0);
+        return;
+
+    case DataType::F16: {
+        fp16_t h = utils::cast<fp16_t>(b ? 1.0f : 0.0f);
+        store_scalar<fp16_t>(p, h);
+        return;
+    }
+
+    default:
+        throw std::runtime_error("logical_not(cpu): unsupported output dtype.");
+    }
+}
+
+void calculate(Tensor output, Tensor input) {
+    const auto ndim = output->ndim();
+    if (input->ndim() != ndim) {
+        throw std::runtime_error("logical_not(cpu): input/output ndim mismatch.");
+    }
+
+    const auto numel = output->numel();
+    const auto shape = output->shape();
+
+    // IMPORTANT: strides in TensorImpl are "element strides"
+    // (see calculate_contiguous_strides in tensor.cc).
+    const auto in_strides = input->strides();
+    const auto out_strides = output->strides();
+
+    const auto in_dtype = input->dtype();
+    const auto out_dtype = output->dtype();
+
+    const auto in_es = input->element_size();
+    const auto out_es = output->element_size();
+
+    const uint8_t *in_base = reinterpret_cast<const uint8_t *>(input->data());
+    uint8_t *out_base = reinterpret_cast<uint8_t *>(output->data());
+
+    std::vector<size_t> idx(ndim, 0);
+
+    for (size_t linear = 0; linear < numel; ++linear) {
+        // Use ptrdiff_t all the way; strides must NOT be cast to size_t.
+        ptrdiff_t in_elem_off = 0;
+        ptrdiff_t out_elem_off = 0;
+
+        for (size_t d = 0; d < ndim; ++d) {
+            in_elem_off += static_cast<ptrdiff_t>(idx[d]) * in_strides[d];
+            out_elem_off += static_cast<ptrdiff_t>(idx[d]) * out_strides[d];
+        }
+
+        // logical_not: result == (input == 0) == !truthiness
+        const bool truth = read_truth(in_base, in_elem_off, in_dtype, in_es);
+        const bool result = !truth;
+
+        write_bool(out_base, out_elem_off, out_dtype, out_es, result);
+
+        // increment the multi-dimensional index
+        for (ptrdiff_t d = static_cast<ptrdiff_t>(ndim) - 1; d >= 0; --d) {
+            idx[static_cast<size_t>(d)]++;
+            if (idx[static_cast<size_t>(d)] < shape[static_cast<size_t>(d)]) break;
+            idx[static_cast<size_t>(d)] = 0;
+        }
+    }
+}
+
+static bool registered = []() {
+    LogicalNot::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::logical_not_impl::cpu
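Reviewer note: both CPU kernels above walk arbitrary-stride tensors with the same "odometer" pattern: increment the last index, carry into earlier dimensions, and recompute the element offset from the strides. The traversal in isolation:

```python
def strided_offsets(shape, strides):
    """Yield element offsets in logical (row-major) order for any stride layout."""
    idx = [0] * len(shape)
    total = 1
    for s in shape:
        total *= s
    for _ in range(total):
        yield sum(i * s for i, s in zip(idx, strides))
        for d in reversed(range(len(shape))):  # the "odometer" carry
            idx[d] += 1
            if idx[d] < shape[d]:
                break
            idx[d] = 0

# A transposed 2x3 view has element strides (1, 2):
print(list(strided_offsets((2, 3), (1, 2))))  # [0, 2, 4, 1, 3, 5]
```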
diff --git a/src/infinicore/ops/unfold/unfold.cc b/src/infinicore/ops/unfold/unfold.cc
new file mode 100644
index 000000000..b14fcbcfe
--- /dev/null
+++ b/src/infinicore/ops/unfold/unfold.cc
@@ -0,0 +1,73 @@
+#include "infinicore/ops/unfold.hpp"
+
+namespace infinicore::op {
+
+// 1. Dispatcher singleton.
+common::OpDispatcher<Unfold::schema> &Unfold::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+// 2. Execute: dispatch to the implementation registered for the current device.
+void Unfold::execute(Tensor output, Tensor input,
+                     const std::vector<int64_t> &kernel_sizes,
+                     const std::vector<int64_t> &dilations,
+                     const std::vector<int64_t> &paddings,
+                     const std::vector<int64_t> &strides) {
+    dispatcher().lookup(context::getDevice().getType())(output, input, kernel_sizes, dilations, paddings, strides);
+}
+
+// 3. Functional interface (see the shape-formula check after this file).
+Tensor unfold(Tensor input,
+              std::vector<int64_t> kernel_sizes,
+              std::vector<int64_t> dilations,
+              std::vector<int64_t> paddings,
+              std::vector<int64_t> strides) {
+
+    // Basic dimension bookkeeping.
+    const auto &input_shape = input->shape();
+    int64_t n_dim = input->ndim();
+    int64_t spatial_dims = n_dim - 2; // (N, C, D1, D2, ...): spatial dims start at index 2
+    int64_t N = input_shape[0];
+    int64_t C = input_shape[1];
+
+    // Output dim 1: C * kernel_sizes[0] * kernel_sizes[1] * ...
+    int64_t output_dim1 = C;
+    for (auto k : kernel_sizes) {
+        output_dim1 *= k;
+    }
+
+    int64_t L = 1;
+    for (int i = 0; i < spatial_dims; ++i) {
+        int64_t input_dim = input_shape[i + 2];
+        int64_t k = kernel_sizes[i];
+        int64_t p = paddings[i];
+        int64_t d = dilations[i];
+        int64_t s = strides[i];
+
+        // out = floor((in + 2*p - d*(k-1) - 1) / s) + 1
+        int64_t output_spatial = (input_dim + 2 * p - d * (k - 1) - 1) / s + 1;
+        L *= output_spatial;
+    }
+    Shape output_shape = {
+        static_cast<size_t>(N),
+        static_cast<size_t>(output_dim1),
+        static_cast<size_t>(L)};
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+
+    unfold_(output, input, kernel_sizes, dilations, paddings, strides);
+    return output;
+}
+
+// 4. Explicit-output interface.
+void unfold_(Tensor output, Tensor input,
+             std::vector<int64_t> kernel_sizes,
+             std::vector<int64_t> dilations,
+             std::vector<int64_t> paddings,
+             std::vector<int64_t> strides) {
+    Unfold::execute(output, input, kernel_sizes, dilations, paddings, strides);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
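Reviewer note: the per-dimension output length used in `unfold` above is the standard convolution formula. A quick numeric check:

```python
def out_len(in_len, k, p=0, d=1, s=1):
    # floor((in + 2*p - d*(k-1) - 1) / s) + 1
    return (in_len + 2 * p - d * (k - 1) - 1) // s + 1

# 4x4 input, 2x2 kernel, stride 2: 2 positions per spatial dim, so L = 2 * 2 = 4.
assert out_len(4, k=2, s=2) == 2
assert out_len(4, k=3, p=1, d=2) == 2  # dilation widens the effective kernel to 5
```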
diff --git a/src/infinicore/ops/unfold/unfold_infiniop.cc b/src/infinicore/ops/unfold/unfold_infiniop.cc
new file mode 100644
index 000000000..b75aadf53
--- /dev/null
+++ b/src/infinicore/ops/unfold/unfold_infiniop.cc
@@ -0,0 +1,101 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/unfold.hpp"
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+
+namespace infinicore::op::unfold_impl::infiniop {
+
+// Descriptor cache.
+thread_local common::OpCache<size_t, infiniopUnfoldDescriptor_t> caches(
+    100, // capacity
+    [](infiniopUnfoldDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyUnfoldDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input,
+               const std::vector<int64_t> &kernel_sizes,
+               const std::vector<int64_t> &dilations,
+               const std::vector<int64_t> &paddings,
+               const std::vector<int64_t> &strides) {
+
+    // 1. Build the hash key. Tensors have a hash_combine overload, but the
+    //    vector parameters are hashed element by element because hash.hpp
+    //    does not support std::vector directly.
+    size_t seed = 0;
+
+    hash_combine(seed, output);
+    hash_combine(seed, input);
+
+    for (auto v : kernel_sizes) hash_combine(seed, v);
+    for (auto v : dilations) hash_combine(seed, v);
+    for (auto v : paddings) hash_combine(seed, v);
+    for (auto v : strides) hash_combine(seed, v);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopUnfoldDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        // 2. Create the descriptor.
+
+        // Convert the int64_t vectors to int vectors to match the C API's
+        // `const int *` parameters.
+        auto to_int_vec = [](const std::vector<int64_t> &src) {
+            std::vector<int> dst(src.size());
+            std::transform(src.begin(), src.end(), dst.begin(),
+                           [](int64_t val) { return static_cast<int>(val); });
+            return dst;
+        };
+
+        std::vector<int> k_int = to_int_vec(kernel_sizes);
+        std::vector<int> s_int = to_int_vec(strides);
+        std::vector<int> p_int = to_int_vec(paddings);
+        std::vector<int> d_int = to_int_vec(dilations);
+
+        INFINICORE_CHECK_ERROR(infiniopCreateUnfoldDescriptor(
+            context::getInfiniopHandle(output->device()),
+            &desc,
+            output->desc(),
+            input->desc(),
+            k_int.data(),
+            s_int.data(),
+            p_int.data(),
+            d_int.data()));
+
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // 3. Query the workspace and run.
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetUnfoldWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopUnfold(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        context::getStream()));
+}
+
+// 4. Register this implementation.
+static bool registered = []() {
+    Unfold::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::unfold_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/ops/vander/vander.cc b/src/infinicore/ops/vander/vander.cc
new file mode 100644
index 000000000..5f3d8dd29
--- /dev/null
+++ b/src/infinicore/ops/vander/vander.cc
@@ -0,0 +1,36 @@
+#include "infinicore/ops/vander.hpp"
+
+namespace infinicore::op {
+
+// 1. Dispatcher singleton.
+common::OpDispatcher<Vander::schema> &Vander::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+// 2. Execute: dispatch to the implementation for the current device.
+void Vander::execute(Tensor output, Tensor input, int64_t N, bool increasing) {
+    dispatcher().lookup(context::getDevice().getType())(output, input, N, increasing);
+}
+
+// 3. Functional interface.
+Tensor vander(Tensor input, int64_t N, bool increasing) {
+    int64_t input_size = input->shape()[0];
+    int64_t cols = (N > 0) ? N : input_size; // N == 0 means a square matrix
+    Shape output_shape = {
+        static_cast<size_t>(input_size),
+        static_cast<size_t>(cols)};
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+
+    vander_(output, input, N, increasing);
+    return output;
+}
+
+// 4. Explicit-output interface.
+void vander_(Tensor output, Tensor input, int64_t N, bool increasing) {
+    Vander::execute(output, input, N, increasing);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/vander/vander_infiniop.cc b/src/infinicore/ops/vander/vander_infiniop.cc
new file mode 100644
index 000000000..f0ab7b30f
--- /dev/null
+++ b/src/infinicore/ops/vander/vander_infiniop.cc
@@ -0,0 +1,70 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/vander.hpp"
+#include <cstdint>
+
+namespace infinicore::op::vander_impl::infiniop {
+
+// Descriptor cache.
+thread_local common::OpCache<size_t, infiniopVanderDescriptor_t> caches(
+    100, // capacity
+    [](infiniopVanderDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyVanderDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, int64_t N, bool increasing) {
+    // 1. Build the hash key directly from the tensors and the scalar
+    //    parameters N and increasing.
+    size_t seed = hash_combine(output, input, N, increasing);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopVanderDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        // 2. Create the descriptor. N is narrowed from int64_t to int and
+        //    increasing from bool to int to match the C API signature.
+        INFINICORE_CHECK_ERROR(infiniopCreateVanderDescriptor(
+            context::getInfiniopHandle(output->device()),
+            &desc,
+            output->desc(),
+            input->desc(),
+            static_cast<int>(N),
+            static_cast<int>(increasing)));
+
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // 3. Query the workspace and run.
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetVanderWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopVander(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        context::getStream()));
+}
+
+// 4. Register this implementation.
+static bool registered = []() {
+    Vander::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::vander_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 431c3a37b..1eeb46ef4 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -18,7 +18,11 @@
 #include "ops/rope.hpp"
 #include "ops/silu.hpp"
 #include "ops/swiglu.hpp"
-
+#include "ops/logcumsumexp.hpp"
+#include "ops/logical_and.hpp"
+#include "ops/logical_not.hpp"
+#include "ops/unfold.hpp"
+#include "ops/vander.hpp"
 namespace py = pybind11;
 
 namespace infinicore::ops {
@@ -38,6 +42,11 @@
     bind_rms_norm(m);
     bind_silu(m);
     bind_swiglu(m);
+    bind_logcumsumexp(m);
+    bind_logical_and(m);
+    bind_logical_not(m);
+    bind_unfold(m);
+    bind_vander(m);
     bind_rope(m);
     bind_embedding(m);
 }
diff --git a/src/infinicore/pybind11/ops/logcumsumexp.hpp b/src/infinicore/pybind11/ops/logcumsumexp.hpp
new file mode 100644
index 000000000..3359cfe56
--- /dev/null
+++ b/src/infinicore/pybind11/ops/logcumsumexp.hpp
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/logcumsumexp.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_logcumsumexp(py::module &m) {
+    // Out-of-place binding (returns a new Tensor).
+    m.def("logcumsumexp",
+          &op::logcumsumexp,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("exclusive") = false,
+          py::arg("reverse") = false,
+          R"doc(Computes the logarithm of the cumulative summation of the exponentiation of elements.)doc");
+
+    // Explicit-output binding.
+    m.def("logcumsumexp_",
+          &op::logcumsumexp_,
+          py::arg("out"),
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("exclusive") = false,
+          py::arg("reverse") = false,
+          R"doc(Explicit-output version of logcumsumexp.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
py::arg("other"), + R"doc(Computes the element-wise logical AND of the given input tensors.)doc"); + + // 绑定底层输出指定函数: logical_and_(output, input, other) + // 对应 Python 调用: _infinicore.logical_and_(out, input, other) + m.def("logical_and_", + &op::logical_and_, + py::arg("output"), + py::arg("input"), + py::arg("other"), + R"doc(Explicit output logical AND computation.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/logical_not.hpp b/src/infinicore/pybind11/ops/logical_not.hpp new file mode 100644 index 000000000..a400dfea6 --- /dev/null +++ b/src/infinicore/pybind11/ops/logical_not.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include +#include "infinicore/ops/logical_not.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_logical_not(py::module &m) { + // Out-of-place: output = logical_not(input) + m.def("logical_not", + &op::logical_not, + py::arg("input"), + R"doc(Logical NOT of the tensor.)doc"); + + // In-place / Explicit Output: logical_not_(output, input) + // 对应 C++: void logical_not_(Tensor output, Tensor input) + m.def("logical_not_", + &op::logical_not_, + py::arg("output"), + py::arg("input"), + R"doc(In-place logical NOT computation.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/unfold.hpp b/src/infinicore/pybind11/ops/unfold.hpp new file mode 100644 index 000000000..43c1aa374 --- /dev/null +++ b/src/infinicore/pybind11/ops/unfold.hpp @@ -0,0 +1,59 @@ +#pragma once + +#include +#include +#include "infinicore/ops/unfold.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_unfold(py::module &m) { + // ------------------------------------------------------------------------- + // 1. 绑定函数式接口 (unfold) + // ------------------------------------------------------------------------- + m.def("unfold", + [](const Tensor& input, + std::vector kernel_sizes, + std::vector dilations, + std::vector paddings, + std::vector strides) { + return op::unfold(input, kernel_sizes, dilations, paddings, strides); + }, + py::arg("input"), + py::arg("kernel_sizes"), + py::arg("dilations"), + py::arg("paddings"), + py::arg("strides"), + R"doc(Extracts sliding local blocks from a batched input tensor. + + Args: + input (Tensor): The input tensor. + kernel_sizes (List[int]): The size of the sliding blocks. + dilations (List[int]): The parameter that controls the stride of elements within the neighborhood. + paddings (List[int]): Implicit zero padding to be added on both sides of input. + strides (List[int]): The stride of the sliding blocks. + )doc"); + + // ------------------------------------------------------------------------- + // 2. 绑定 in-place 接口 (unfold_) + // ------------------------------------------------------------------------- + m.def("unfold_", + [](Tensor& output, + const Tensor& input, + std::vector kernel_sizes, + std::vector dilations, + std::vector paddings, + std::vector strides) { + op::unfold_(output, input, kernel_sizes, dilations, paddings, strides); + }, + py::arg("output"), + py::arg("input"), + py::arg("kernel_sizes"), + py::arg("dilations"), + py::arg("paddings"), + py::arg("strides"), + R"doc(Explicit output Unfold operation. 
Writes the result into the output tensor.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/vander.hpp b/src/infinicore/pybind11/ops/vander.hpp new file mode 100644 index 000000000..bfbb1acf0 --- /dev/null +++ b/src/infinicore/pybind11/ops/vander.hpp @@ -0,0 +1,40 @@ +#pragma once + +#include +#include "infinicore/ops/vander.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_vander(py::module &m) { + m.def("vander", + [](const Tensor& input, int64_t N, bool increasing) { + return op::vander(input, N, increasing); + }, + py::arg("input"), + py::arg("N") = 0, + py::arg("increasing") = false, + R"doc(Generates a Vandermonde matrix. + + Args: + input (Tensor): 1-D input tensor. + N (int, optional): Number of columns in the output. If 0, defaults to input size (square matrix). Default: 0. + increasing (bool, optional): Order of the powers. If True, powers increase (x^0, x^1...). Default: False. + )doc"); + + // ------------------------------------------------------------------------- + // 2. 绑定 in-place 接口 (vander_) + // ------------------------------------------------------------------------- + m.def("vander_", + [](Tensor& output, const Tensor& input, int64_t N, bool increasing) { + op::vander_(output, input, N, increasing); + }, + py::arg("output"), + py::arg("input"), + py::arg("N") = 0, + py::arg("increasing") = false, + R"doc(Explicit output Vander operation. Writes the result into the output tensor.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infiniop/ops/logcumsumexp/cpu/logcumsumexp_cpu.cc b/src/infiniop/ops/logcumsumexp/cpu/logcumsumexp_cpu.cc new file mode 100644 index 000000000..fb208d4f2 --- /dev/null +++ b/src/infiniop/ops/logcumsumexp/cpu/logcumsumexp_cpu.cc @@ -0,0 +1,145 @@ +#include "logcumsumexp_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include +#include +#include +#include + +#include "../../../../utils/custom_types.h" + +namespace op::logcumsumexp::cpu { + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + _opaque = nullptr; + } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int axis, + int exclusive, + int reverse) { + + auto handle = reinterpret_cast(handle_); + + // 注意:这里复用了你之前修改过的 Info 类,它现在包含正确的 stride 信息 + auto result = LogCumSumExpInfo::create(y_desc, x_desc, axis, exclusive, reverse); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + new Opaque(), + result.take(), + 0, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +template +void calculate_cpu_impl( + const LogCumSumExpInfo &info, + void *y, + const void *x) { + + size_t outer_size = info.outer_size(); + size_t axis_size = info.axis_size(); + size_t inner_size = info.inner_size(); + bool exclusive = info.exclusive(); + bool reverse = info.reverse(); + + // 获取 Info 中存储的物理 stride + size_t x_outer_stride = info._x_outer_stride; + size_t x_axis_stride = info._x_axis_stride; + size_t x_inner_stride = info._x_inner_stride; + + size_t y_outer_stride = info._y_outer_stride; + size_t y_axis_stride = info._y_axis_stride; + size_t y_inner_stride = info._y_inner_stride; + + auto y_ptr = reinterpret_cast(y); + auto x_ptr = reinterpret_cast(x); + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t i = 0; i < outer_size; ++i) { + for (size_t 
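Reviewer note: the CPU loop above (and the GPU kernels below) maintain the pair (running max m, rescaled sum s) with the invariant m + log(s) == logsumexp of everything seen so far. A self-check against the naive formula:

```python
import math

def streaming_lse(values):
    m, s, out = float("-inf"), 0.0, []
    for v in values:
        if v > m:
            s = s * math.exp(m - v) + 1.0  # rescale the old sum to the new max
            m = v
        else:
            s += math.exp(v - m)
        out.append(m + math.log(s))
    return out

vals = [0.5, -1.0, 3.0, 2.0]
naive = [math.log(sum(math.exp(v) for v in vals[: i + 1])) for i in range(len(vals))]
assert all(abs(a - b) < 1e-12 for a, b in zip(streaming_lse(vals), naive))
```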
diff --git a/src/infiniop/ops/logcumsumexp/cpu/logcumsumexp_cpu.h b/src/infiniop/ops/logcumsumexp/cpu/logcumsumexp_cpu.h
new file mode 100644
index 000000000..ce1b2f2f1
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/cpu/logcumsumexp_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __LOGCUMSUMEXP_CPU_H__
+#define __LOGCUMSUMEXP_CPU_H__
+
+#include "../logcumsumexp.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __LOGCUMSUMEXP_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/cuda/kernel.cuh b/src/infiniop/ops/logcumsumexp/cuda/kernel.cuh
new file mode 100644
index 000000000..2bf93f27f
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/cuda/kernel.cuh
@@ -0,0 +1,124 @@
+#ifndef __LOGCUMSUMEXP_CUDA_CUH__
+#define __LOGCUMSUMEXP_CUDA_CUH__
+
+#include <cmath>
+
+#if defined ENABLE_METAX_API
+    // NOTE: the MetaX header names are reconstructed; the flattened patch lost them.
+    #include <maca_fp16.h>
+    #include <maca_bfloat16.h>
+#else
+    #include <cuda_fp16.h>
+    #include <cuda_bf16.h>
+#endif
+
+#include <cstddef>
+#include <cstdint>
+
+namespace op::logcumsumexp::cuda {
+
+// ============================================================
+// Numerically stable LogSumExp prefix state,
+// equivalent to log(sum(exp(x[0:i]))).
+// ============================================================
+
+struct LSEState {
+    float m; // running max
+    float s; // sum(exp(x - m))
+
+    // Mathematical identity element: log(0) = -inf
+    __device__ __forceinline__ static LSEState identity() {
+        return {-INFINITY, 0.0f};
+    }
+
+    // Prefix update.
+    __device__ __forceinline__ void update(float v) {
+        if (m == -INFINITY) {
+            // first element
+            m = v;
+            s = 1.0f;
+        } else if (v > m) {
+            // the max changed; rescale the accumulated sum
+            s = s * expf(m - v) + 1.0f;
+            m = v;
+        } else {
+            s += expf(v - m);
+        }
+    }
+
+    // Current log-sum-exp value.
+    __device__ __forceinline__ float value() const {
+        return (s == 0.0f) ? -INFINITY : (m + logf(s));
+    }
+};
+
+// ============================================================
+// Kernel: one thread owns one (outer, inner) prefix vector.
+// ============================================================
+
+template <typename T>
+__global__ void logcumsumexp_kernel(
+    T *__restrict__ y,
+    const T *__restrict__ x,
+
+    size_t outer_size,
+    size_t axis_size,
+    size_t inner_size,
+
+    size_t x_axis_stride,
+    size_t x_inner_stride,
+    size_t x_outer_stride,
+
+    size_t y_axis_stride,
+    size_t y_inner_stride,
+    size_t y_outer_stride,
+
+    bool exclusive,
+    bool reverse) {
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t num_vec = outer_size * inner_size;
+    if (tid >= num_vec) return;
+
+    size_t o = tid / inner_size;
+    size_t i = tid % inner_size;
+
+    // base offsets (stride-correct)
+    size_t x_base = o * x_outer_stride + i * x_inner_stride;
+    size_t y_base = o * y_outer_stride + i * y_inner_stride;
+
+    LSEState state = LSEState::identity();
+
+    for (size_t k = 0; k < axis_size; ++k) {
+        size_t kk = reverse ? (axis_size - 1 - k) : k;
+
+        size_t x_off = x_base + kk * x_axis_stride;
+        size_t y_off = y_base + kk * y_axis_stride;
+
+        float v = static_cast<float>(x[x_off]);
+
+        if (exclusive) {
+            // y[i] = log(sum(exp(x[:i])))
+            y[y_off] = static_cast<T>(state.value());
+            state.update(v);
+        } else {
+            // y[i] = log(sum(exp(x[:i+1])))
+            state.update(v);
+            y[y_off] = static_cast<T>(state.value());
+        }
+
+        // ===== debugging aid (enable when needed) =====
+        /*
+        if (o == 0 && i == 0 && k < 5) {
+            printf("[CUDA] k=%zu v=%f m=%f s=%f out=%f\n",
+                   k, v, state.m, state.s,
+                   static_cast<float>(y[y_off]));
+        }
+        */
+    }
+}
+
+} // namespace op::logcumsumexp::cuda
+
+#endif // __LOGCUMSUMEXP_CUDA_CUH__
diff --git a/src/infiniop/ops/logcumsumexp/info.h b/src/infiniop/ops/logcumsumexp/info.h
new file mode 100644
index 000000000..4a220f524
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/info.h
@@ -0,0 +1,152 @@
+#ifndef __LOGCUMSUMEXP_INFO_H__
+#define __LOGCUMSUMEXP_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <cstddef>
+
+namespace op::logcumsumexp {
+
+class LogCumSumExpInfo {
+    LogCumSumExpInfo() = default;
+
+public:
+    int _dtype;
+    int _axis;
+    bool _exclusive;
+    bool _reverse;
+
+    size_t _outer_size;
+    size_t _axis_size;
+    size_t _inner_size;
+
+    size_t _x_axis_stride;
+    size_t _x_inner_stride;
+    size_t _x_outer_stride;
+
+    size_t _y_axis_stride;
+    size_t _y_inner_stride;
+    size_t _y_outer_stride;
+
+    int dtype() const { return _dtype; }
+    int axis() const { return _axis; }
+    bool exclusive() const { return _exclusive; }
+    bool reverse() const { return _reverse; }
+    size_t outer_size() const { return _outer_size; }
+    size_t axis_size() const { return _axis_size; }
+    size_t inner_size() const { return _inner_size; }
+
+    LogCumSumExpInfo(
+        int dtype,
+        int axis,
+        bool exclusive,
+        bool reverse,
+        size_t outer,
+        size_t axis_len,
+        size_t inner,
+        size_t x_as,
+        size_t x_is,
+        size_t x_os,
+        size_t y_as,
+        size_t y_is,
+        size_t y_os)
+        : _dtype(dtype),
+          _axis(axis),
+          _exclusive(exclusive),
+          _reverse(reverse),
+          _outer_size(outer),
+          _axis_size(axis_len),
+          _inner_size(inner),
+          _x_axis_stride(x_as),
+          _x_inner_stride(x_is),
+          _x_outer_stride(x_os),
+          _y_axis_stride(y_as),
+          _y_inner_stride(y_is),
+          _y_outer_stride(y_os) {}
+
+    static utils::Result<LogCumSumExpInfo> create(
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        int axis,
+        int exclusive,
+        int reverse) {
+
+        if (y_desc->ndim() != x_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        size_t ndim = x_desc->ndim();
+        for (size_t i = 0; i < ndim; ++i) {
+            if (y_desc->shape()[i] != x_desc->shape()[i]) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        if (y_desc->dtype() != x_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        if (axis < 0 || static_cast<size_t>(axis) >= ndim) {
+            return INFINI_STATUS_BAD_PARAM;
+        }
+
+        // ================================
+        // Logical dimension sizes
+        // ================================
+        size_t outer = 1;
+        for (size_t i = 0; i < static_cast<size_t>(axis); ++i) {
+            outer *= x_desc->shape()[i];
+        }
+
+        size_t axis_len = x_desc->shape()[axis];
+
+        size_t inner = 1;
+        for (size_t i = static_cast<size_t>(axis) + 1; i < ndim; ++i) {
+            inner *= x_desc->shape()[i];
+        }
+
+        // ================================
+        // Physical stride computation (fixed)
+        // ================================
+
+        // 1. Axis stride
+        size_t x_axis_stride = static_cast<size_t>(x_desc->stride(axis));
+        size_t y_axis_stride = static_cast<size_t>(y_desc->stride(axis));
+
+        // 2. Inner stride
+        // [Key fix]
+        // The kernels treat the inner dims as one flattened 1-D range
+        // (0 .. inner-1). For a contiguous tensor that range is dense in
+        // memory, so the step between consecutive inner elements is exactly 1
+        // no matter how many logical dimensions it spans. The previous code
+        // used stride(axis + 1), which skipped data whenever the inner part
+        // covered more than one dimension.
+        size_t x_inner_stride = 1;
+        size_t y_inner_stride = 1;
+
+        // 3. Outer stride
+        // With axis == 0 there is a single outer block, so the stride is set
+        // to 0. Otherwise the stride of the dimension just before `axis`
+        // spans one whole axis+inner block.
+        size_t x_outer_stride = (axis == 0) ? 0 : static_cast<size_t>(x_desc->stride(axis - 1));
+        size_t y_outer_stride = (axis == 0) ? 0 : static_cast<size_t>(y_desc->stride(axis - 1));
+
+        return utils::Result<LogCumSumExpInfo>(LogCumSumExpInfo{
+            x_desc->dtype(),
+            axis,
+            static_cast<bool>(exclusive),
+            static_cast<bool>(reverse),
+            outer,
+            axis_len,
+            inner,
+            x_axis_stride,
+            x_inner_stride,
+            x_outer_stride,
+            y_axis_stride,
+            y_inner_stride,
+            y_outer_stride});
+    }
+};
+
+} // namespace op::logcumsumexp
+
+#endif // __LOGCUMSUMEXP_INFO_H__
\ No newline at end of file
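Reviewer note: `LogCumSumExpInfo` collapses the tensor into (outer, axis, inner) blocks; for a contiguous tensor the element offset factors as `o * stride(axis-1) + k * stride(axis) + i`, with the flattened inner index stepping by 1. Verified numerically:

```python
import numpy as np

shape, axis = (2, 3, 4, 5), 1
x = np.arange(np.prod(shape)).reshape(shape)
strides = [s // x.itemsize for s in x.strides]  # element strides: [60, 20, 5, 1]

o, k, i = 1, 2, 7  # inner index 7 flattens the trailing (4, 5) dims to (1, 2)
off = o * strides[axis - 1] + k * strides[axis] + i
assert x.flat[off] == x[1, 2, 1, 2]
```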
diff --git a/src/infiniop/ops/logcumsumexp/logcumsumexp.h b/src/infiniop/ops/logcumsumexp/logcumsumexp.h
new file mode 100644
index 000000000..51f1a01aa
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/logcumsumexp.h
@@ -0,0 +1,50 @@
+#ifndef __LOGCUMSUMEXP_H__
+#define __LOGCUMSUMEXP_H__
+
+#include "../../operator.h"
+#include "info.h" // LogCumSumExpInfo
+
+// Generates the Descriptor class for a given backend namespace
+// (e.g. cpu, nvidia, metax).
+#define DESCRIPTOR(NAMESPACE)                                         \
+    namespace op::logcumsumexp::NAMESPACE {                           \
+    class Descriptor final : public InfiniopDescriptor {              \
+        struct Opaque;                                                \
+        Opaque *_opaque;        /* backend-private implementation */  \
+        LogCumSumExpInfo _info; /* validated tensor/op metadata */    \
+        size_t _workspace_size; /* cached workspace requirement */    \
+                                                                      \
+        Descriptor(                                                   \
+            Opaque *opaque,                                           \
+            LogCumSumExpInfo info,                                    \
+            size_t workspace_size,                                    \
+            infiniDevice_t device_type,                               \
+            int device_id)                                            \
+            : InfiniopDescriptor{device_type, device_id},             \
+              _opaque(opaque),                                        \
+              _info(info),                                            \
+              _workspace_size(workspace_size) {}                      \
+                                                                      \
+    public:                                                           \
+        ~Descriptor();                                                \
+                                                                      \
+        size_t workspaceSize() const { return _workspace_size; }      \
+                                                                      \
+        static infiniStatus_t create(                                 \
+            infiniopHandle_t handle,                                  \
+            Descriptor **desc_ptr,                                    \
+            infiniopTensorDescriptor_t y_desc,                        \
+            infiniopTensorDescriptor_t x_desc,                        \
+            int axis,                                                 \
+            int exclusive,                                            \
+            int reverse);                                             \
+                                                                      \
+        infiniStatus_t calculate(                                     \
+            void *workspace,                                          \
+            size_t workspace_size,                                    \
+            void *y,                                                  \
+            const void *x,                                            \
+            void *stream) const;                                      \
+    };                                                                \
+    }
+
+#endif // __LOGCUMSUMEXP_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.h b/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.h
new file mode 100644
index 000000000..a681f402f
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.h
@@ -0,0 +1,8 @@
+#ifndef __LOGCUMSUMEXP_METAX_API_H__
+#define __LOGCUMSUMEXP_METAX_API_H__
+
+#include "../logcumsumexp.h"
+
+DESCRIPTOR(metax)
+
+#endif // __LOGCUMSUMEXP_METAX_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.h b/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.h
new file mode 100644
index 000000000..a681f402f
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.h
@@ -0,0 +1,8 @@
+#ifndef __LOGCUMSUMEXP_METAX_API_H__
+#define __LOGCUMSUMEXP_METAX_API_H__
+
+#include "../logcumsumexp.h"
+
+DESCRIPTOR(metax)
+
+#endif // __LOGCUMSUMEXP_METAX_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.maca b/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.maca
new file mode 100644
index 000000000..6927f5623
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.maca
@@ -0,0 +1,209 @@
+#include "logcumsumexp_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include <maca_fp16.h>
+#include <maca_bfloat16.h>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+
+namespace op::logcumsumexp::metax {
+
+
+struct LSEState {
+    float m; // running max
+    float s; // sum(exp(x - m))
+
+    // Mathematical identity element: log(0) = -inf
+    __device__ __forceinline__ static LSEState identity() {
+        return { -INFINITY, 0.0f };
+    }
+
+    // Prefix update rule
+    __device__ __forceinline__ void update(float v) {
+        if (m == -INFINITY) {
+            // first valid value
+            m = v;
+            s = 1.0f;
+        } else if (v > m) {
+            // a new max appeared; rescale the previous sum:
+            // s_new = s_old * exp(old_m - new_v) + 1
+            s = s * expf(m - v) + 1.0f;
+            m = v;
+        } else {
+            // accumulate one more term
+            s += expf(v - m);
+        }
+    }
+
+    // Current log-sum-exp value: m + log(s)
+    __device__ __forceinline__ float value() const {
+        return (s == 0.0f) ? -INFINITY : (m + logf(s));
+    }
+};
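Note (illustrative, not part of the patch): the rescaling in LSEState::update keeps every exp() argument non-positive, so the running sum never overflows even when the raw exp would. A host-side spot check of the recurrence against the closed form:

    // For x = (1, 100, -3, 100):
    //   log(e^1 + e^100 + e^-3 + e^100) = 100 + log(e^-99 + 2 + e^-103) ≈ 100.6931
    // (a naive float sum of exp() would overflow at e^100)
    #include <cmath>
    #include <cstdio>
    int main() {
        const float xs[4] = {1.0f, 100.0f, -3.0f, 100.0f};
        float m = -INFINITY, s = 0.0f; // same state variables as LSEState
        for (float v : xs) {
            if (m == -INFINITY) { m = v; s = 1.0f; }
            else if (v > m)     { s = s * std::exp(m - v) + 1.0f; m = v; }
            else                { s += std::exp(v - m); }
        }
        std::printf("%f\n", m + std::log(s)); // prints ~100.693147
        return 0;
    }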
+
+
+template <typename T>
+__global__ void logcumsumexp_kernel(
+    T* __restrict__ y,
+    const T* __restrict__ x,
+
+    size_t outer_size,
+    size_t axis_size,
+    size_t inner_size,
+
+    size_t x_axis_stride,
+    size_t x_inner_stride,
+    size_t x_outer_stride,
+
+    size_t y_axis_stride,
+    size_t y_inner_stride,
+    size_t y_outer_stride,
+
+    bool exclusive,
+    bool reverse
+) {
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t num_vec = outer_size * inner_size;
+    if (tid >= num_vec) return;
+
+    // Current outer (o) and inner (i) indices
+    size_t o = tid / inner_size;
+    size_t i = tid % inner_size;
+
+    // Base offsets
+    size_t x_base = o * x_outer_stride + i * x_inner_stride;
+    size_t y_base = o * y_outer_stride + i * y_inner_stride;
+
+    LSEState state = LSEState::identity();
+
+    // Scan along the axis dimension
+    for (size_t k = 0; k < axis_size; ++k) {
+        // reverse: accumulate from the back toward the front
+        size_t kk = reverse ? (axis_size - 1 - k) : k;
+
+        size_t x_off = x_base + kk * x_axis_stride;
+        size_t y_off = y_base + kk * y_axis_stride;
+
+        // Compute uniformly in float
+        float v = static_cast<float>(x[x_off]);
+
+        if (exclusive) {
+            // Exclusive: write the current state first, then update.
+            // y[0] = log(0) = -inf
+            y[y_off] = static_cast<T>(state.value());
+            state.update(v);
+        } else {
+            // Inclusive: update first, then write.
+            // y[0] = x[0]
+            state.update(v);
+            y[y_off] = static_cast<T>(state.value());
+        }
+    }
+}
+
+
+template <typename T>
+void launch_kernel(
+    void* y,
+    const void* x,
+    const LogCumSumExpInfo& info,
+    void* stream) {
+
+    auto x_ptr = reinterpret_cast<const T*>(x);
+    auto y_ptr = reinterpret_cast<T*>(y);
+    auto mc_stream = reinterpret_cast<mcStream_t>(stream);
+
+    size_t outer = info.outer_size();
+    size_t axis = info.axis_size();
+    size_t inner = info.inner_size();
+
+    size_t total = outer * inner;
+    constexpr size_t block = 256;
+    size_t grid = (total + block - 1) / block;
+
+    logcumsumexp_kernel<T>
+        <<<grid, block, 0, mc_stream>>>(
+            y_ptr,
+            x_ptr,
+            outer,
+            axis,
+            inner,
+
+            info._x_axis_stride,
+            info._x_inner_stride,
+            info._x_outer_stride,
+
+            info._y_axis_stride,
+            info._y_inner_stride,
+            info._y_outer_stride,
+
+            info.exclusive(),
+            info.reverse()
+        );
+}
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor** desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int axis,
+    int exclusive,
+    int reverse) {
+
+    auto handle = reinterpret_cast<device::metax::Handle*>(handle_);
+
+    auto info_result =
+        LogCumSumExpInfo::create(y_desc, x_desc, axis, exclusive, reverse);
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        info_result.take(),
+        /*workspace*/ 0,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void* workspace,
+    size_t workspace_size,
+    void* y,
+    const void* x,
+    void* stream) const {
+
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<__half>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+#if defined(__MACA__) || defined(__MACACC__)
+        launch_kernel<__maca_bfloat16>(y, x, _info, stream);
+#endif
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(y, x, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::logcumsumexp::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore.h b/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore.h
new file mode 100644
index 000000000..1d9c9699f
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore.h
@@ -0,0 +1,6 @@
+#ifndef __LOGCUMSUMEXP_MOORE_API_H__
+#define __LOGCUMSUMEXP_MOORE_API_H__
+#include "../logcumsumexp.h"
+DESCRIPTOR(moore)
+
+#endif // __LOGCUMSUMEXP_MOORE_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore.mu b/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore.mu
new file mode 100644
index 000000000..4be0cfa78
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore.mu
@@ -0,0 +1,118 @@
+#include "logcumsumexp_moore.h"
+#include "logcumsumexp_moore_kernel.h"
+#include "../../../devices/moore/moore_handle.h"
+#include <musa_runtime.h>
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+
+namespace op::logcumsumexp::moore {
+
+// ============================================================
+// Kernel Launch
+// ============================================================
+
+template <typename T>
+void launch_kernel(
+    void* y,
+    const void* x,
+    const LogCumSumExpInfo& info,
+    void* stream) {
+
+    auto x_ptr = reinterpret_cast<const T*>(x);
+    auto y_ptr = reinterpret_cast<T*>(y);
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+
+    size_t outer = info.outer_size();
+    size_t axis = info.axis_size();
+    size_t inner = info.inner_size();
+
+    size_t total = outer * inner;
+    constexpr size_t block = 256;
+    size_t grid = (total + block - 1) / block;
+
+    op::logcumsumexp::moore::logcumsumexp_kernel<T>
+        <<<grid, block, 0, musa_stream>>>(
+            y_ptr,
+            x_ptr,
+            outer,
+            axis,
+            inner,
+
+            info._x_axis_stride,
+            info._x_inner_stride,
+            info._x_outer_stride, // added: outer stride
+
+            info._y_axis_stride,
+            info._y_inner_stride,
+            info._y_outer_stride, // added: outer stride
+
+            info.exclusive(),
+            info.reverse()
+        );
+}
+
+// ============================================================
+// Descriptor
+// ============================================================
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor** desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int axis,
+    int exclusive,
+    int reverse) {
+
+    auto handle = reinterpret_cast<device::moore::Handle*>(handle_);
+
+    auto info_result =
+        LogCumSumExpInfo::create(y_desc, x_desc, axis, exclusive, reverse);
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        info_result.take(),
+        /*workspace*/ 0,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void* workspace,
+    size_t workspace_size,
+    void* y,
+    const void* x,
+    void* stream) const {
+
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        // The Moore architecture typically uses __mt_bfloat16
+        launch_kernel<__mt_bfloat16>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(y, x, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::logcumsumexp::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore_kernel.h b/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore_kernel.h
new file mode 100644
index 000000000..086a67493
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore_kernel.h
@@ -0,0 +1,110 @@
+#ifndef __LOGCUMSUMEXP_MOORE_KERNEL_H__
+#define __LOGCUMSUMEXP_MOORE_KERNEL_H__
+
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <cmath>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace op::logcumsumexp::moore {
+
+// ============================================================
+// Numerically stable LogSumExp prefix state
+// Equivalent to: log(sum(exp(x[0:i])))
+// ============================================================
+
+struct LSEState {
+    float m; // running max
+    float s; // sum(exp(x - m))
+
+    // Mathematical identity element: log(0) = -inf
+    __device__ __forceinline__ static LSEState identity() {
+        return { -INFINITY, 0.0f };
+    }
+
+    // Prefix update
+    __device__ __forceinline__ void update(float v) {
+        if (m == -INFINITY) {
+            // first element
+            m = v;
+            s = 1.0f;
+        } else if (v > m) {
+            // the max changed; rescale
+            s = s * expf(m - v) + 1.0f;
+            m = v;
+        } else {
+            s += expf(v - m);
+        }
+    }
+
+    // Current log-sum-exp value
+    __device__ __forceinline__ float value() const {
+        return (s == 0.0f) ? -INFINITY : (m + logf(s));
+    }
+};
+
+// ============================================================
+// Kernel: one thread handles one (outer, inner) prefix vector
+// ============================================================
+
+template <typename T>
+__global__ void logcumsumexp_kernel(
+    T* __restrict__ y,
+    const T* __restrict__ x,
+
+    size_t outer_size,
+    size_t axis_size,
+    size_t inner_size,
+
+    size_t x_axis_stride,
+    size_t x_inner_stride,
+    size_t x_outer_stride,
+
+    size_t y_axis_stride,
+    size_t y_inner_stride,
+    size_t y_outer_stride,
+
+    bool exclusive,
+    bool reverse
+) {
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t num_vec = outer_size * inner_size;
+    if (tid >= num_vec) return;
+
+    size_t o = tid / inner_size;
+    size_t i = tid % inner_size;
+
+    // base offsets (stride-aware)
+    size_t x_base = o * x_outer_stride + i * x_inner_stride;
+    size_t y_base = o * y_outer_stride + i * y_inner_stride;
+
+    LSEState state = LSEState::identity();
+
+    for (size_t k = 0; k < axis_size; ++k) {
+        size_t kk = reverse ? (axis_size - 1 - k) : k;
+
+        size_t x_off = x_base + kk * x_axis_stride;
+        size_t y_off = y_base + kk * y_axis_stride;
+
+        // Note: static_cast turns half/__mt_bfloat16 into float here. The MUSA
+        // compiler generally supports this conversion; if a particular type does
+        // not, replace it with __half2float(val) or the matching conversion intrinsic.
+        float v = static_cast<float>(x[x_off]);
+
+        if (exclusive) {
+            // y[i] = log(sum(exp(x[:i])))
+            y[y_off] = static_cast<T>(state.value());
+            state.update(v);
+        } else {
+            // y[i] = log(sum(exp(x[:i+1])))
+            state.update(v);
+            y[y_off] = static_cast<T>(state.value());
+        }
+    }
+}
+
+} // namespace op::logcumsumexp::moore
+
+#endif // __LOGCUMSUMEXP_MOORE_KERNEL_H__
\ No newline at end of file
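Note (illustrative): for a length-2 input x = (x0, x1) along the scan axis, the modes above produce

    inclusive:            y = (x0, log(exp(x0) + exp(x1)))
    exclusive:            y = (-inf, x0)
    reverse (inclusive):  y = (log(exp(x0) + exp(x1)), x1)

which matches the write-then-update vs update-then-write ordering in the kernel loop.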
diff --git a/src/infiniop/ops/logcumsumexp/nvidia/logcumsumexp_nvidia.cu b/src/infiniop/ops/logcumsumexp/nvidia/logcumsumexp_nvidia.cu
new file mode 100644
index 000000000..967be0bf0
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/nvidia/logcumsumexp_nvidia.cu
@@ -0,0 +1,113 @@
+#include "logcumsumexp_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include <cuda_runtime.h>
+
+namespace op::logcumsumexp::nvidia {
+
+// ============================================================
+// Kernel Launch
+// ============================================================
+
+template <typename T>
+void launch_kernel(
+    void* y,
+    const void* x,
+    const LogCumSumExpInfo& info,
+    void* stream) {
+
+    auto x_ptr = reinterpret_cast<const T*>(x);
+    auto y_ptr = reinterpret_cast<T*>(y);
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    size_t outer = info.outer_size();
+    size_t axis = info.axis_size();
+    size_t inner = info.inner_size();
+
+    size_t total = outer * inner;
+    constexpr size_t block = 256;
+    size_t grid = (total + block - 1) / block;
+
+    op::logcumsumexp::cuda::logcumsumexp_kernel<T>
+        <<<grid, block, 0, cuda_stream>>>(
+            y_ptr,
+            x_ptr,
+            outer,
+            axis,
+            inner,
+
+            info._x_axis_stride,
+            info._x_inner_stride,
+            info._x_outer_stride, // added: outer stride
+
+            info._y_axis_stride,
+            info._y_inner_stride,
+            info._y_outer_stride, // added: outer stride
+
+            info.exclusive(),
+            info.reverse()
+        );
+}
+
+// ============================================================
+// Descriptor
+// ============================================================
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor** desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int axis,
+    int exclusive,
+    int reverse) {
+
+    auto info_result =
+        LogCumSumExpInfo::create(y_desc, x_desc, axis, exclusive, reverse);
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        info_result.take(),
+        /*workspace*/ 0,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void* workspace,
+    size_t workspace_size,
+    void* y,
+    const void* x,
+    void* stream) const {
+
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<nv_bfloat16>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(y, x, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::logcumsumexp::nvidia
diff --git a/src/infiniop/ops/logcumsumexp/nvidia/logcumsumexp_nvidia.cuh b/src/infiniop/ops/logcumsumexp/nvidia/logcumsumexp_nvidia.cuh
new file mode 100644
index 000000000..0032a0621
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/nvidia/logcumsumexp_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __LOGCUMSUMEXP_NVIDIA_CUH__
+#define __LOGCUMSUMEXP_NVIDIA_CUH__
+
+#include "../logcumsumexp.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __LOGCUMSUMEXP_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/operator.cc b/src/infiniop/ops/logcumsumexp/operator.cc
new file mode 100644
index 000000000..07379676a
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/operator.cc
@@ -0,0 +1,183 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/logcumsumexp.h"
+
+// --- Backend implementation headers ---
+#ifdef ENABLE_CPU_API
+#include "cpu/logcumsumexp_cpu.h"
+#endif
+
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/logcumsumexp_nvidia.cuh"
+#endif
+
+#ifdef ENABLE_METAX_API
+#include "metax/logcumsumexp_metax.h"
+#endif
+
+#ifdef ENABLE_MOORE_API
+#include "moore/logcumsumexp_moore.h"
+#endif
+
+extern "C" {
+
+// =======================================================================
+// 1. Create the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopCreateLogCumSumExpDescriptor(
+    infiniopHandle_t handle,
+    infiniopLogCumSumExpDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y,
+    infiniopTensorDescriptor_t x,
+    int axis,
+    int exclusive,
+    int reverse) {
+
+    #define CREATE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                      \
+        return op::logcumsumexp::NAMESPACE::Descriptor::create(                     \
+            handle,                                                                 \
+            reinterpret_cast<op::logcumsumexp::NAMESPACE::Descriptor **>(desc_ptr), \
+            y,                                                                      \
+            x,                                                                      \
+            axis,                                                                   \
+            exclusive,                                                              \
+            reverse)
+
+    switch (handle->device) {
+    #ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CREATE
+}
+// =======================================================================
+// 2. Query the workspace size
+// =======================================================================
+__C infiniStatus_t infiniopGetLogCumSumExpWorkspaceSize(infiniopLogCumSumExpDescriptor_t desc, size_t *size) {
+
+    #define GET(CASE, NAMESPACE)                                                                    \
+    case CASE:                                                                                      \
+        *size = reinterpret_cast<op::logcumsumexp::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef GET
+}
+
+// =======================================================================
+// 3. Run the computation
+// =======================================================================
+__C infiniStatus_t infiniopLogCumSumExp(
+    infiniopLogCumSumExpDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                         \
+        return reinterpret_cast<const op::logcumsumexp::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, y, x, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+// =======================================================================
+// 4. Destroy the descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyLogCumSumExpDescriptor(infiniopLogCumSumExpDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE)                                              \
+    case CASE:                                                                   \
+        delete reinterpret_cast<op::logcumsumexp::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
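Note (sketch, not part of the patch): the intended C-API call sequence for this operator. It assumes an already-created handle, tensor descriptors y_desc/x_desc, device buffers d_y/d_x, and a backend stream; all error-status checks are elided and the wrapper function name is hypothetical:

    void run_logcumsumexp(infiniopHandle_t handle,
                          infiniopTensorDescriptor_t y_desc,
                          infiniopTensorDescriptor_t x_desc,
                          void *d_y, const void *d_x, void *stream) {
        infiniopLogCumSumExpDescriptor_t desc;
        infiniopCreateLogCumSumExpDescriptor(handle, &desc, y_desc, x_desc,
                                             /*axis=*/1, /*exclusive=*/0, /*reverse=*/0);
        size_t ws_size = 0;
        infiniopGetLogCumSumExpWorkspaceSize(desc, &ws_size); // 0 for all backends in this patch
        infiniopLogCumSumExp(desc, /*workspace=*/nullptr, ws_size, d_y, d_x, stream);
        infiniopDestroyLogCumSumExpDescriptor(desc);
    }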
diff --git a/src/infiniop/ops/unfold/cpu/unfold_cpu.cc b/src/infiniop/ops/unfold/cpu/unfold_cpu.cc
new file mode 100644
index 000000000..53708fb76
--- /dev/null
+++ b/src/infiniop/ops/unfold/cpu/unfold_cpu.cc
@@ -0,0 +1,158 @@
+#include "unfold_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <cmath>
+#include <cstdint>
+#include <vector>
+
+#include "../../../../utils/custom_types.h"
+
+namespace op::unfold::cpu {
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+        _opaque = nullptr;
+    }
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    const int *kernel_sizes,
+    const int *strides,
+    const int *paddings,
+    const int *dilations) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+
+    // Call the static create method from UnfoldInfo
+    auto result = UnfoldInfo::infer(out_desc, input_desc, kernel_sizes, strides, paddings, dilations);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0, // No workspace needed for CPU
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Core Calculation Implementation
+template <typename T>
+void calculate_cpu_impl(
+    const UnfoldInfo &info,
+    void *output,
+    const void *input) {
+
+    // 1. Retrieve parameters from vectors
+    // Note: This implementation assumes 2D spatial dimensions (NCHW).
+    // If the vector size != 2, this logic needs adaptation.
+    if (info._kernel_sizes.size() < 2) return;
+
+    int64_t batch = info._N;
+    int64_t in_c = info._C_in;
+
+    // Spatial Input Dimensions
+    int64_t in_h = info._input_spatial_shape[0];
+    int64_t in_w = info._input_spatial_shape[1];
+
+    // Spatial Output Dimensions
+    int64_t out_h = info._output_spatial_shape[0];
+    int64_t out_w = info._output_spatial_shape[1];
+
+    // Kernel / Stride / Pad / Dilation
+    int64_t k_h = info._kernel_sizes[0];
+    int64_t k_w = info._kernel_sizes[1];
+    int64_t stride_h = info._strides[0];
+    int64_t stride_w = info._strides[1];
+    int64_t pad_h = info._paddings[0];
+    int64_t pad_w = info._paddings[1];
+    int64_t dil_h = info._dilations[0];
+    int64_t dil_w = info._dilations[1];
+
+    // 2. Prepare pointers
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
+
+    // 3. Helper variables
+    int64_t L = info._L;
+    int64_t out_c_dim = info._C_out;
+
+    // 4. Parallel Execution
+    // Collapsing Batch and Input Channel dimensions
+    #pragma omp parallel for collapse(2) schedule(static)
+    for (int64_t n = 0; n < batch; ++n) {
+        for (int64_t c = 0; c < in_c; ++c) {
+
+            int64_t in_batch_offset = n * in_c * in_h * in_w;
+            int64_t out_batch_offset = n * out_c_dim * L;
+
+            // Loop over kernel window (Unfolding to channel dimension)
+            for (int64_t kh = 0; kh < k_h; ++kh) {
+                for (int64_t kw = 0; kw < k_w; ++kw) {
+
+                    int64_t out_c_idx = c * k_h * k_w + kh * k_w + kw;
+
+                    // Loop over output spatial locations (Flattened L)
+                    for (int64_t oh = 0; oh < out_h; ++oh) {
+                        for (int64_t ow = 0; ow < out_w; ++ow) {
+
+                            // Mapping Logic
+                            int64_t h_in = oh * stride_h - pad_h + kh * dil_h;
+                            int64_t w_in = ow * stride_w - pad_w + kw * dil_w;
+
+                            int64_t out_idx = out_batch_offset + out_c_idx * L + (oh * out_w + ow);
+
+                            // Boundary Check & Assignment
+                            if (h_in >= 0 && h_in < in_h && w_in >= 0 && w_in < in_w) {
+                                int64_t in_idx = in_batch_offset + c * in_h * in_w + h_in * in_w + w_in;
+                                out_ptr[out_idx] = in_ptr[in_idx];
+                            } else {
+                                // Padding with zero
+                                out_ptr[out_idx] = utils::cast<T>(0.0f);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    auto dtype = _info.dtype_val();
+
+    switch (dtype) {
+    case INFINI_DTYPE_F32:
+        cpu::calculate_cpu_impl<float>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F64:
+        cpu::calculate_cpu_impl<double>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F16:
+        cpu::calculate_cpu_impl<fp16_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_BF16:
+        cpu::calculate_cpu_impl<bf16_t>(_info, output, input);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::unfold::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/cpu/unfold_cpu.h b/src/infiniop/ops/unfold/cpu/unfold_cpu.h
new file mode 100644
index 000000000..0f37e3bc3
--- /dev/null
+++ b/src/infiniop/ops/unfold/cpu/unfold_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __UNFOLD_CPU_H__
+#define __UNFOLD_CPU_H__
+
+#include "../unfold.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __UNFOLD_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/cuda/kernel.cuh b/src/infiniop/ops/unfold/cuda/kernel.cuh
new file mode 100644
index 000000000..ca9315ea8
--- /dev/null
+++ b/src/infiniop/ops/unfold/cuda/kernel.cuh
@@ -0,0 +1,76 @@
+#ifndef __UNFOLD_CUDA_CUH__
+#define __UNFOLD_CUDA_CUH__
+
+#include <cstddef>
+#if defined ENABLE_METAX_API
+    #include <maca_fp16.h>
+    #include <maca_bfloat16.h>
+    using nv_bfloat162 = __maca_bfloat162;
+#else
+    #include <cuda_fp16.h>
+    #include <cuda_bf16.h>
+#endif
+
+#include <cstdint>
+
+namespace op::unfold::cuda {
+template <typename T>
+__global__ void unfold_kernel(
+    T * __restrict__ output,      // [N, C_out, L]
+    const T * __restrict__ input, // [N, C, H, W]
+    // dimension parameters
+    int C, int H, int W,          // input dims
+    int out_h, int out_w,         // output spatial dims
+    // operator parameters
+    int k_h, int k_w,             // kernel size
+    int pad_h, int pad_w,         // padding
+    int stride_h, int stride_w,   // stride
+    int dil_h, int dil_w,         // dilation
+    // total work
+    size_t total_elements) {
+
+    // Flat indexing: each thread produces one output element
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < total_elements) {
+        // --------------------------------------------------------
+        // 1. Reverse mapping: recover the logical coordinates from the linear idx.
+        //    The output is logically [N, (C * kH * kW), (out_h * out_w)].
+        // --------------------------------------------------------
+        int L = out_h * out_w;
+        int kernel_area = k_h * k_w;
+        int C_col = C * kernel_area; // output (column) channel count
+        int l_idx = idx % L;
+        size_t temp = idx / L;
+        int c_col_idx = temp % C_col;
+        int n_idx = temp / C_col;
+
+        // spatial coordinates (h_out, w_out)
+        int w_out = l_idx % out_w;
+        int h_out = l_idx / out_w;
+
+        // channel coordinate -> (c_in, kh, kw)
+        int kw = c_col_idx % k_w;
+        int temp_k = c_col_idx / k_w;
+        int kh = temp_k % k_h;
+        int c_in = temp_k / k_h;
+        int h_in = h_out * stride_h - pad_h + kh * dil_h;
+        int w_in = w_out * stride_w - pad_w + kw * dil_w;
+        T val;
+
+        if (h_in >= 0 && h_in < H && w_in >= 0 && w_in < W) {
+            // input linear index: [n, c, h, w]
+            size_t in_idx = ((static_cast<size_t>(n_idx) * C + c_in) * H + h_in) * W + w_in;
+            val = input[in_idx];
+        } else {
+            // padding region: fill with 0
+            val = static_cast<T>(0.0f);
+        }
+
+        output[idx] = val;
+    }
+}
+
+} // namespace op::unfold::cuda
+
+#endif // __UNFOLD_CUDA_CUH__
\ No newline at end of file
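Note (illustrative): a worked instance of the reverse mapping above, with C = 3, k_h = k_w = 2, out_h = out_w = 4, so L = 16 and C_col = 12. For idx = 100:

    // l_idx     = 100 % 16 = 4  ->  (h_out, w_out) = (1, 0)
    // temp      = 100 / 16 = 6
    // c_col_idx = 6 % 12   = 6  ->  kw = 6 % 2 = 0, kh = (6 / 2) % 2 = 1, c_in = (6 / 2) / 2 = 1
    // n_idx     = 6 / 12   = 0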
diff --git a/src/infiniop/ops/unfold/info.h b/src/infiniop/ops/unfold/info.h
new file mode 100644
index 000000000..59bbe6712
--- /dev/null
+++ b/src/infiniop/ops/unfold/info.h
@@ -0,0 +1,133 @@
+#ifndef __OPS_UNFOLD_INFO_H__
+#define __OPS_UNFOLD_INFO_H__
+
+// referenced via the same relative paths as the other ops
+#include "../../../utils.h"
+#include "../../tensor.h"
+
+#include <vector>
+#include <cstdint>
+
+namespace op::unfold {
+
+struct UnfoldInfo {
+public:
+    int _dtype;
+
+    // spatial parameters
+    std::vector<int> _kernel_sizes;
+    std::vector<int> _strides;
+    std::vector<int> _paddings;
+    std::vector<int> _dilations;
+
+    // cached shape info
+    size_t _N;     // Batch
+    size_t _C_in;  // Input Channels
+    size_t _C_out; // Output Channels
+    size_t _L;     // Output Spatial Length
+
+    // cached input spatial dims (H_in, W_in)
+    std::vector<int64_t> _input_spatial_shape;
+    // cached output spatial dims (H_out, W_out)
+    std::vector<int64_t> _output_spatial_shape;
+
+    // the default constructor must be public
+    UnfoldInfo() = default;
+
+    // Constructor
+    UnfoldInfo(int dtype,
+               std::vector<int> kernel_sizes,
+               std::vector<int> strides,
+               std::vector<int> paddings,
+               std::vector<int> dilations,
+               size_t N, size_t C_in, size_t C_out, size_t L,
+               std::vector<int64_t> in_spatial,
+               std::vector<int64_t> out_spatial)
+        : _dtype(dtype),
+          _kernel_sizes(std::move(kernel_sizes)),
+          _strides(std::move(strides)),
+          _paddings(std::move(paddings)),
+          _dilations(std::move(dilations)),
+          _N(N), _C_in(C_in), _C_out(C_out), _L(L),
+          _input_spatial_shape(std::move(in_spatial)),
+          _output_spatial_shape(std::move(out_spatial)) {}
+
+    // Getters
+    int dtype_val() const { return _dtype; }
+
+    // matches the call sites in the .cc files
+    static utils::Result<UnfoldInfo> infer(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t input_desc,
+        const int *kernel_sizes,
+        const int *strides,
+        const int *paddings,
+        const int *dilations) {
+
+        // 1. Check the rank (assumes ndim() is a method)
+        if (input_desc->ndim() < 3) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        int ndim = input_desc->ndim();
+        int spatial_dims = ndim - 2;
+
+        // 2. Read parameters
+        size_t N = input_desc->shape()[0];
+        size_t C_in = input_desc->shape()[1];
+
+        std::vector<int> k_vec(kernel_sizes, kernel_sizes + spatial_dims);
+        std::vector<int> s_vec(strides, strides + spatial_dims);
+        std::vector<int> p_vec(paddings, paddings + spatial_dims);
+        std::vector<int> d_vec(dilations, dilations + spatial_dims);
+
+        // 3. Compute the shapes
+        size_t kernel_prod = 1;
+        for (int k : k_vec) kernel_prod *= k;
+        size_t C_out = C_in * kernel_prod;
+
+        size_t L = 1;
+        std::vector<int64_t> in_spatial;
+        std::vector<int64_t> out_spatial;
+
+        for (int i = 0; i < spatial_dims; ++i) {
+            int64_t in_dim = input_desc->shape()[i + 2];
+            in_spatial.push_back(in_dim);
+
+            int k = k_vec[i];
+            int s = s_vec[i];
+            int p = p_vec[i];
+            int d = d_vec[i];
+
+            int64_t numerator = in_dim + 2 * p - d * (k - 1) - 1;
+            if (numerator < 0) numerator = -s;
+
+            int64_t out_dim = (numerator / s) + 1;
+            if (out_dim <= 0) return INFINI_STATUS_BAD_PARAM;
+
+            out_spatial.push_back(out_dim);
+            L *= out_dim;
+        }
+
+        // 4. Validate the output descriptor
+        if (out_desc->ndim() == 3) {
+            if (out_desc->shape()[0] != N ||
+                out_desc->shape()[1] != C_out ||
+                out_desc->shape()[2] != L) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        // Construct the Result explicitly
+        return utils::Result<UnfoldInfo>(UnfoldInfo(
+            input_desc->dtype(),
+            std::move(k_vec), std::move(s_vec), std::move(p_vec), std::move(d_vec),
+            N, C_in, C_out, L,
+            std::move(in_spatial), std::move(out_spatial)
+        ));
+    }
+};
+
+} // namespace op::unfold
+
+#endif // __OPS_UNFOLD_INFO_H__
\ No newline at end of file
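Note (illustrative): a worked instance of the output-size formula in infer(). With in_dim = 5, k = 3, p = 1, d = 1, s = 2:

    numerator = 5 + 2*1 - 1*(3 - 1) - 1 = 4
    out_dim   = 4 / 2 + 1 = 3

which agrees with the usual convolution output size floor((in + 2p - d(k-1) - 1) / s) + 1.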
diff --git a/src/infiniop/ops/unfold/metax/unfold_metax.h b/src/infiniop/ops/unfold/metax/unfold_metax.h
new file mode 100644
index 000000000..07a3aad1a
--- /dev/null
+++ b/src/infiniop/ops/unfold/metax/unfold_metax.h
@@ -0,0 +1,7 @@
+#ifndef __UNFOLD_METAX_API_H__
+#define __UNFOLD_METAX_API_H__
+
+#include "../unfold.h"
+DESCRIPTOR(metax)
+
+#endif // __UNFOLD_METAX_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/metax/unfold_metax.maca b/src/infiniop/ops/unfold/metax/unfold_metax.maca
new file mode 100644
index 000000000..01981add3
--- /dev/null
+++ b/src/infiniop/ops/unfold/metax/unfold_metax.maca
@@ -0,0 +1,216 @@
+#include "unfold_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+// ==================================================================
+// 1. MACA type compatibility
+// ==================================================================
+#if defined(__MACA__) || defined(__MACACC__)
+    #include <maca_fp16.h>
+    #include <maca_bfloat16.h>
+    using nv_bfloat16 = __maca_bfloat16;
+    using nv_bfloat162 = __maca_bfloat162;
+#endif
+
+namespace op::unfold::metax {
+
+// ==================================================================
+// 2. Kernel implementation
+// ==================================================================
+template <typename T>
+__global__ void unfold_kernel(
+    T * __restrict__ output,      // [N, C_out, L]
+    const T * __restrict__ input, // [N, C, H, W]
+    // dimension parameters
+    int C, int H, int W,          // input dims
+    int out_h, int out_w,         // output spatial dims
+    // operator parameters
+    int k_h, int k_w,             // kernel size
+    int pad_h, int pad_w,         // padding
+    int stride_h, int stride_w,   // stride
+    int dil_h, int dil_w,         // dilation
+    // total work
+    size_t total_elements) {
+
+    // Flat indexing: each thread produces one output element
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < total_elements) {
+        // --------------------------------------------------------
+        // 1. Reverse mapping: recover the logical coordinates from the linear idx.
+        //    The output is logically [N, (C * kH * kW), (out_h * out_w)].
+        // --------------------------------------------------------
+        int L = out_h * out_w;
+        int kernel_area = k_h * k_w;
+        int C_col = C * kernel_area; // output (column) channel count
+
+        // order the div/mod operations to keep them cheap
+        int l_idx = idx % L;
+        size_t temp = idx / L;
+        int c_col_idx = temp % C_col;
+        int n_idx = temp / C_col;
+
+        // spatial coordinates (h_out, w_out)
+        int w_out = l_idx % out_w;
+        int h_out = l_idx / out_w;
+
+        // channel coordinate -> (c_in, kh, kw)
+        int kw = c_col_idx % k_w;
+        int temp_k = c_col_idx / k_w;
+        int kh = temp_k % k_h;
+        int c_in = temp_k / k_h;
+
+        // corresponding input coordinates
+        int h_in = h_out * stride_h - pad_h + kh * dil_h;
+        int w_in = w_out * stride_w - pad_w + kw * dil_w;
+
+        T val;
+
+        // boundary check and assignment
+        if (h_in >= 0 && h_in < H && w_in >= 0 && w_in < W) {
+            // input linear index: [n, c, h, w]
+            size_t in_idx = ((static_cast<size_t>(n_idx) * C + c_in) * H + h_in) * W + w_in;
+            val = input[in_idx];
+        } else {
+            // padding region: fill with 0
+            val = static_cast<T>(0.0f);
+        }
+
+        output[idx] = val;
+    }
+}
+
+// ==================================================================
+// 3. Kernel Launch Logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const UnfoldInfo& info,
+    void *stream) {
+
+    // 1. Prepare pointers
+    // [fix] corrected a typo here (autoin_ptr -> auto in_ptr)
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto mc_stream = reinterpret_cast<mcStream_t>(stream);
+
+    // 2. Unpack parameters from the Info vectors
+    if (info._kernel_sizes.size() < 2) {
+        return;
+    }
+
+    int C = info._C_in;
+    int H = info._input_spatial_shape[0];
+    int W = info._input_spatial_shape[1];
+
+    int out_h = info._output_spatial_shape[0];
+    int out_w = info._output_spatial_shape[1];
+
+    int k_h = info._kernel_sizes[0];
+    int k_w = info._kernel_sizes[1];
+    int pad_h = info._paddings[0];
+    int pad_w = info._paddings[1];
+    int stride_h = info._strides[0];
+    int stride_w = info._strides[1];
+    int dil_h = info._dilations[0];
+    int dil_w = info._dilations[1];
+
+    // 3. Compute the grid
+    size_t out_channels = info._C_out;
+    size_t out_spatial = info._L;
+    size_t total_elements = info._N * out_channels * out_spatial;
+
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+
+    // 4. Launch the kernel
+    unfold_kernel<T>
+        <<<grid_size, block_size, 0, mc_stream>>>(
+            out_ptr, in_ptr,
+            C, H, W,
+            out_h, out_w,
+            k_h, k_w,
+            pad_h, pad_w,
+            stride_h, stride_w,
+            dil_h, dil_w,
+            total_elements
+        );
+}
+
+// ==================================================================
+// 4. Descriptor implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    const int *kernel_sizes,
+    const int *strides,
+    const int *paddings,
+    const int *dilations) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+
+    // 1. Create and validate the Info
+    auto result = UnfoldInfo::infer(out_desc, input_desc, kernel_sizes, strides, paddings, dilations);
+    if (!result) return result.status();
+
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        workspace_size,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    auto dtype = _info.dtype_val();
+
+    // 3. Dispatch the kernel by data type
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<__half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+#if defined(__MACA__) || defined(__MACACC__)
+        launch_kernel<__maca_bfloat16>(output, input, _info, stream);
+#endif
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::unfold::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/moore/unfold_moore.h b/src/infiniop/ops/unfold/moore/unfold_moore.h
new file mode 100644
index 000000000..a61cf097e
--- /dev/null
+++ b/src/infiniop/ops/unfold/moore/unfold_moore.h
@@ -0,0 +1,8 @@
+#ifndef __UNFOLD_MOORE_H__
+#define __UNFOLD_MOORE_H__
+
+#include "../unfold.h"
+
+DESCRIPTOR(moore)
+
+#endif // __UNFOLD_MOORE_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/moore/unfold_moore.mu b/src/infiniop/ops/unfold/moore/unfold_moore.mu
new file mode 100644
index 000000000..59db55568
--- /dev/null
+++ b/src/infiniop/ops/unfold/moore/unfold_moore.mu
@@ -0,0 +1,140 @@
+#include "unfold_moore.h"
+#include "unfold_moore_kernel.h"
+#include "../../../devices/moore/moore_handle.h"
+#include <musa_runtime.h>
+#include <musa_bf16.h>
+
+namespace op::unfold::moore {
+
+// ==================================================================
+// Kernel Launch Logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const UnfoldInfo& info,
+    void *stream) {
+
+    // 1. Prepare pointers
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+
+    // 2. Unpack parameters from the Info vectors
+    // Note: the current kernel only supports 2D spatial input (NCHW); take the first two dims.
+    if (info._kernel_sizes.size() < 2) {
+        // handle as an error case, or simply return
+        return;
+    }
+
+    int C = info._C_in;
+    int H = info._input_spatial_shape[0];
+    int W = info._input_spatial_shape[1];
+
+    int out_h = info._output_spatial_shape[0];
+    int out_w = info._output_spatial_shape[1];
+
+    int k_h = info._kernel_sizes[0];
+    int k_w = info._kernel_sizes[1];
+    int pad_h = info._paddings[0];
+    int pad_w = info._paddings[1];
+    int stride_h = info._strides[0];
+    int stride_w = info._strides[1];
+    int dil_h = info._dilations[0];
+    int dil_w = info._dilations[1];
+
+    // 3. Compute the grid
+    // output channels = C * kH * kW
+    size_t out_channels = info._C_out;
+    size_t out_spatial = info._L;
+    size_t total_elements = info._N * out_channels * out_spatial;
+
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+
+    // 4. Launch the Moore kernel
+    op::unfold::moore::unfold_kernel<T>
+        <<<grid_size, block_size, 0, musa_stream>>>(
+            out_ptr, in_ptr,
+            C, H, W,
+            out_h, out_w,
+            k_h, k_w,
+            pad_h, pad_w,
+            stride_h, stride_w,
+            dil_h, dil_w,
+            total_elements
+        );
+}
+
+// ==================================================================
+// Descriptor implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    const int *kernel_sizes,
+    const int *strides,
+    const int *paddings,
+    const int *dilations) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+
+    // 1. Create and validate the Info (via the new infer interface)
+    auto result = UnfoldInfo::infer(out_desc, input_desc, kernel_sizes, strides, paddings, dilations);
+    if (!result) return result.status();
+
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        workspace_size,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    // uses the new getter
+    auto dtype = _info.dtype_val();
+
+    // 3. Dispatch the kernel by data type
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<__mt_bfloat16>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::unfold::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/moore/unfold_moore_kernel.h b/src/infiniop/ops/unfold/moore/unfold_moore_kernel.h
new file mode 100644
index 000000000..3f0945f9e
--- /dev/null
+++ b/src/infiniop/ops/unfold/moore/unfold_moore_kernel.h
@@ -0,0 +1,80 @@
+#ifndef __UNFOLD_MOORE_KERNEL_H__
+#define __UNFOLD_MOORE_KERNEL_H__
+
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <cmath>
+#include <cstdint>
+#include <type_traits>
+
+namespace op::unfold::moore {
+
+template <typename T>
+__global__ void unfold_kernel(
+    T * __restrict__ output,      // [N, C_out, L]
+    const T * __restrict__ input, // [N, C, H, W]
+    // dimension parameters
+    int C, int H, int W,          // input dims
+    int out_h, int out_w,         // output spatial dims
+    // operator parameters
+    int k_h, int k_w,             // kernel size
+    int pad_h, int pad_w,         // padding
+    int stride_h, int stride_w,   // stride
+    int dil_h, int dil_w,         // dilation
+    // total work
+    size_t total_elements) {
+
+    // Flat indexing: each thread produces one output element
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < total_elements) {
+        // --------------------------------------------------------
+        // 1. Reverse mapping: recover the logical coordinates from the linear idx.
+        //    The output is logically [N, (C * kH * kW), (out_h * out_w)].
+        // --------------------------------------------------------
+        int L = out_h * out_w;
+        int kernel_area = k_h * k_w;
+        int C_col = C * kernel_area; // output (column) channel count
+
+        int l_idx = idx % L;
+        size_t temp = idx / L;
+        int c_col_idx = temp % C_col;
+        int n_idx = temp / C_col;
+
+        // spatial coordinates (h_out, w_out)
+        int w_out = l_idx % out_w;
+        int h_out = l_idx / out_w;
+
+        // channel coordinate -> (c_in, kh, kw)
+        int kw = c_col_idx % k_w;
+        int temp_k = c_col_idx / k_w;
+        int kh = temp_k % k_h;
+        int c_in = temp_k / k_h;
+
+        // coordinates on the input feature map
+        int h_in = h_out * stride_h - pad_h + kh * dil_h;
+        int w_in = w_out * stride_w - pad_w + kw * dil_w;
+
+        T val;
+
+        if (h_in >= 0 && h_in < H && w_in >= 0 && w_in < W) {
+            // input linear index: [n, c, h, w]
+            size_t in_idx = ((static_cast<size_t>(n_idx) * C + c_in) * H + h_in) * W + w_in;
+            val = input[in_idx];
+        } else {
+            if constexpr (std::is_same_v<T, half>) {
+                val = __float2half(0.0f);
+            } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+                val = __float2bfloat16(0.0f);
+            } else {
+                val = static_cast<T>(0.0f);
+            }
+        }
+
+        output[idx] = val;
+    }
+}
+
+} // namespace op::unfold::moore
+
+#endif // __UNFOLD_MOORE_KERNEL_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/nvidia/unfold_nvidia.cu b/src/infiniop/ops/unfold/nvidia/unfold_nvidia.cu
new file mode 100644
index 000000000..b08542a0d
--- /dev/null
+++ b/src/infiniop/ops/unfold/nvidia/unfold_nvidia.cu
@@ -0,0 +1,140 @@
+#include "unfold_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+namespace op::unfold::nvidia {
+
+// ==================================================================
+// Kernel Launch Logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const UnfoldInfo& info,
+    void *stream) {
+
+    // 1. Prepare pointers
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    // 2. Unpack parameters from the Info vectors
+    // Note: the current CUDA kernel only supports 2D spatial input (NCHW); take the first two dims.
+    if (info._kernel_sizes.size() < 2) {
+        // handle as an error case, or simply return
+        return;
+    }
+
+    int C = info._C_in;
+    int H = info._input_spatial_shape[0];
+    int W = info._input_spatial_shape[1];
+
+    int out_h = info._output_spatial_shape[0];
+    int out_w = info._output_spatial_shape[1];
+
+    int k_h = info._kernel_sizes[0];
+    int k_w = info._kernel_sizes[1];
+    int pad_h = info._paddings[0];
+    int pad_w = info._paddings[1];
+    int stride_h = info._strides[0];
+    int stride_w = info._strides[1];
+    int dil_h = info._dilations[0];
+    int dil_w = info._dilations[1];
+
+    // 3. Compute the grid
+    // output channels = C * kH * kW
+    size_t out_channels = info._C_out;
+    size_t out_spatial = info._L;
+    size_t total_elements = info._N * out_channels * out_spatial;
+
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+
+    // 4. Launch the CUDA kernel
+    op::unfold::cuda::unfold_kernel<T>
+        <<<grid_size, block_size, 0, cuda_stream>>>(
+            out_ptr, in_ptr,
+            C, H, W,
+            out_h, out_w,
+            k_h, k_w,
+            pad_h, pad_w,
+            stride_h, stride_w,
+            dil_h, dil_w,
+            total_elements
+        );
+}
+
+// ==================================================================
+// Descriptor implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    const int *kernel_sizes,
+    const int *strides,
+    const int *paddings,
+    const int *dilations) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+
+    // 1. Create and validate the Info (via the new infer interface)
+    auto result = UnfoldInfo::infer(out_desc, input_desc, kernel_sizes, strides, paddings, dilations);
+    if (!result) return result.status();
+
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        workspace_size,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    // uses the new getter
+    auto dtype = _info.dtype_val();
+
+    // 3. Dispatch the kernel by data type
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<nv_bfloat16>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::unfold::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/nvidia/unfold_nvidia.cuh b/src/infiniop/ops/unfold/nvidia/unfold_nvidia.cuh
new file mode 100644
index 000000000..7246da912
--- /dev/null
+++ b/src/infiniop/ops/unfold/nvidia/unfold_nvidia.cuh
@@ -0,0 +1,7 @@
+#ifndef __UNFOLD_NVIDIA_CUH__
+#define __UNFOLD_NVIDIA_CUH__
+
+#include "../unfold.h"
+DESCRIPTOR(nvidia)
+
+#endif // __UNFOLD_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/operator.cc b/src/infiniop/ops/unfold/operator.cc
new file mode 100644
index 000000000..d78346016
--- /dev/null
+++ b/src/infiniop/ops/unfold/operator.cc
@@ -0,0 +1,185 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/unfold.h"
+
+// --- Backend implementation headers ---
+#ifdef ENABLE_CPU_API
+#include "cpu/unfold_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/unfold_nvidia.cuh"
+#endif
+
+#ifdef ENABLE_METAX_API
+#include "metax/unfold_metax.h"
+#endif
+
+#ifdef ENABLE_MOORE_API
+#include "moore/unfold_moore.h"
+#endif
+
+extern "C" {
+
+// =======================================================================
+// 1. Create the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopCreateUnfoldDescriptor(
+    infiniopHandle_t handle,
+    infiniopUnfoldDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output,
+    infiniopTensorDescriptor_t input,
+    const int *kernel_sizes,
+    const int *strides,
+    const int *paddings,
+    const int *dilations) {
+
+    // Local macro to keep the multi-backend dispatch concise
+    #define CREATE(CASE, NAMESPACE)                                           \
+    case CASE:                                                                \
+        return op::unfold::NAMESPACE::Descriptor::create(                     \
+            handle,                                                           \
+            reinterpret_cast<op::unfold::NAMESPACE::Descriptor **>(desc_ptr), \
+            output,                                                           \
+            input,                                                            \
+            kernel_sizes,                                                     \
+            strides,                                                          \
+            paddings,                                                         \
+            dilations)
+
+    switch (handle->device) {
+    #ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CREATE
+}
+
+// =======================================================================
+// 2. Query the workspace size
+// =======================================================================
+__C infiniStatus_t infiniopGetUnfoldWorkspaceSize(infiniopUnfoldDescriptor_t desc, size_t *size) {
+
+    #define GET(CASE, NAMESPACE)                                                              \
+    case CASE:                                                                                \
+        *size = reinterpret_cast<op::unfold::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef GET
+}
+
+// =======================================================================
+// 3. Run the computation (Calculate)
+// =======================================================================
+__C infiniStatus_t infiniopUnfold(
+    infiniopUnfoldDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE)                                           \
+    case CASE:                                                                   \
+        return reinterpret_cast<const op::unfold::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, output, input, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+// =======================================================================
+// 4. Destroy the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyUnfoldDescriptor(infiniopUnfoldDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE)                                        \
+    case CASE:                                                             \
+        delete reinterpret_cast<op::unfold::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/unfold.h b/src/infiniop/ops/unfold/unfold.h
new file mode 100644
index 000000000..fe9241bbc
--- /dev/null
+++ b/src/infiniop/ops/unfold/unfold.h
@@ -0,0 +1,49 @@
+#ifndef __UNFOLD_H__
+#define __UNFOLD_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the matching UnfoldInfo definition
+#define DESCRIPTOR(NAMESPACE)                                             \
+    namespace op::unfold::NAMESPACE {                                     \
+    class Descriptor final : public InfiniopDescriptor {                  \
+        struct Opaque;                                                    \
+        Opaque *_opaque;                                                  \
+        UnfoldInfo _info;                                                 \
+        size_t _workspace_size;                                           \
+                                                                          \
+        Descriptor(                                                       \
+            Opaque *opaque,                                               \
+            UnfoldInfo info,                                              \
+            size_t workspace_size,                                        \
+            infiniDevice_t device_type,                                   \
+            int device_id)                                                \
+            : InfiniopDescriptor{device_type, device_id},                 \
+              _opaque(opaque),                                            \
+              _info(info),                                                \
+              _workspace_size(workspace_size) {}                          \
+                                                                          \
+    public:                                                               \
+        ~Descriptor();                                                    \
+                                                                          \
+        size_t workspaceSize() const { return _workspace_size; }          \
+                                                                          \
+        static infiniStatus_t create(                                     \
+            infiniopHandle_t handle,                                      \
+            Descriptor **desc_ptr,                                        \
+            infiniopTensorDescriptor_t out_desc,                          \
+            infiniopTensorDescriptor_t input_desc,                        \
+            const int *kernel_sizes,                                      \
+            const int *strides,                                           \
+            const int *paddings,                                          \
+            const int *dilations);                                        \
+                                                                          \
+        infiniStatus_t calculate(                                         \
+            void *workspace,                                              \
+            size_t workspace_size,                                        \
+            void *output,                                                 \
+            const void *input,                                            \
+            void *stream) const;                                          \
+    };                                                                    \
+    }
+
+#endif // __UNFOLD_H__
\ No newline at end of file
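Note (sketch, not part of the patch): the corresponding call sequence for unfold, assuming a 4-D NCHW input descriptor, a matching [N, C*kH*kW, L] output descriptor, and device buffers; the wrapper name and the 3x3 parameters are illustrative and error checks are elided:

    void run_unfold(infiniopHandle_t handle,
                    infiniopTensorDescriptor_t out_desc,
                    infiniopTensorDescriptor_t in_desc,
                    void *d_out, const void *d_in, void *stream) {
        const int kernel_sizes[2] = {3, 3};
        const int strides[2]      = {1, 1};
        const int paddings[2]     = {1, 1};
        const int dilations[2]    = {1, 1};
        infiniopUnfoldDescriptor_t desc;
        infiniopCreateUnfoldDescriptor(handle, &desc, out_desc, in_desc,
                                       kernel_sizes, strides, paddings, dilations);
        size_t ws_size = 0;
        infiniopGetUnfoldWorkspaceSize(desc, &ws_size); // 0 in this patch
        infiniopUnfold(desc, /*workspace=*/nullptr, ws_size, d_out, d_in, stream);
        infiniopDestroyUnfoldDescriptor(desc);
    }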
diff --git a/src/infiniop/ops/vander/cpu/vander_cpu.cc b/src/infiniop/ops/vander/cpu/vander_cpu.cc
new file mode 100644
index 000000000..fd57a2cb6
--- /dev/null
+++ b/src/infiniop/ops/vander/cpu/vander_cpu.cc
@@ -0,0 +1,115 @@
+#include "vander_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "../../../../utils/custom_types.h"
+
+namespace op::vander::cpu {
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+        _opaque = nullptr;
+    }
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    int N,
+    int increasing) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+
+    // Validate and build the metadata via Info::create
+    auto result = VanderInfo::create(out_desc, input_desc, N, increasing);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0, // the CPU implementation normally needs no extra workspace
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename T>
+void calculate_cpu_impl(
+    const VanderInfo &info,
+    void *output,
+    const void *input) {
+
+    size_t rows = info.rows(); // input size
+    size_t cols = info.cols(); // output columns
+    bool increasing = info.increasing();
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
+
+    // Process each row (each element of the input vector) in parallel
+    #pragma omp parallel for schedule(static)
+    for (size_t i = 0; i < rows; ++i) {
+        // Convert the input to float for the accumulation, avoiding the precision
+        // loss of repeated fp16/bf16 multiplies
+        float x = utils::cast<float>(in_ptr[i]);
+
+        // Optimization: a running product instead of the pow function
+        // x^0 = 1.0
+        float val = 1.0f;
+
+        if (increasing) {
+            // order: x^0, x^1, x^2 ...
+            for (size_t j = 0; j < cols; ++j) {
+                out_ptr[i * cols + j] = utils::cast<T>(val);
+                val *= x;
+            }
+        } else {
+            // order: ... x^2, x^1, x^0
+            // fill from the last column (x^0) backwards
+            for (int64_t j = static_cast<int64_t>(cols) - 1; j >= 0; --j) {
+                out_ptr[i * cols + j] = utils::cast<T>(val);
+                val *= x;
+            }
+        }
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+
+    switch (dtype) {
+    case INFINI_DTYPE_F32:
+        cpu::calculate_cpu_impl<float>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F64:
+        cpu::calculate_cpu_impl<double>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F16:
+        cpu::calculate_cpu_impl<fp16_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_BF16:
+        cpu::calculate_cpu_impl<bf16_t>(_info, output, input);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::vander::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/cpu/vander_cpu.h b/src/infiniop/ops/vander/cpu/vander_cpu.h
new file mode 100644
index 000000000..1940ab023
--- /dev/null
+++ b/src/infiniop/ops/vander/cpu/vander_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __VANDER_CPU_H__
+#define __VANDER_CPU_H__
+
+#include "../vander.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __VANDER_CPU_H__
\ No newline at end of file
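Note (illustrative): for input x = (1, 2, 3) and cols = 3 the loops above produce

    increasing = true        increasing = false
    1  1  1                  1  1  1
    1  2  4                  4  2  1
    1  3  9                  9  3  1

i.e. column j holds x^j in increasing mode and x^(cols-1-j) otherwise.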
diff --git a/src/infiniop/ops/vander/cuda/kernel.cuh b/src/infiniop/ops/vander/cuda/kernel.cuh
new file mode 100644
index 000000000..33ced35f3
--- /dev/null
+++ b/src/infiniop/ops/vander/cuda/kernel.cuh
@@ -0,0 +1,48 @@
+#ifndef __VANDER_CUDA_CUH__
+#define __VANDER_CUDA_CUH__
+
+#include <cstddef>
+#if defined ENABLE_METAX_API
+    #include <maca_fp16.h>
+    #include <maca_bfloat16.h>
+    using nv_bfloat162 = __maca_bfloat162;
+#else
+    #include <cuda_fp16.h>
+    #include <cuda_bf16.h>
+#endif
+
+#include <cstdint>
+#include <cmath>
+
+namespace op::vander::cuda {
+
+// ==================================================================
+// Core kernel
+// ==================================================================
+template <typename T>
+__global__ void vander_kernel(
+    T * __restrict__ output,      // [rows, cols]
+    const T * __restrict__ input, // [rows]
+    size_t rows,
+    size_t cols,
+    bool increasing) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t total_elements = rows * cols;
+
+    if (idx < total_elements) {
+        size_t row = idx / cols;
+        size_t col = idx % cols;
+
+        // Load the input (threads of different cols in the same row read the same
+        // input[row]; L1-cache friendly)
+        float x = static_cast<float>(input[row]);
+        float power = increasing ? static_cast<float>(col)
+                                 : static_cast<float>(cols - 1 - col);
+        float res = powf(x, power);
+
+        output[idx] = static_cast<T>(res);
+    }
+}
+
+} // namespace op::vander::cuda
+
+#endif // __VANDER_CUDA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/info.h b/src/infiniop/ops/vander/info.h
new file mode 100644
index 000000000..18823aa2b
--- /dev/null
+++ b/src/infiniop/ops/vander/info.h
@@ -0,0 +1,71 @@
+#ifndef __VANDER_INFO_H__
+#define __VANDER_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <cstddef>
+
+namespace op::vander {
+
+class VanderInfo {
+    VanderInfo() = default;
+
+public:
+    int _dtype;       // data type
+    bool _increasing; // power order (false: decreasing, true: increasing)
+
+    // cached shape info
+    size_t _rows; // input vector length (N)
+    size_t _cols; // output matrix column count (M)
+
+    int dtype() const { return _dtype; }
+    bool increasing() const { return _increasing; }
+    size_t rows() const { return _rows; }
+    size_t cols() const { return _cols; }
+
+    // Constructor
+    VanderInfo(int dtype, bool increasing, size_t rows, size_t cols)
+        : _dtype(dtype), _increasing(increasing), _rows(rows), _cols(cols) {}
+
+    static utils::Result<VanderInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t input_desc,
+        int N,              // user-specified column count; defaults to the input length if <= 0
+        int increasing) {   // the C API passes an int; converted to bool internally
+
+        // 1. Check the input shape
+        // Input must be a 1-D vector: (rows)
+        if (input_desc->ndim() != 1) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+        size_t rows = input_desc->shape()[0];
+        size_t cols = (N > 0) ? static_cast<size_t>(N) : rows;
+
+        // 2. Check the output shape
+        // Output must be a 2-D matrix: (rows, cols)
+        if (out_desc->ndim() != 2) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        if (out_desc->shape()[0] != rows) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        if (out_desc->shape()[1] != cols) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+        if (out_desc->dtype() != input_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+        return utils::Result<VanderInfo>(VanderInfo{
+            input_desc->dtype(),           // _dtype
+            static_cast<bool>(increasing), // _increasing
+            rows,                          // _rows
+            cols                           // _cols
+        });
+    }
+};
+
+} // namespace op::vander
+
+#endif // __VANDER_INFO_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/metax/vander_metax.h b/src/infiniop/ops/vander/metax/vander_metax.h
new file mode 100644
index 000000000..0f5557dcb
--- /dev/null
+++ b/src/infiniop/ops/vander/metax/vander_metax.h
@@ -0,0 +1,8 @@
+#ifndef __VANDER_METAX_API_H__
+#define __VANDER_METAX_API_H__
+
+#include "../vander.h"
+
+DESCRIPTOR(metax)
+
+#endif // __VANDER_METAX_API_H__
\ No newline at end of file
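Note (sketch, not part of the patch): the GPU kernels here call powf once per output element, while the CPU path uses a running product. A per-thread running product is a possible alternative when exact integer-power semantics matter; the variable names mirror vander_kernel:

    // inside the idx < total_elements branch, replacing the powf call:
    size_t e_count = increasing ? col : (cols - 1 - col);
    float res = 1.0f;
    for (size_t e = 0; e < e_count; ++e) {
        res *= x; // repeated multiply: exact for integer exponents, O(cols) worst case
    }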
diff --git a/src/infiniop/ops/vander/metax/vander_metax.maca b/src/infiniop/ops/vander/metax/vander_metax.maca
new file mode 100644
index 000000000..58e44f7f3
--- /dev/null
+++ b/src/infiniop/ops/vander/metax/vander_metax.maca
@@ -0,0 +1,144 @@
+#include "vander_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include <cstddef>
+#include <cmath>
+// The MACA half/bfloat16 SDK headers are assumed to be pulled in via metax_common.h.
+using nv_bfloat16 = __maca_bfloat16;
+using nv_bfloat162 = __maca_bfloat162;
+
+namespace op::vander::metax {
+template <typename T>
+__global__ void vander_kernel(
+    T * __restrict__ output,      // [rows, cols]
+    const T * __restrict__ input, // [rows]
+    size_t rows,
+    size_t cols,
+    bool increasing) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t total_elements = rows * cols;
+
+    if (idx < total_elements) {
+        size_t row = idx / cols;
+        size_t col = idx % cols;
+
+        // Load the input (threads covering different cols of the same row all
+        // read the same input[row], which is L1-cache friendly)
+        float x = static_cast<float>(input[row]);
+
+        // Compute the exponent
+        // increasing=true:  x^0, x^1, ..., x^(N-1)
+        // increasing=false: x^(N-1), ..., x^1, x^0
+        float power = increasing ? static_cast<float>(col)
+                                 : static_cast<float>(cols - 1 - col);
+
+        float res = powf(x, power);
+
+        output[idx] = static_cast<T>(res);
+    }
+}
+
+// ==================================================================
+// 3. Kernel launch logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const VanderInfo &info,
+    void *stream) {
+
+    // 1. Set up pointers
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto mc_stream = reinterpret_cast<mcStream_t>(stream);
+
+    // 2. Set up parameters
+    size_t rows = info.rows();
+    size_t cols = info.cols();
+    bool increasing = info.increasing();
+
+    // Compute the grid size
+    size_t total_elements = rows * cols;
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+
+    // 3. Launch the kernel
+    vander_kernel<T>
+        <<<grid_size, block_size, 0, mc_stream>>>(
+            out_ptr, in_ptr, rows, cols, increasing);
+}
+
+// ==================================================================
+// 4. Descriptor implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    int N,
+    int increasing) {
+
+    // 1. Create and validate the info struct
+    auto info_result = VanderInfo::create(out_desc, input_desc, N, increasing);
+    if (!info_result) return info_result.status();
+
+    // 2. Create the descriptor
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        info_result.take(),
+        workspace_size,
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+
+    // Dispatch the kernel by data type
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<__half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<__maca_bfloat16>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::vander::metax
\ No newline at end of file
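All three GPU backends size the launch with the same ceiling-division idiom. A quick standalone check (plain C++, not part of the diff):

#include <cstddef>
#include <cstdio>

// Ceiling division used to compute grid_size above.
constexpr std::size_t ceil_div(std::size_t total, std::size_t block) {
    return (total + block - 1) / block;
}

int main() {
    // 1000 elements with block_size = 256 -> 4 blocks = 1024 threads;
    // the 24 surplus threads fail the idx < total_elements guard and exit.
    std::printf("%zu\n", ceil_div(1000, 256)); // prints 4
}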
diff --git a/src/infiniop/ops/vander/moore/vander_moore.h b/src/infiniop/ops/vander/moore/vander_moore.h
new file mode 100644
index 000000000..57d1e4d8e
--- /dev/null
+++ b/src/infiniop/ops/vander/moore/vander_moore.h
@@ -0,0 +1,8 @@
+#ifndef __VANDER_MOORE_H__
+#define __VANDER_MOORE_H__
+
+#include "../vander.h"
+
+DESCRIPTOR(moore)
+
+#endif // __VANDER_MOORE_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/moore/vander_moore.mu b/src/infiniop/ops/vander/moore/vander_moore.mu
new file mode 100644
index 000000000..6f9db92bb
--- /dev/null
+++ b/src/infiniop/ops/vander/moore/vander_moore.mu
@@ -0,0 +1,109 @@
+#include "vander_moore.h"
+#include "vander_moore_kernel.h"
+#include "../../../devices/moore/moore_handle.h"
+#include <musa_runtime.h>
+#include <cstddef>
+
+namespace op::vander::moore {
+
+// ==================================================================
+// Kernel launch logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const VanderInfo &info,
+    void *stream) {
+
+    // 1. Set up pointers
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto out_ptr = reinterpret_cast<T *>(output);
+
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+
+    // 2. Set up parameters
+    size_t rows = info.rows();
+    size_t cols = info.cols();
+    bool increasing = info.increasing();
+
+    // The total element count determines the grid size
+    size_t total_elements = rows * cols;
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+
+    // Launch the Moore kernel
+    op::vander::moore::vander_kernel<T>
+        <<<grid_size, block_size, 0, musa_stream>>>(
+            out_ptr, in_ptr, rows, cols, increasing);
+}
+
+// ==================================================================
+// Descriptor implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    int N,
+    int increasing) {
+
+    // 1. Create and validate the info struct
+    auto info_result = VanderInfo::create(out_desc, input_desc, N, increasing);
+    if (!info_result) return info_result.status();
+
+    // 2. Create the descriptor
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        info_result.take(),
+        workspace_size,
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+
+    // Dispatch the kernel by data type
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<__mt_bfloat16>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::vander::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/moore/vander_moore_kernel.h b/src/infiniop/ops/vander/moore/vander_moore_kernel.h
new file mode 100644
index 000000000..82d8a246c
--- /dev/null
+++ b/src/infiniop/ops/vander/moore/vander_moore_kernel.h
@@ -0,0 +1,61 @@
+#ifndef __VANDER_MOORE_KERNEL_H__
+#define __VANDER_MOORE_KERNEL_H__
+
+#include <musa_runtime.h>
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <cstddef>
+#include <cmath>
+#include <type_traits>
+
+namespace op::vander::moore {
+
+// ==================================================================
+// Core kernel
+// ==================================================================
+template <typename T>
+__global__ void vander_kernel(
+    T * __restrict__ output,      // [rows, cols]
+    const T * __restrict__ input, // [rows]
+    size_t rows,
+    size_t cols,
+    bool increasing) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t total_elements = rows * cols;
+
+    if (idx < total_elements) {
+        size_t row = idx / cols;
+        size_t col = idx % cols;
+
+        // 1. Read the input and convert it to float
+        T in_val = input[row];
+        float x;
+
+        if constexpr (std::is_same_v<T, half>) {
+            x = __half2float(in_val);
+        } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            x = __bfloat162float(in_val);
+        } else {
+            x = static_cast<float>(in_val);
+        }
+
+        // 2. Compute the power
+        float power = increasing ? static_cast<float>(col)
+                                 : static_cast<float>(cols - 1 - col);
+        float res = powf(x, power);
+
+        // 3. Write the result back
+        if constexpr (std::is_same_v<T, half>) {
+            output[idx] = __float2half(res);
+        } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            output[idx] = __float2bfloat16(res);
+        } else {
+            output[idx] = static_cast<T>(res);
+        }
+    }
+}
+
+} // namespace op::vander::moore
+
+#endif // __VANDER_MOORE_KERNEL_H__
\ No newline at end of file
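The if constexpr chain in the Moore kernel is what lets one template call type-specific intrinsics such as __half2float: the non-matching branch is discarded at compile time instead of being type-checked against the wrong T. A host-side illustration of the same pattern (plain C++, not part of the diff):

#include <string>
#include <type_traits>

// The discarded branch is never instantiated, so a call that only exists
// for one T (here .substr, standing in for __half2float) still compiles.
template <typename T>
std::string describe(T value) {
    if constexpr (std::is_same_v<T, std::string>) {
        return value.substr(0, 8); // valid only when T is std::string
    } else {
        return "not a string";
    }
}
// describe(42) compiles and returns "not a string".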
diff --git a/src/infiniop/ops/vander/nvidia/vander_nvidia.cu b/src/infiniop/ops/vander/nvidia/vander_nvidia.cu
new file mode 100644
index 000000000..00bf41872
--- /dev/null
+++ b/src/infiniop/ops/vander/nvidia/vander_nvidia.cu
@@ -0,0 +1,108 @@
+#include "vander_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include <cuda_runtime.h>
+#include <cstddef>
+
+namespace op::vander::nvidia {
+
+// ==================================================================
+// Kernel launch logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const VanderInfo &info,
+    void *stream) {
+
+    // 1. Set up pointers
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto out_ptr = reinterpret_cast<T *>(output);
+
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    // 2. Set up parameters
+    size_t rows = info.rows();
+    size_t cols = info.cols();
+    bool increasing = info.increasing();
+
+    // The total element count determines the grid size
+    size_t total_elements = rows * cols;
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+
+    // Launch the CUDA kernel
+    op::vander::cuda::vander_kernel<T>
+        <<<grid_size, block_size, 0, cuda_stream>>>(
+            out_ptr, in_ptr, rows, cols, increasing);
+}
+
+// ==================================================================
+// Descriptor implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    int N,
+    int increasing) {
+
+    // 1. Create and validate the info struct
+    auto info_result = VanderInfo::create(out_desc, input_desc, N, increasing);
+    if (!info_result) return info_result.status();
+
+    // 2. Create the descriptor
+    // Vander is an element-wise expansion, so no extra workspace is needed
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        info_result.take(),
+        workspace_size,
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+
+    // Dispatch the kernel by data type
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<nv_bfloat16>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::vander::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/nvidia/vander_nvidia.cuh b/src/infiniop/ops/vander/nvidia/vander_nvidia.cuh
new file mode 100644
index 000000000..b9da220ef
--- /dev/null
+++ b/src/infiniop/ops/vander/nvidia/vander_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __VANDER_NVIDIA_CUH__
+#define __VANDER_NVIDIA_CUH__
+
+#include "../vander.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __VANDER_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/operator.cc b/src/infiniop/ops/vander/operator.cc
new file mode 100644
index 000000000..715e769dd
--- /dev/null
+++ b/src/infiniop/ops/vander/operator.cc
@@ -0,0 +1,181 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/vander.h"
+
+// --- Backend implementation headers ---
+#ifdef ENABLE_CPU_API
+#include "cpu/vander_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/vander_nvidia.cuh"
+#endif
+
+#ifdef ENABLE_METAX_API
+#include "metax/vander_metax.h"
+#endif
+
+#ifdef ENABLE_MOORE_API
+#include "moore/vander_moore.h"
+#endif
+
+extern "C" {
+
+// =======================================================================
+// 1. Create the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopCreateVanderDescriptor(
+    infiniopHandle_t handle,
+    infiniopVanderDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output,
+    infiniopTensorDescriptor_t input,
+    int N,
+    int increasing) {
+
+    // Local macro to keep the multi-backend dispatch concise
+    #define CREATE(CASE, NAMESPACE)                                               \
+        case CASE:                                                                \
+            return op::vander::NAMESPACE::Descriptor::create(                     \
+                handle,                                                           \
+                reinterpret_cast<op::vander::NAMESPACE::Descriptor **>(desc_ptr), \
+                output,                                                           \
+                input,                                                            \
+                N,                                                                \
+                increasing)
+
+    switch (handle->device) {
+    #ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CREATE
+}
+
+// =======================================================================
+// 2. Query the workspace size
+// =======================================================================
+__C infiniStatus_t infiniopGetVanderWorkspaceSize(infiniopVanderDescriptor_t desc, size_t *size) {
+
+    #define GET(CASE, NAMESPACE)                                                                      \
+        case CASE:                                                                                    \
+            *size = reinterpret_cast<op::vander::NAMESPACE::Descriptor *>(desc)->workspaceSize();     \
+            return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef GET
+}
+
+// =======================================================================
+// 3. Run the computation (calculate)
+// =======================================================================
+__C infiniStatus_t infiniopVander(
+    infiniopVanderDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE)                                           \
+        case CASE:                                                               \
+            return reinterpret_cast<op::vander::NAMESPACE::Descriptor *>(desc)   \
+                ->calculate(workspace, workspace_size, output, input, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+
+// =======================================================================
+// 4. Destroy the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyVanderDescriptor(infiniopVanderDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE)                                              \
+        case CASE:                                                               \
+            delete reinterpret_cast<op::vander::NAMESPACE::Descriptor *>(desc);  \
+            return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/vander.h b/src/infiniop/ops/vander/vander.h
new file mode 100644
index 000000000..0eeaaea50
--- /dev/null
+++ b/src/infiniop/ops/vander/vander.h
@@ -0,0 +1,49 @@
+#ifndef __VANDER_H__
+#define __VANDER_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the VanderInfo definition
+
+// Macro that generates the Descriptor class inside each backend namespace
+#define DESCRIPTOR(NAMESPACE)                                     \
+    namespace op::vander::NAMESPACE {                             \
+    class Descriptor final : public InfiniopDescriptor {          \
+        struct Opaque;                                            \
+        Opaque *_opaque;                                          \
+        VanderInfo _info;                                         \
+        size_t _workspace_size;                                   \
+                                                                  \
+        Descriptor(                                               \
+            Opaque *opaque,                                       \
+            VanderInfo info,                                      \
+            size_t workspace_size,                                \
+            infiniDevice_t device_type,                           \
+            int device_id)                                        \
+            : InfiniopDescriptor{device_type, device_id},         \
+              _opaque(opaque),                                    \
+              _info(info),                                        \
+              _workspace_size(workspace_size) {}                  \
+                                                                  \
+    public:                                                       \
+        ~Descriptor();                                            \
+                                                                  \
+        size_t workspaceSize() const { return _workspace_size; }  \
+                                                                  \
+        static infiniStatus_t create(                             \
+            infiniopHandle_t handle,                              \
+            Descriptor **desc_ptr,                                \
+            infiniopTensorDescriptor_t out_desc,                  \
+            infiniopTensorDescriptor_t input_desc,                \
+            int N,                                                \
+            int increasing);                                      \
+                                                                  \
+        infiniStatus_t calculate(                                 \
+            void *workspace,                                      \
+            size_t workspace_size,                                \
+            void *output,                                         \
+            const void *input,                                    \
+            void *stream) const;                                  \
+    };                                                            \
+    }
+
+#endif // __VANDER_H__
\ No newline at end of file
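For orientation, DESCRIPTOR(cpu) above stamps out roughly the following class in op::vander::cpu (abridged from the macro; the private constructor is elided):

namespace op::vander::cpu {
class Descriptor final : public InfiniopDescriptor {
    struct Opaque;
    Opaque *_opaque;
    VanderInfo _info;
    size_t _workspace_size;
    // ... private constructor ...
public:
    ~Descriptor();
    size_t workspaceSize() const { return _workspace_size; }
    static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr,
                                 infiniopTensorDescriptor_t out_desc,
                                 infiniopTensorDescriptor_t input_desc,
                                 int N, int increasing);
    infiniStatus_t calculate(void *workspace, size_t workspace_size,
                             void *output, const void *input, void *stream) const;
};
} // namespace op::vander::cpu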
diff --git a/test/infinicore/ops/logcumsumexp.py b/test/infinicore/ops/logcumsumexp.py
index 82514064b..385c592f1 100644
--- a/test/infinicore/ops/logcumsumexp.py
+++ b/test/infinicore/ops/logcumsumexp.py
@@ -77,9 +77,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.logcumsumexp(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.logcumsumexp(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.logcumsumexp(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/logical_and.py b/test/infinicore/ops/logical_and.py
index 85f5438b9..caa976f0c 100644
--- a/test/infinicore/ops/logical_and.py
+++ b/test/infinicore/ops/logical_and.py
@@ -109,9 +109,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.logical_and(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.logical_and(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.logical_and(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/logical_not.py b/test/infinicore/ops/logical_not.py
index c2e0d363d..e7857bcf7 100644
--- a/test/infinicore/ops/logical_not.py
+++ b/test/infinicore/ops/logical_not.py
@@ -93,9 +93,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.logical_not(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.logical_not(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.logical_not(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/unfold.py b/test/infinicore/ops/unfold.py
index 2d2b30603..384c63506 100644
--- a/test/infinicore/ops/unfold.py
+++ b/test/infinicore/ops/unfold.py
@@ -72,9 +72,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.unfold(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.unfold(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nn.functional.unfold(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/vander.py b/test/infinicore/ops/vander.py
index 25040d474..02465d8aa 100644
--- a/test/infinicore/ops/vander.py
+++ b/test/infinicore/ops/vander.py
@@ -60,9 +60,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.vander(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.vander(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.vander(*args, **kwargs)
 
 
 def main():
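Taken together, the C API in operator.cc follows the usual create, query-workspace, run, destroy lifecycle. A hedged usage sketch (run_vander is a hypothetical wrapper; the handle, tensor descriptors, device buffers, and stream are assumed to be set up elsewhere — only the four Vander entry points come from this patch):

// Minimal sketch of driving the vander operator through the C API.
infiniStatus_t run_vander(infiniopHandle_t handle,
                          infiniopTensorDescriptor_t out_desc,
                          infiniopTensorDescriptor_t in_desc,
                          void *d_out, const void *d_in, void *stream) {
    infiniopVanderDescriptor_t desc;
    int N = 0;          // <= 0: column count defaults to the input length
    int increasing = 0; // 0 = decreasing powers (the torch.vander default)

    infiniStatus_t status = infiniopCreateVanderDescriptor(
        handle, &desc, out_desc, in_desc, N, increasing);
    if (status != INFINI_STATUS_SUCCESS) return status;

    size_t workspace_size = 0;
    infiniopGetVanderWorkspaceSize(desc, &workspace_size); // 0 for vander

    status = infiniopVander(desc, /*workspace=*/nullptr, workspace_size,
                            d_out, d_in, stream);
    infiniopDestroyVanderDescriptor(desc);
    return status;
}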