diff --git a/include/infinicore/ops/logical_or.hpp b/include/infinicore/ops/logical_or.hpp new file mode 100644 index 000000000..6342aeda3 --- /dev/null +++ b/include/infinicore/ops/logical_or.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class LogicalOr { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor c, Tensor a, Tensor b); + static common::OpDispatcher &dispatcher(); +}; + +Tensor logical_or(Tensor a, Tensor b); +void logical_or_(Tensor c, Tensor a, Tensor b); +} // namespace infinicore::op + diff --git a/include/infinicore/ops/logical_xor.hpp b/include/infinicore/ops/logical_xor.hpp new file mode 100644 index 000000000..e9624bdd6 --- /dev/null +++ b/include/infinicore/ops/logical_xor.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class LogicalXor { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor c, Tensor a, Tensor b); + static common::OpDispatcher &dispatcher(); +}; + +Tensor logical_xor(Tensor a, Tensor b); +void logical_xor_(Tensor c, Tensor a, Tensor b); +} // namespace infinicore::op + diff --git a/include/infinicore/ops/logsigmoid.hpp b/include/infinicore/ops/logsigmoid.hpp new file mode 100644 index 000000000..ba7af1699 --- /dev/null +++ b/include/infinicore/ops/logsigmoid.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class LogSigmoid { +public: + using schema = void (*)(Tensor, Tensor); + static void execute(Tensor output, Tensor input); + static common::OpDispatcher &dispatcher(); +}; + +Tensor logsigmoid(Tensor input); +void logsigmoid_(Tensor output, Tensor input); +} // namespace infinicore::op + diff --git a/include/infinicore/ops/vdot.hpp b/include/infinicore/ops/vdot.hpp new file mode 100644 index 000000000..ba2b9550c --- /dev/null +++ b/include/infinicore/ops/vdot.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Vdot { +public: + using schema = void (*)(Tensor out, Tensor a, Tensor b); + static void execute(Tensor out, Tensor a, Tensor b); + static common::OpDispatcher &dispatcher(); +}; + +Tensor vdot(Tensor a, Tensor b); +void vdot_(Tensor out, Tensor a, Tensor b); + +} // namespace infinicore::op + + diff --git a/include/infinicore/ops/where.hpp b/include/infinicore/ops/where.hpp new file mode 100644 index 000000000..85bfce417 --- /dev/null +++ b/include/infinicore/ops/where.hpp @@ -0,0 +1,30 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { + +class Where { +public: + using schema = void (*)(Tensor, Tensor, Tensor, Tensor); + static void execute(Tensor out, Tensor cond, Tensor x, Tensor y); + static common::OpDispatcher &dispatcher(); +}; + +Tensor where(Tensor cond, Tensor x, Tensor y); +void where_(Tensor out, Tensor cond, Tensor x, Tensor y); + +class WhereIndices { +public: + using schema = std::vector (*)(Tensor); + static std::vector execute(Tensor cond); + static common::OpDispatcher &dispatcher(); +}; + +// where(cond) -> tuple of index tensors +std::vector where_indices(Tensor cond); + +} // namespace infinicore::op + + diff --git a/include/infiniop.h b/include/infiniop.h index 92e6f5963..cc221682d 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -11,6 +11,9 @@ #include "infiniop/ops/gelu.h" #include 
"infiniop/ops/gemm.h" #include "infiniop/ops/layer_norm.h" +#include "infiniop/ops/logical_or.h" +#include "infiniop/ops/logical_xor.h" +#include "infiniop/ops/logsigmoid.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" #include "infiniop/ops/mul.h" @@ -29,6 +32,8 @@ #include "infiniop/ops/tanh.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" +#include "infiniop/ops/where.h" +#include "infiniop/ops/vdot.h" #include "infiniop/ops/zeros.h" #include "infiniop/tensor_descriptor.h" diff --git a/include/infiniop/ops/logical_or.h b/include/infiniop/ops/logical_or.h new file mode 100644 index 000000000..0c5351f63 --- /dev/null +++ b/include/infiniop/ops/logical_or.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_LOGICAL_OR_API_H__ +#define __INFINIOP_LOGICAL_OR_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLogicalOrDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogicalOrDescriptor(infiniopHandle_t handle, + infiniopLogicalOrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +__C __export infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLogicalOr(infiniopLogicalOrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc); + +#endif + diff --git a/include/infiniop/ops/logical_xor.h b/include/infiniop/ops/logical_xor.h new file mode 100644 index 000000000..fe6f9ba68 --- /dev/null +++ b/include/infiniop/ops/logical_xor.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_LOGICAL_XOR_API_H__ +#define __INFINIOP_LOGICAL_XOR_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLogicalXorDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogicalXorDescriptor(infiniopHandle_t handle, + infiniopLogicalXorDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + +__C __export infiniStatus_t infiniopGetLogicalXorWorkspaceSize(infiniopLogicalXorDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLogicalXor(infiniopLogicalXorDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogicalXorDescriptor(infiniopLogicalXorDescriptor_t desc); + +#endif + diff --git a/include/infiniop/ops/logsigmoid.h b/include/infiniop/ops/logsigmoid.h new file mode 100644 index 000000000..5037bc67e --- /dev/null +++ b/include/infiniop/ops/logsigmoid.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_LOGSIGMOID_API_H__ +#define __INFINIOP_LOGSIGMOID_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLogSigmoidDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogSigmoidDescriptor(infiniopHandle_t handle, + infiniopLogSigmoidDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetLogSigmoidWorkspaceSize(infiniopLogSigmoidDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLogSigmoid(infiniopLogSigmoidDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C 
__export infiniStatus_t infiniopDestroyLogSigmoidDescriptor(infiniopLogSigmoidDescriptor_t desc);
+
+#endif
+
diff --git a/include/infiniop/ops/vdot.h b/include/infiniop/ops/vdot.h
new file mode 100644
index 000000000..b486b8e72
--- /dev/null
+++ b/include/infiniop/ops/vdot.h
@@ -0,0 +1,34 @@
+#ifndef __INFINIOP_VDOT_API_H__
+#define __INFINIOP_VDOT_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopVdotDescriptor_t;
+
+// out = vdot(a, b)
+__C __export infiniStatus_t infiniopCreateVdotDescriptor(
+    infiniopHandle_t handle,
+    infiniopVdotDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t a_desc,
+    infiniopTensorDescriptor_t b_desc);
+
+__C __export infiniStatus_t infiniopGetVdotWorkspaceSize(
+    infiniopVdotDescriptor_t desc,
+    size_t *size);
+
+__C __export infiniStatus_t infiniopVdot(
+    infiniopVdotDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *a,
+    const void *b,
+    void *stream);
+
+__C __export infiniStatus_t infiniopDestroyVdotDescriptor(
+    infiniopVdotDescriptor_t desc);
+
+#endif // __INFINIOP_VDOT_API_H__
+
+
diff --git a/include/infiniop/ops/where.h b/include/infiniop/ops/where.h
new file mode 100644
index 000000000..196720cfb
--- /dev/null
+++ b/include/infiniop/ops/where.h
@@ -0,0 +1,60 @@
+#ifndef __INFINIOP_WHERE_API_H__
+#define __INFINIOP_WHERE_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopWhereDescriptor_t;
+
+// out = where(cond, x, y)
+__C __export infiniStatus_t infiniopCreateWhereDescriptor(
+    infiniopHandle_t handle,
+    infiniopWhereDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t cond_desc,
+    infiniopTensorDescriptor_t x_desc,
+    infiniopTensorDescriptor_t y_desc);
+
+__C __export infiniStatus_t infiniopGetWhereWorkspaceSize(
+    infiniopWhereDescriptor_t desc,
+    size_t *size);
+
+__C __export infiniStatus_t infiniopWhere(
+    infiniopWhereDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *cond,
+    const void *x,
+    const void *y,
+    void *stream);
+
+__C __export infiniStatus_t infiniopDestroyWhereDescriptor(
+    infiniopWhereDescriptor_t desc);
+
+// where(cond) -> indices tuple
+typedef struct InfiniopDescriptor *infiniopWhereIndicesDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateWhereIndicesDescriptor(
+    infiniopHandle_t handle,
+    infiniopWhereIndicesDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t cond_desc);
+
+__C __export infiniStatus_t infiniopGetWhereIndicesWorkspaceSize(
+    infiniopWhereIndicesDescriptor_t desc,
+    size_t *size);
+
+__C __export infiniStatus_t infiniopWhereIndices(
+    infiniopWhereIndicesDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void **outputs, // array of NDIM output tensor pointers
+    const void *cond,
+    void *stream,
+    size_t *num_true); // output: number of true elements
+
+__C __export infiniStatus_t infiniopDestroyWhereIndicesDescriptor(
+    infiniopWhereIndicesDescriptor_t desc);
+
+#endif
+
+
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 5c541ec3c..00aa7d066 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -41,6 +41,11 @@
 )
 from infinicore.ops.add import add
 from infinicore.ops.attention import attention
+from infinicore.ops.logical_or import logical_or
+from infinicore.ops.logical_xor import logical_xor
+from infinicore.ops.logsigmoid import logsigmoid
+from infinicore.ops.where import where
+from infinicore.ops.vdot import vdot
 from
infinicore.ops.matmul import matmul from infinicore.ops.mul import mul from infinicore.ops.narrow import narrow @@ -101,6 +106,11 @@ # Operations. "add", "attention", + "logical_or", + "logical_xor", + "logsigmoid", + "where", + "vdot", "matmul", "mul", "narrow", diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py index 255079790..5819a19e6 100644 --- a/python/infinicore/nn/functional/__init__.py +++ b/python/infinicore/nn/functional/__init__.py @@ -1,6 +1,7 @@ from .causal_softmax import causal_softmax from .embedding import embedding from .linear import linear +from .logsigmoid import logsigmoid from .random_sample import random_sample from .rms_norm import rms_norm from .rope import RopeAlgo, rope @@ -17,4 +18,5 @@ "embedding", "rope", "RopeAlgo", + "logsigmoid", ] diff --git a/python/infinicore/nn/functional/logsigmoid.py b/python/infinicore/nn/functional/logsigmoid.py new file mode 100644 index 000000000..e79f16f65 --- /dev/null +++ b/python/infinicore/nn/functional/logsigmoid.py @@ -0,0 +1,14 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def logsigmoid(input: Tensor, out=None) -> Tensor: + """Apply elementwise log-sigmoid.""" + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.logsigmoid(input, out=out) + if out is None: + return Tensor(_infinicore.logsigmoid(input._underlying)) + + _infinicore.logsigmoid_(out._underlying, input._underlying) + return out diff --git a/python/infinicore/nn/functional/silu.py b/python/infinicore/nn/functional/silu.py index 36c7e4e3e..b2f97d0c0 100644 --- a/python/infinicore/nn/functional/silu.py +++ b/python/infinicore/nn/functional/silu.py @@ -6,7 +6,7 @@ def silu(input: Tensor, inplace: bool = False, *, out=None) -> Tensor: r"""Apply the Sigmoid Linear Unit (SiLU) function, element-wise.""" - if infinicore.use_ntops and input.device.type in ("cuda", "musa") and out is None: + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): return infinicore.ntops.torch.silu(input, inplace=inplace) if inplace: diff --git a/python/infinicore/ops/logical_or.py b/python/infinicore/ops/logical_or.py new file mode 100644 index 000000000..b83e23c88 --- /dev/null +++ b/python/infinicore/ops/logical_or.py @@ -0,0 +1,15 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def logical_or(input, other, *, out=None): + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.logical_or(input, other, out=out) + + if out is None: + return Tensor(_infinicore.logical_or(input._underlying, other._underlying)) + + _infinicore.logical_or_(out._underlying, input._underlying, other._underlying) + + return out diff --git a/python/infinicore/ops/logical_xor.py b/python/infinicore/ops/logical_xor.py new file mode 100644 index 000000000..5065d6e8d --- /dev/null +++ b/python/infinicore/ops/logical_xor.py @@ -0,0 +1,15 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def logical_xor(input, other, *, out=None): + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.logical_xor(input, other, out=out) + + if out is None: + return Tensor(_infinicore.logical_xor(input._underlying, other._underlying)) + + _infinicore.logical_xor_(out._underlying, input._underlying, other._underlying) + + return out diff --git 
a/python/infinicore/ops/logsigmoid.py b/python/infinicore/ops/logsigmoid.py
new file mode 100644
index 000000000..3a0b85d39
--- /dev/null
+++ b/python/infinicore/ops/logsigmoid.py
@@ -0,0 +1,15 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def logsigmoid(input, *, out=None):
+    if infinicore.use_ntops and input.device.type in ("cuda", "musa"):
+        return infinicore.ntops.torch.logsigmoid(input, out=out)
+
+    if out is None:
+        return Tensor(_infinicore.logsigmoid(input._underlying))
+
+    _infinicore.logsigmoid_(out._underlying, input._underlying)
+
+    return out
diff --git a/python/infinicore/ops/vdot.py b/python/infinicore/ops/vdot.py
new file mode 100644
index 000000000..2928a1d20
--- /dev/null
+++ b/python/infinicore/ops/vdot.py
@@ -0,0 +1,11 @@
+import infinicore
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def vdot(a: Tensor, b: Tensor) -> Tensor:
+    """
+    InfiniCore vdot: 1D vector dot product, aligned with torch.vdot
+    for real-valued tensors (no complex conjugation).
+    """
+    return Tensor(_infinicore.vdot(a._underlying, b._underlying))
diff --git a/python/infinicore/ops/where.py b/python/infinicore/ops/where.py
new file mode 100644
index 000000000..552f3b1fb
--- /dev/null
+++ b/python/infinicore/ops/where.py
@@ -0,0 +1,39 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor, from_torch
+import infinicore
+
+import torch
+
+
+def where(*args, out=None):
+    """Elementwise where(cond, x, y) selection.
+
+    Two supported call patterns:
+    - where(cond, x, y) -> Tensor
+    - where(cond, x, y, out=...) -> out
+
+    The condition-only variant where(cond) returning indices is handled
+    natively by the underlying infiniop where_indices implementation.
+ """ + # condition-only mode: where(cond) -> indices tuple + if len(args) == 1: + cond = args[0] + + # Use native infiniop implementation + idx_tensors = _infinicore.where_indices(cond._underlying) + # Convert C++ Tensor objects to Python Tensor objects + return tuple(Tensor(t) for t in idx_tensors) + + if len(args) != 3: + raise TypeError("infinicore.where expects (cond, x, y)") + + cond, x, y = args + + if infinicore.use_ntops and x.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.where(cond, x, y, out=out) + + if out is None: + return Tensor(_infinicore.where(cond._underlying, x._underlying, y._underlying)) + + _infinicore.where_(out._underlying, cond._underlying, x._underlying, y._underlying) + return out diff --git a/python/infinicore/utils.py b/python/infinicore/utils.py index 094b2230e..4e6d3343c 100644 --- a/python/infinicore/utils.py +++ b/python/infinicore/utils.py @@ -23,6 +23,8 @@ def to_torch_dtype(infini_dtype): return torch.int64 elif infini_dtype == infinicore.uint8: return torch.uint8 + elif infini_dtype == infinicore.bool: + return torch.bool else: raise ValueError(f"Unsupported infinicore dtype: {infini_dtype}") @@ -45,6 +47,8 @@ def to_infinicore_dtype(torch_dtype): return infinicore.int64 elif torch_dtype == torch.uint8: return infinicore.uint8 + elif torch_dtype == torch.bool: + return infinicore.bool else: raise ValueError(f"Unsupported torch dtype: {torch_dtype}") diff --git a/src/infinicore/ops/logical_or/logical_or.cc b/src/infinicore/ops/logical_or/logical_or.cc new file mode 100644 index 000000000..597f38d88 --- /dev/null +++ b/src/infinicore/ops/logical_or/logical_or.cc @@ -0,0 +1,27 @@ +#include "infinicore/ops/logical_or.hpp" +#include "../../utils.hpp" +#include "infinicore/dtype.hpp" + +namespace infinicore::op { + +common::OpDispatcher &LogicalOr::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void LogicalOr::execute(Tensor c, Tensor a, Tensor b) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b); + infinicore::context::setDevice(c->device()); + dispatcher().lookup(c->device().getType())(c, a, b); +} + +Tensor logical_or(Tensor a, Tensor b) { + auto c = Tensor::empty(a->shape(), DataType::BOOL, a->device()); + logical_or_(c, a, b); + return c; +} + +void logical_or_(Tensor c, Tensor a, Tensor b) { + LogicalOr::execute(c, a, b); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/logical_or/logical_or_infiniop.cc b/src/infinicore/ops/logical_or/logical_or_infiniop.cc new file mode 100644 index 000000000..2398cae10 --- /dev/null +++ b/src/infinicore/ops/logical_or/logical_or_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/logical_or.hpp" +#include + +namespace infinicore::op::logical_or_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopLogicalOrDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyLogicalOrDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor c, Tensor a, Tensor b) { + size_t seed = hash_combine(c, a, b); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopLogicalOrDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateLogicalOrDescriptor( + context::getInfiniopHandle(c->device()), 
&desc, + c->desc(), a->desc(), b->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetLogicalOrWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopLogicalOr( + desc, workspace->data(), workspace_size, + c->data(), a->data(), b->data(), context::getStream())); +} + +static bool registered = []() { + LogicalOr::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::logical_or_impl::infiniop diff --git a/src/infinicore/ops/logical_xor/logical_xor.cc b/src/infinicore/ops/logical_xor/logical_xor.cc new file mode 100644 index 000000000..13270a90f --- /dev/null +++ b/src/infinicore/ops/logical_xor/logical_xor.cc @@ -0,0 +1,27 @@ +#include "infinicore/ops/logical_xor.hpp" +#include "../../utils.hpp" +#include "infinicore/dtype.hpp" + +namespace infinicore::op { + +common::OpDispatcher &LogicalXor::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void LogicalXor::execute(Tensor c, Tensor a, Tensor b) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b); + infinicore::context::setDevice(c->device()); + dispatcher().lookup(c->device().getType())(c, a, b); +} + +Tensor logical_xor(Tensor a, Tensor b) { + auto c = Tensor::empty(a->shape(), DataType::BOOL, a->device()); + logical_xor_(c, a, b); + return c; +} + +void logical_xor_(Tensor c, Tensor a, Tensor b) { + LogicalXor::execute(c, a, b); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/logical_xor/logical_xor_infiniop.cc b/src/infinicore/ops/logical_xor/logical_xor_infiniop.cc new file mode 100644 index 000000000..514b69ca4 --- /dev/null +++ b/src/infinicore/ops/logical_xor/logical_xor_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/logical_xor.hpp" +#include + +namespace infinicore::op::logical_xor_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopLogicalXorDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyLogicalXorDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor c, Tensor a, Tensor b) { + size_t seed = hash_combine(c, a, b); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopLogicalXorDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateLogicalXorDescriptor( + context::getInfiniopHandle(c->device()), &desc, + c->desc(), a->desc(), b->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetLogicalXorWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopLogicalXor( + desc, workspace->data(), workspace_size, + c->data(), a->data(), b->data(), context::getStream())); +} + +static bool registered = []() { + LogicalXor::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::logical_xor_impl::infiniop diff --git a/src/infinicore/ops/logsigmoid/logsigmoid.cc b/src/infinicore/ops/logsigmoid/logsigmoid.cc new file mode 100644 index 000000000..a1e5580a0 --- /dev/null +++ 
b/src/infinicore/ops/logsigmoid/logsigmoid.cc @@ -0,0 +1,34 @@ +#include "infinicore/ops/logsigmoid.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &LogSigmoid::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void LogSigmoid::execute(Tensor output, Tensor input) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No LogSigmoid implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor logsigmoid(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + logsigmoid_(output, input); + return output; +} + +void logsigmoid_(Tensor output, Tensor input) { + LogSigmoid::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/logsigmoid/logsigmoid_infiniop.cc b/src/infinicore/ops/logsigmoid/logsigmoid_infiniop.cc new file mode 100644 index 000000000..9142032bf --- /dev/null +++ b/src/infinicore/ops/logsigmoid/logsigmoid_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/logsigmoid.hpp" +#include + +namespace infinicore::op::logsigmoid_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopLogSigmoidDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyLogSigmoidDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopLogSigmoidDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateLogSigmoidDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetLogSigmoidWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopLogSigmoid( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + LogSigmoid::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::logsigmoid_impl::infiniop diff --git a/src/infinicore/ops/vdot/vdot.cc b/src/infinicore/ops/vdot/vdot.cc new file mode 100644 index 000000000..24af3c279 --- /dev/null +++ b/src/infinicore/ops/vdot/vdot.cc @@ -0,0 +1,48 @@ +#include "infinicore/ops/vdot.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Vdot::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Vdot::execute(Tensor out, Tensor a, Tensor b) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, a, b); + + // Inputs must be 1D and same length + if (a->ndim() != 1 || b->ndim() != 1) { + throw std::runtime_error("vdot: input tensors must be 1D"); + } + if (a->shape()[0] != b->shape()[0]) { + 
throw std::runtime_error("vdot: input tensors must have the same length"); + } + + infinicore::context::setDevice(out->device()); + auto device_type = out->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error( + "No Vdot implementation found for device type: " + + std::to_string(static_cast(device_type))); + } + + func(out, a, b); +} + +Tensor vdot(Tensor a, Tensor b) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(a, b); + + // Output dtype equals input dtype for now + auto out = Tensor::empty({}, a->dtype(), a->device()); + vdot_(out, a, b); + return out; +} + +void vdot_(Tensor out, Tensor a, Tensor b) { + Vdot::execute(out, a, b); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/vdot/vdot_infiniop.cc b/src/infinicore/ops/vdot/vdot_infiniop.cc new file mode 100644 index 000000000..ff799aaad --- /dev/null +++ b/src/infinicore/ops/vdot/vdot_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/vdot.hpp" +#include + +namespace infinicore::op::vdot_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopVdotDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyVdotDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor a, Tensor b) { + size_t seed = hash_combine(out, a, b); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopVdotDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateVdotDescriptor( + context::getInfiniopHandle(out->device()), &desc, + out->desc(), a->desc(), b->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetVdotWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopVdot( + desc, workspace->data(), workspace_size, + out->data(), a->data(), b->data(), context::getStream())); +} + +static bool registered = []() { + Vdot::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::vdot_impl::infiniop diff --git a/src/infinicore/ops/where/where.cc b/src/infinicore/ops/where/where.cc new file mode 100644 index 000000000..2e00025b9 --- /dev/null +++ b/src/infinicore/ops/where/where.cc @@ -0,0 +1,61 @@ +#include "infinicore/ops/where.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Where::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Where::execute(Tensor out, Tensor cond, Tensor x, Tensor y) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, cond, x, y); + infinicore::context::setDevice(out->device()); + auto device_type = out->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error( + "No Where implementation found for device type: " + + std::to_string(static_cast(device_type))); + } + + func(out, cond, x, y); +} + +Tensor where(Tensor cond, Tensor x, Tensor y) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(cond, x, y); + // Output dtype follows x/y dtype + auto out = Tensor::empty(x->shape(), x->dtype(), x->device()); + where_(out, cond, x, y); + return 
out;
+}
+
+void where_(Tensor out, Tensor cond, Tensor x, Tensor y) {
+    Where::execute(out, cond, x, y);
+}
+
+common::OpDispatcher &WhereIndices::dispatcher() {
+    static common::OpDispatcher dispatcher_;
+    return dispatcher_;
+}
+
+std::vector WhereIndices::execute(Tensor cond) {
+    infinicore::context::setDevice(cond->device());
+    auto device_type = cond->device().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error(
+            "No WhereIndices implementation found for device type: " +
+            std::to_string(static_cast(device_type)));
+    }
+
+    return func(cond);
+}
+
+std::vector where_indices(Tensor cond) {
+    return WhereIndices::execute(cond);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/where/where_indices_infiniop.cc b/src/infinicore/ops/where/where_indices_infiniop.cc
new file mode 100644
index 000000000..0930f8034
--- /dev/null
+++ b/src/infinicore/ops/where/where_indices_infiniop.cc
@@ -0,0 +1,85 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/dtype.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/where.hpp"
+#include
+
+namespace infinicore::op::where_impl::infiniop {
+
+thread_local common::OpCache
+    indices_caches(100, [](infiniopWhereIndicesDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyWhereIndicesDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+std::vector calculate(Tensor cond) {
+    size_t seed = hash_combine(cond);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = indices_caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopWhereIndicesDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateWhereIndicesDescriptor(
+            context::getInfiniopHandle(cond->device()), &desc, cond->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(
+        infiniopGetWhereIndicesWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr workspace = context::allocateMemory(workspace_size);
+
+    size_t numel = cond->numel();
+    int ndim = static_cast(cond->ndim());
+
+    // Allocate each output at the maximum possible size (numel); the actual size is only known after the API call
+    std::vector outputs;
+    std::vector output_ptrs;
+
+    for (int dim = 0; dim < ndim; ++dim) {
+        auto out = Tensor::empty({numel}, DataType::I64, cond->device());
+        outputs.push_back(out);
+        output_ptrs.push_back(out->data());
+    }
+
+    // Call the infiniop API, which computes num_true
+    size_t num_true = 0;
+    INFINICORE_CHECK_ERROR(infiniopWhereIndices(
+        desc, workspace->data(), workspace_size, output_ptrs.data(), cond->data(),
+        context::getStream(), &num_true));
+
+    // Synchronize the stream to make sure the computation has finished
+    context::syncStream();
+
+    // If the actual num_true is smaller than numel, the output tensors must be shrunk.
+    // Tensor may not support resizing, so create new tensors and copy the data.
+    if (num_true < numel) {
+        std::vector resized_outputs;
+        for (int dim = 0; dim < ndim; ++dim) {
+            auto resized = Tensor::empty({num_true}, DataType::I64, cond->device());
+            // Copy the first num_true elements
+            resized->copy_from(outputs[dim]->narrow({{0, 0, num_true}}));
+            resized_outputs.push_back(resized);
+        }
+        return resized_outputs;
+    }
+
+    return outputs;
+}
+
+static bool registered = []() {
+    WhereIndices::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::where_impl::infiniop
diff --git a/src/infinicore/ops/where/where_infiniop.cc b/src/infinicore/ops/where/where_infiniop.cc
new file mode 100644
index
000000000..1cd807d39 --- /dev/null +++ b/src/infinicore/ops/where/where_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/where.hpp" +#include + +namespace infinicore::op::where_impl::infiniop { + +thread_local common::OpCache caches( + 100, + [](infiniopWhereDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyWhereDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor cond, Tensor x, Tensor y) { + size_t seed = hash_combine(out, cond, x, y); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopWhereDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateWhereDescriptor( + context::getInfiniopHandle(out->device()), &desc, + out->desc(), cond->desc(), x->desc(), y->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetWhereWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopWhere( + desc, workspace->data(), workspace_size, + out->data(), cond->data(), x->data(), y->data(), context::getStream())); +} + +static bool registered = []() { + Where::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::where_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 978defa17..a015574a2 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -7,6 +7,9 @@ #include "ops/causal_softmax.hpp" #include "ops/embedding.hpp" #include "ops/linear.hpp" +#include "ops/logical_or.hpp" +#include "ops/logical_xor.hpp" +#include "ops/logsigmoid.hpp" #include "ops/matmul.hpp" #include "ops/mul.hpp" #include "ops/random_sample.hpp" @@ -14,7 +17,9 @@ #include "ops/rms_norm.hpp" #include "ops/rope.hpp" #include "ops/silu.hpp" +#include "ops/where.hpp" #include "ops/swiglu.hpp" +#include "ops/vdot.hpp" namespace py = pybind11; @@ -26,6 +31,9 @@ inline void bind(py::module &m) { bind_causal_softmax(m); bind_random_sample(m); bind_linear(m); + bind_logical_or(m); + bind_logical_xor(m); + bind_logsigmoid(m); bind_matmul(m); bind_mul(m); bind_rearrange(m); @@ -34,6 +42,8 @@ inline void bind(py::module &m) { bind_swiglu(m); bind_rope(m); bind_embedding(m); + bind_where(m); + bind_vdot(m); } } // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/logical_or.hpp b/src/infinicore/pybind11/ops/logical_or.hpp new file mode 100644 index 000000000..fef213e30 --- /dev/null +++ b/src/infinicore/pybind11/ops/logical_or.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include + +#include "infinicore/ops/logical_or.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_logical_or(py::module &m) { + m.def("logical_or", + &op::logical_or, + py::arg("a"), + py::arg("b"), + R"doc(Logical OR of two tensors.)doc"); + + m.def("logical_or_", + &op::logical_or_, + py::arg("c"), + py::arg("a"), + py::arg("b"), + R"doc(In-place logical OR of two tensors.)doc"); +} + +} // namespace infinicore::ops + diff --git a/src/infinicore/pybind11/ops/logical_xor.hpp b/src/infinicore/pybind11/ops/logical_xor.hpp new file mode 100644 index 000000000..53ef5361b --- 
/dev/null +++ b/src/infinicore/pybind11/ops/logical_xor.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include + +#include "infinicore/ops/logical_xor.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_logical_xor(py::module &m) { + m.def("logical_xor", + &op::logical_xor, + py::arg("a"), + py::arg("b"), + R"doc(Logical XOR of two tensors.)doc"); + + m.def("logical_xor_", + &op::logical_xor_, + py::arg("c"), + py::arg("a"), + py::arg("b"), + R"doc(In-place logical XOR of two tensors.)doc"); +} + +} // namespace infinicore::ops + diff --git a/src/infinicore/pybind11/ops/logsigmoid.hpp b/src/infinicore/pybind11/ops/logsigmoid.hpp new file mode 100644 index 000000000..7a56de01d --- /dev/null +++ b/src/infinicore/pybind11/ops/logsigmoid.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include + +#include "infinicore/ops/logsigmoid.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_logsigmoid(py::module &m) { + m.def("logsigmoid", + &op::logsigmoid, + py::arg("input"), + R"doc(LogSigmoid activation function.)doc"); + + m.def("logsigmoid_", + &op::logsigmoid_, + py::arg("output"), + py::arg("input"), + R"doc(In-place LogSigmoid activation function.)doc"); +} + +} // namespace infinicore::ops + diff --git a/src/infinicore/pybind11/ops/vdot.hpp b/src/infinicore/pybind11/ops/vdot.hpp new file mode 100644 index 000000000..63a4f85c0 --- /dev/null +++ b/src/infinicore/pybind11/ops/vdot.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include "infinicore/ops/vdot.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_vdot(py::module &m) { + m.def("vdot", + &op::vdot, + py::arg("a"), + py::arg("b"), + R"doc(Vector dot product for 1D tensors (real-valued).)doc"); +} + +} // namespace infinicore::ops + + diff --git a/src/infinicore/pybind11/ops/where.hpp b/src/infinicore/pybind11/ops/where.hpp new file mode 100644 index 000000000..0488ca662 --- /dev/null +++ b/src/infinicore/pybind11/ops/where.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include + +#include "infinicore/ops/where.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_where(py::module &m) { + m.def("where", + &op::where, + py::arg("cond"), + py::arg("x"), + py::arg("y"), + R"doc(Elementwise where(cond, x, y) selection.)doc"); + + m.def("where_", + &op::where_, + py::arg("out"), + py::arg("cond"), + py::arg("x"), + py::arg("y"), + R"doc(In-place elementwise where(cond, x, y) selection into out tensor.)doc"); + + m.def("where_indices", + &op::where_indices, + py::arg("cond"), + R"doc(Return a tuple of index tensors where condition is True.)doc"); +} + +} // namespace infinicore::ops + + diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc new file mode 100644 index 000000000..892b3d967 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc @@ -0,0 +1,115 @@ +#include "logical_or_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include + +namespace op::logical_or::cpu { + +// Track input dtype for each Descriptor so we can support both +// - out-of-place / inplace(out) with bool output +// - inplace(a) / inplace(b) with output dtype == input dtype (int32/uint8) +static std::unordered_map g_input_dtype; +static std::mutex g_input_dtype_mutex; + +Descriptor::~Descriptor() { + std::lock_guard lock(g_input_dtype_mutex); + g_input_dtype.erase(this); +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + 
infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + // True output dtype (memory layout of output buffer) + auto out_dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + // Input dtype(s) + auto a_dtype = a_desc->dtype(); + auto b_dtype = b_desc->dtype(); + + // Inputs must have the same dtype and be one of the supported ones + CHECK_OR_RETURN(a_dtype == b_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(a_dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I32, INFINI_DTYPE_U8); + + // Output must be either bool (standard case) or equal to input dtype + if (!(out_dtype == INFINI_DTYPE_BOOL || out_dtype == a_dtype)) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // Here we pass the *output* dtype into the descriptor, keeping the + // semantics of CREATE_ELEMENTWISE_CPU_DESCRIPTOR consistent with + // other elementwise ops. + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, out_dtype, out_desc, input_desc_vec); + + // Remember the common input dtype for this descriptor + { + std::lock_guard lock(g_input_dtype_mutex); + g_input_dtype[*desc_ptr] = a_dtype; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // _dtype now represents the output dtype (as in other elementwise ops) + auto out_dtype = _dtype; + + // Look up the input dtype for this descriptor + infiniDtype_t in_dtype; + { + std::lock_guard lock(g_input_dtype_mutex); + auto it = g_input_dtype.find(this); + if (it == g_input_dtype.end()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + in_dtype = it->second; + } + + // Case 1: boolean inputs (and thus boolean output) + if (in_dtype == INFINI_DTYPE_BOOL) { + // All bool: we can use the homogeneous path + return _device_info->calculate(_info, output, inputs, stream); + } + + // Case 2: integer inputs (int32 / uint8) + if (in_dtype == INFINI_DTYPE_I32) { + if (out_dtype == INFINI_DTYPE_BOOL) { + // Inputs int32, output bool + return _device_info->calculate(_info, output, inputs, stream); + } else if (out_dtype == INFINI_DTYPE_I32) { + // Inplace(a/b): inputs and output are int32 + // Use homogeneous path; LogicalOrOp returns bool which is + // implicitly converted to int32 (0/1). 
+ return _device_info->calculate(_info, output, inputs, stream); + } + } else if (in_dtype == INFINI_DTYPE_U8) { + if (out_dtype == INFINI_DTYPE_BOOL) { + // Inputs uint8, output bool + return _device_info->calculate(_info, output, inputs, stream); + } else if (out_dtype == INFINI_DTYPE_U8) { + // Inplace(a/b): inputs and output are uint8 + return _device_info->calculate(_info, output, inputs, stream); + } + } + + return INFINI_STATUS_BAD_TENSOR_DTYPE; +} +} // namespace op::logical_or::cpu diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h new file mode 100644 index 000000000..720837dbf --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h @@ -0,0 +1,24 @@ +#ifndef __LOGICAL_OR_CPU_H__ +#define __LOGICAL_OR_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_or, cpu) + +namespace op::logical_or::cpu { +typedef struct LogicalOrOp { +public: + static constexpr size_t num_inputs = 2; + template + bool operator()(const T &a, const T &b) const { + return static_cast(a) || static_cast(b); + } + // Support heterogeneous input types for elementwise framework + template + Tout operator()(const Ta &a, const Tb &b) const { + return static_cast(static_cast(a) || static_cast(b)); + } +} LogicalOrOp; +} // namespace op::logical_or::cpu + +#endif // __LOGICAL_OR_CPU_H__ diff --git a/src/infiniop/ops/logical_or/operator.cc b/src/infiniop/ops/logical_or/operator.cc new file mode 100644 index 000000000..544090081 --- /dev/null +++ b/src/infiniop/ops/logical_or/operator.cc @@ -0,0 +1,98 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/logical_or.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_or_cpu.h" +#endif + +__C infiniStatus_t infiniopCreateLogicalOrDescriptor( + infiniopHandle_t handle, + infiniopLogicalOrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_or::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLogicalOr( + infiniopLogicalOrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete 
reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.cc b/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.cc new file mode 100644 index 000000000..384473b49 --- /dev/null +++ b/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.cc @@ -0,0 +1,115 @@ +#include "logical_xor_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include + +namespace op::logical_xor::cpu { + +// Track input dtype for each Descriptor so we can support both +// - out-of-place / inplace(out) with bool output +// - inplace(a) / inplace(b) with output dtype == input dtype (int32/uint8) +static std::unordered_map g_input_dtype; +static std::mutex g_input_dtype_mutex; + +Descriptor::~Descriptor() { + std::lock_guard lock(g_input_dtype_mutex); + g_input_dtype.erase(this); +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + // True output dtype (memory layout of output buffer) + auto out_dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + // Input dtype(s) + auto a_dtype = a_desc->dtype(); + auto b_dtype = b_desc->dtype(); + + // Inputs must have the same dtype and be one of the supported ones + CHECK_OR_RETURN(a_dtype == b_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(a_dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I32, INFINI_DTYPE_U8); + + // Output must be either bool (standard case) or equal to input dtype + if (!(out_dtype == INFINI_DTYPE_BOOL || out_dtype == a_dtype)) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // Here we pass the *output* dtype into the descriptor, keeping the + // semantics of CREATE_ELEMENTWISE_CPU_DESCRIPTOR consistent with + // other elementwise ops. 
+ CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, out_dtype, out_desc, input_desc_vec); + + // Remember the common input dtype for this descriptor + { + std::lock_guard lock(g_input_dtype_mutex); + g_input_dtype[*desc_ptr] = a_dtype; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // _dtype now represents the output dtype (as in other elementwise ops) + auto out_dtype = _dtype; + + // Look up the input dtype for this descriptor + infiniDtype_t in_dtype; + { + std::lock_guard lock(g_input_dtype_mutex); + auto it = g_input_dtype.find(this); + if (it == g_input_dtype.end()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + in_dtype = it->second; + } + + // Case 1: boolean inputs (and thus boolean output) + if (in_dtype == INFINI_DTYPE_BOOL) { + // All bool: we can use the homogeneous path + return _device_info->calculate(_info, output, inputs, stream); + } + + // Case 2: integer inputs (int32 / uint8) + if (in_dtype == INFINI_DTYPE_I32) { + if (out_dtype == INFINI_DTYPE_BOOL) { + // Inputs int32, output bool + return _device_info->calculate(_info, output, inputs, stream); + } else if (out_dtype == INFINI_DTYPE_I32) { + // Inplace(a/b): inputs and output are int32 + // Use homogeneous path; LogicalXorOp returns bool which is + // implicitly converted to int32 (0/1). + return _device_info->calculate(_info, output, inputs, stream); + } + } else if (in_dtype == INFINI_DTYPE_U8) { + if (out_dtype == INFINI_DTYPE_BOOL) { + // Inputs uint8, output bool + return _device_info->calculate(_info, output, inputs, stream); + } else if (out_dtype == INFINI_DTYPE_U8) { + // Inplace(a/b): inputs and output are uint8 + return _device_info->calculate(_info, output, inputs, stream); + } + } + + return INFINI_STATUS_BAD_TENSOR_DTYPE; +} +} // namespace op::logical_xor::cpu diff --git a/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.h b/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.h new file mode 100644 index 000000000..35747d742 --- /dev/null +++ b/src/infiniop/ops/logical_xor/cpu/logical_xor_cpu.h @@ -0,0 +1,24 @@ +#ifndef __LOGICAL_XOR_CPU_H__ +#define __LOGICAL_XOR_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_xor, cpu) + +namespace op::logical_xor::cpu { +typedef struct LogicalXorOp { +public: + static constexpr size_t num_inputs = 2; + template + bool operator()(const T &a, const T &b) const { + return static_cast(a) != static_cast(b); + } + // Support heterogeneous input types for elementwise framework + template + Tout operator()(const Ta &a, const Tb &b) const { + return static_cast(static_cast(a) != static_cast(b)); + } +} LogicalXorOp; +} // namespace op::logical_xor::cpu + +#endif // __LOGICAL_XOR_CPU_H__ diff --git a/src/infiniop/ops/logical_xor/operator.cc b/src/infiniop/ops/logical_xor/operator.cc new file mode 100644 index 000000000..1558f4925 --- /dev/null +++ b/src/infiniop/ops/logical_xor/operator.cc @@ -0,0 +1,98 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/logical_xor.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_xor_cpu.h" +#endif + +__C infiniStatus_t infiniopCreateLogicalXorDescriptor( + infiniopHandle_t handle, + infiniopLogicalXorDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return 
op::logical_xor::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalXorWorkspaceSize(infiniopLogicalXorDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLogicalXor( + infiniopLogicalXorDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyLogicalXorDescriptor(infiniopLogicalXorDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/logsigmoid/cpu/logsigmoid_cpu.cc b/src/infiniop/ops/logsigmoid/cpu/logsigmoid_cpu.cc new file mode 100644 index 000000000..1fdd2207b --- /dev/null +++ b/src/infiniop/ops/logsigmoid/cpu/logsigmoid_cpu.cc @@ -0,0 +1,51 @@ +#include "logsigmoid_cpu.h" + +namespace op::logsigmoid::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logsigmoid::cpu diff --git a/src/infiniop/ops/logsigmoid/cpu/logsigmoid_cpu.h b/src/infiniop/ops/logsigmoid/cpu/logsigmoid_cpu.h new file mode 100644 index 000000000..bdfaca385 --- 
/dev/null +++ b/src/infiniop/ops/logsigmoid/cpu/logsigmoid_cpu.h @@ -0,0 +1,20 @@ +#ifndef __LOGSIGMOID_CPU_H__ +#define __LOGSIGMOID_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logsigmoid, cpu) + +namespace op::logsigmoid::cpu { +typedef struct LogSigmoidOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + // logsigmoid(x) = log(sigmoid(x)) = -log(1 + exp(-x)) + return -std::log(T(1) + std::exp(-x)); + } +} LogSigmoidOp; +} // namespace op::logsigmoid::cpu + +#endif // __LOGSIGMOID_CPU_H__ diff --git a/src/infiniop/ops/logsigmoid/operator.cc b/src/infiniop/ops/logsigmoid/operator.cc new file mode 100644 index 000000000..5409a672a --- /dev/null +++ b/src/infiniop/ops/logsigmoid/operator.cc @@ -0,0 +1,100 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/logsigmoid.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logsigmoid_cpu.h" +#endif + +__C infiniStatus_t infiniopCreateLogSigmoidDescriptor( + infiniopHandle_t handle, + infiniopLogSigmoidDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logsigmoid::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogSigmoidWorkspaceSize(infiniopLogSigmoidDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLogSigmoid( + infiniopLogSigmoidDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLogSigmoidDescriptor(infiniopLogSigmoidDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/vdot/cpu/vdot_cpu.cc b/src/infiniop/ops/vdot/cpu/vdot_cpu.cc new file mode 100644 index 000000000..e3511ebba --- /dev/null +++ b/src/infiniop/ops/vdot/cpu/vdot_cpu.cc @@ -0,0 +1,135 @@ +#include "vdot_cpu.h" +#include "../../../../utils.h" + +namespace op::vdot::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + + auto handle = reinterpret_cast(handle_); + auto in_dtype = a_desc->dtype(); + 
auto b_dtype = b_desc->dtype(); + auto out_dtype = out_desc->dtype(); + + // Inputs must be 1D vectors with same length + if (a_desc->ndim() != 1 || b_desc->ndim() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (a_desc->numel() != b_desc->numel()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // Input dtypes must match and be in supported set + CHECK_OR_RETURN(in_dtype == b_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE( + in_dtype, + INFINI_DTYPE_F16, + INFINI_DTYPE_F32, + INFINI_DTYPE_BF16); + + // Simplest: output dtype equals input dtype + CHECK_OR_RETURN(out_dtype == in_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + size_t length = a_desc->numel(); + ptrdiff_t a_stride = a_desc->stride(0); + ptrdiff_t b_stride = b_desc->stride(0); + + *desc_ptr = new Descriptor( + in_dtype, + out_dtype, + length, + a_stride, + b_stride, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +static void vdot_impl( + Tout *out, + const Tin *a_base, + const Tin *b_base, + size_t n, + ptrdiff_t a_stride, + ptrdiff_t b_stride) { + + if constexpr (std::is_same_v || std::is_same_v) { + // Accumulate in float for half/bfloat16, then cast back + float acc = 0.0f; + +#pragma omp parallel for reduction(+ : acc) + for (ptrdiff_t i = 0; i < static_cast(n); ++i) { + const Tin &av = a_base[i * a_stride]; + const Tin &bv = b_base[i * b_stride]; + float av_f = utils::cast(av); + float bv_f = utils::cast(bv); + acc += av_f * bv_f; + } + + *out = utils::cast(acc); + } else { + Tout acc{}; + +#pragma omp parallel for reduction(+ : acc) + for (ptrdiff_t i = 0; i < static_cast(n); ++i) { + const Tin &av = a_base[i * a_stride]; + const Tin &bv = b_base[i * b_stride]; + acc += static_cast(av) * static_cast(bv); + } + + *out = acc; + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *out, + const void *a, + const void *b, + void *stream) const { + + switch (_in_dtype) { + case INFINI_DTYPE_F16: + vdot_impl( + reinterpret_cast(out), + reinterpret_cast(a), + reinterpret_cast(b), + _length, + _a_stride, + _b_stride); + break; + case INFINI_DTYPE_F32: + vdot_impl( + reinterpret_cast(out), + reinterpret_cast(a), + reinterpret_cast(b), + _length, + _a_stride, + _b_stride); + break; + case INFINI_DTYPE_BF16: + vdot_impl( + reinterpret_cast(out), + reinterpret_cast(a), + reinterpret_cast(b), + _length, + _a_stride, + _b_stride); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::vdot::cpu diff --git a/src/infiniop/ops/vdot/cpu/vdot_cpu.h b/src/infiniop/ops/vdot/cpu/vdot_cpu.h new file mode 100644 index 000000000..08a01ff6e --- /dev/null +++ b/src/infiniop/ops/vdot/cpu/vdot_cpu.h @@ -0,0 +1,54 @@ +#ifndef __VDOT_CPU_H__ +#define __VDOT_CPU_H__ + +#include "../../../devices/cpu/common_cpu.h" +#include "../../../operator.h" +#include "../../../tensor.h" + +namespace op::vdot::cpu { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _in_dtype; + infiniDtype_t _out_dtype; + size_t _length; + ptrdiff_t _a_stride; + ptrdiff_t _b_stride; + +public: + Descriptor(infiniDtype_t in_dtype, + infiniDtype_t out_dtype, + size_t length, + ptrdiff_t a_stride, + ptrdiff_t b_stride, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _in_dtype(in_dtype), + _out_dtype(out_dtype), + _length(length), + _a_stride(a_stride), + _b_stride(b_stride) {} + + ~Descriptor(); + + size_t workspaceSize() const { return 0; } + + 
static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *out, + const void *a, + const void *b, + void *stream) const; +}; + +} // namespace op::vdot::cpu + +#endif // __VDOT_CPU_H__ diff --git a/src/infiniop/ops/vdot/cuda/kernel.cuh b/src/infiniop/ops/vdot/cuda/kernel.cuh new file mode 100644 index 000000000..7e12ec090 --- /dev/null +++ b/src/infiniop/ops/vdot/cuda/kernel.cuh @@ -0,0 +1,43 @@ +#ifndef __VDOT_CUDA_KERNEL_CUH__ +#define __VDOT_CUDA_KERNEL_CUH__ + +#include +// 不在这里包含 cuda_runtime.h,让各个平台的头文件提供必要的定义 + +namespace op::vdot::cuda { + +template +__global__ void vdotKernel(Tcompute *out, const Tdata *a, const Tdata *b, + size_t length, ptrdiff_t a_stride, + ptrdiff_t b_stride) { + + // 每个线程计算部分点积 + Tcompute local_sum = 0; + for (size_t i = threadIdx.x; i < length; i += BLOCK_SIZE) { + Tcompute a_val = static_cast(a[i * a_stride]); + Tcompute b_val = static_cast(b[i * b_stride]); + local_sum += a_val * b_val; + } + + // 使用共享内存进行 block 内归约(不依赖 CUB) + __shared__ Tcompute sdata[BLOCK_SIZE]; + sdata[threadIdx.x] = local_sum; + __syncthreads(); + + // 标准的二分归约算法 + for (unsigned int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (threadIdx.x < s) { + sdata[threadIdx.x] += sdata[threadIdx.x + s]; + } + __syncthreads(); + } + + // Thread 0 写入结果 + if (threadIdx.x == 0) { + *out = sdata[0]; + } +} + +} // namespace op::vdot::cuda + +#endif // __VDOT_CUDA_KERNEL_CUH__ diff --git a/src/infiniop/ops/vdot/metax/vdot_metax.h b/src/infiniop/ops/vdot/metax/vdot_metax.h new file mode 100644 index 000000000..bd589e79c --- /dev/null +++ b/src/infiniop/ops/vdot/metax/vdot_metax.h @@ -0,0 +1,45 @@ +#ifndef __VDOT_METAX_API_H__ +#define __VDOT_METAX_API_H__ + +#include "../../../devices/metax/metax_handle.h" +#include "../../../operator.h" +#include "../../../tensor.h" + +namespace op::vdot::metax { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _in_dtype; + infiniDtype_t _out_dtype; + size_t _length; + ptrdiff_t _a_stride; + ptrdiff_t _b_stride; + +public: + Descriptor(infiniDtype_t in_dtype, infiniDtype_t out_dtype, size_t length, + ptrdiff_t a_stride, ptrdiff_t b_stride, infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, _in_dtype(in_dtype), + _out_dtype(out_dtype), _length(length), _a_stride(a_stride), + _b_stride(b_stride) {} + + ~Descriptor(); + + size_t workspaceSize() const { + // Need workspace for FP16/BF16 to accumulate in float + return (_in_dtype == INFINI_DTYPE_F16 || _in_dtype == INFINI_DTYPE_BF16) + ? 
sizeof(float) + : 0; + } + + static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + + infiniStatus_t calculate(void *workspace, size_t workspace_size, void *out, + const void *a, const void *b, void *stream) const; +}; + +} // namespace op::vdot::metax + +#endif // __VDOT_METAX_API_H__ diff --git a/src/infiniop/ops/vdot/metax/vdot_metax.maca b/src/infiniop/ops/vdot/metax/vdot_metax.maca new file mode 100644 index 000000000..c914d5856 --- /dev/null +++ b/src/infiniop/ops/vdot/metax/vdot_metax.maca @@ -0,0 +1,125 @@ +#include "../../../devices/metax/metax_common.h" +#include "vdot_metax.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../cuda/kernel.cuh" + +namespace op::vdot::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + + auto handle = reinterpret_cast(handle_); + auto in_dtype = a_desc->dtype(); + auto b_dtype = b_desc->dtype(); + auto out_dtype = out_desc->dtype(); + + // Inputs must be 1D vectors with same length + if (a_desc->ndim() != 1 || b_desc->ndim() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (a_desc->numel() != b_desc->numel()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // Input dtypes must match and be in supported set + CHECK_OR_RETURN(in_dtype == b_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(in_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + INFINI_DTYPE_BF16); + + // Output dtype equals input dtype + CHECK_OR_RETURN(out_dtype == in_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + size_t length = a_desc->numel(); + ptrdiff_t a_stride = a_desc->stride(0); + ptrdiff_t b_stride = b_desc->stride(0); + + *desc_ptr = new Descriptor(in_dtype, out_dtype, length, a_stride, b_stride, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *out, const void *a, const void *b, + void *stream) const { + + auto hc_stream = reinterpret_cast(stream); + constexpr unsigned int BLOCK_SIZE = 256; + + switch (_in_dtype) { + case INFINI_DTYPE_F32: { + float *out_f = reinterpret_cast(out); + const float *a_f = reinterpret_cast(a); + const float *b_f = reinterpret_cast(b); + op::vdot::cuda::vdotKernel + <<<1, BLOCK_SIZE, 0, hc_stream>>>(out_f, a_f, b_f, _length, _a_stride, + _b_stride); + break; + } + case INFINI_DTYPE_F64: { + double *out_d = reinterpret_cast(out); + const double *a_d = reinterpret_cast(a); + const double *b_d = reinterpret_cast(b); + op::vdot::cuda::vdotKernel + <<<1, BLOCK_SIZE, 0, hc_stream>>>(out_d, a_d, b_d, _length, _a_stride, + _b_stride); + break; + } + case INFINI_DTYPE_F16: { + // For FP16, accumulate in float, then cast back to half + if (workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + float *tmp_out = reinterpret_cast(workspace); + { + const __half *a_h = reinterpret_cast(a); + const __half *b_h = reinterpret_cast(b); + op::vdot::cuda::vdotKernel + <<<1, BLOCK_SIZE, 0, hc_stream>>>(tmp_out, a_h, b_h, _length, + _a_stride, _b_stride); + float result_f; + CHECK_METAX(hcMemcpyAsync(&result_f, tmp_out, sizeof(float), + hcMemcpyDeviceToHost, hc_stream)); + CHECK_METAX(hcStreamSynchronize(hc_stream)); + __half h_result = 
__float2half(result_f); + CHECK_METAX(hcMemcpyAsync(out, &h_result, sizeof(__half), + hcMemcpyHostToDevice, hc_stream)); + } + break; + } + case INFINI_DTYPE_BF16: { + // For BF16, accumulate in float, then cast back + if (workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + float *tmp_out = reinterpret_cast(workspace); + { + const __hpcc_bfloat16 *a_bf = reinterpret_cast(a); + const __hpcc_bfloat16 *b_bf = reinterpret_cast(b); + op::vdot::cuda::vdotKernel + <<<1, BLOCK_SIZE, 0, hc_stream>>>(tmp_out, a_bf, b_bf, _length, + _a_stride, _b_stride); + float result_f; + CHECK_METAX(hcMemcpyAsync(&result_f, tmp_out, sizeof(float), + hcMemcpyDeviceToHost, hc_stream)); + CHECK_METAX(hcStreamSynchronize(hc_stream)); + __hpcc_bfloat16 bf_result = __float2bfloat16(result_f); + CHECK_METAX(hcMemcpyAsync(out, &bf_result, sizeof(__hpcc_bfloat16), + hcMemcpyHostToDevice, hc_stream)); + } + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::vdot::metax + diff --git a/src/infiniop/ops/vdot/moore/vdot_moore.h b/src/infiniop/ops/vdot/moore/vdot_moore.h new file mode 100644 index 000000000..6d30b76f3 --- /dev/null +++ b/src/infiniop/ops/vdot/moore/vdot_moore.h @@ -0,0 +1,45 @@ +#ifndef __VDOT_MOORE_API_H__ +#define __VDOT_MOORE_API_H__ + +#include "../../../devices/moore/moore_handle.h" +#include "../../../operator.h" +#include "../../../tensor.h" + +namespace op::vdot::moore { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _in_dtype; + infiniDtype_t _out_dtype; + size_t _length; + ptrdiff_t _a_stride; + ptrdiff_t _b_stride; + +public: + Descriptor(infiniDtype_t in_dtype, infiniDtype_t out_dtype, size_t length, + ptrdiff_t a_stride, ptrdiff_t b_stride, infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, _in_dtype(in_dtype), + _out_dtype(out_dtype), _length(length), _a_stride(a_stride), + _b_stride(b_stride) {} + + ~Descriptor(); + + size_t workspaceSize() const { + // Need workspace for FP16/BF16 to accumulate in float + return (_in_dtype == INFINI_DTYPE_F16 || _in_dtype == INFINI_DTYPE_BF16) + ? 
sizeof(float) + : 0; + } + + static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + + infiniStatus_t calculate(void *workspace, size_t workspace_size, void *out, + const void *a, const void *b, void *stream) const; +}; + +} // namespace op::vdot::moore + +#endif // __VDOT_MOORE_API_H__ diff --git a/src/infiniop/ops/vdot/moore/vdot_moore.mu b/src/infiniop/ops/vdot/moore/vdot_moore.mu new file mode 100644 index 000000000..91b4dee5c --- /dev/null +++ b/src/infiniop/ops/vdot/moore/vdot_moore.mu @@ -0,0 +1,125 @@ +#include "../../../devices/moore/moore_common.h" +#include "vdot_moore.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "../cuda/kernel.cuh" + +namespace op::vdot::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + + auto handle = reinterpret_cast(handle_); + auto in_dtype = a_desc->dtype(); + auto b_dtype = b_desc->dtype(); + auto out_dtype = out_desc->dtype(); + + // Inputs must be 1D vectors with same length + if (a_desc->ndim() != 1 || b_desc->ndim() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (a_desc->numel() != b_desc->numel()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // Input dtypes must match and be in supported set + CHECK_OR_RETURN(in_dtype == b_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(in_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + INFINI_DTYPE_BF16); + + // Output dtype equals input dtype + CHECK_OR_RETURN(out_dtype == in_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + size_t length = a_desc->numel(); + ptrdiff_t a_stride = a_desc->stride(0); + ptrdiff_t b_stride = b_desc->stride(0); + + *desc_ptr = new Descriptor(in_dtype, out_dtype, length, a_stride, b_stride, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *out, const void *a, const void *b, + void *stream) const { + + auto musa_stream = reinterpret_cast(stream); + constexpr unsigned int BLOCK_SIZE = 256; + + switch (_in_dtype) { + case INFINI_DTYPE_F32: { + float *out_f = reinterpret_cast(out); + const float *a_f = reinterpret_cast(a); + const float *b_f = reinterpret_cast(b); + op::vdot::cuda::vdotKernel + <<<1, BLOCK_SIZE, 0, musa_stream>>>(out_f, a_f, b_f, _length, _a_stride, + _b_stride); + break; + } + case INFINI_DTYPE_F64: { + double *out_d = reinterpret_cast(out); + const double *a_d = reinterpret_cast(a); + const double *b_d = reinterpret_cast(b); + op::vdot::cuda::vdotKernel + <<<1, BLOCK_SIZE, 0, musa_stream>>>(out_d, a_d, b_d, _length, _a_stride, + _b_stride); + break; + } + case INFINI_DTYPE_F16: { + // For FP16, accumulate in float, then cast back to half + if (workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + float *tmp_out = reinterpret_cast(workspace); + { + const __half *a_h = reinterpret_cast(a); + const __half *b_h = reinterpret_cast(b); + op::vdot::cuda::vdotKernel + <<<1, BLOCK_SIZE, 0, musa_stream>>>(tmp_out, a_h, b_h, _length, + _a_stride, _b_stride); + float result_f; + CHECK_MOORE(musaMemcpyAsync(&result_f, tmp_out, sizeof(float), + musaMemcpyDeviceToHost, musa_stream)); + CHECK_MOORE(musaStreamSynchronize(musa_stream)); + __half h_result = 
__float2half(result_f); + CHECK_MOORE(musaMemcpyAsync(out, &h_result, sizeof(__half), + musaMemcpyHostToDevice, musa_stream)); + } + break; + } + case INFINI_DTYPE_BF16: { + // For BF16, accumulate in float, then cast back + if (workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + float *tmp_out = reinterpret_cast(workspace); + { + const __mt_bfloat16 *a_bf = reinterpret_cast(a); + const __mt_bfloat16 *b_bf = reinterpret_cast(b); + op::vdot::cuda::vdotKernel + <<<1, BLOCK_SIZE, 0, musa_stream>>>(tmp_out, a_bf, b_bf, _length, + _a_stride, _b_stride); + float result_f; + CHECK_MOORE(musaMemcpyAsync(&result_f, tmp_out, sizeof(float), + musaMemcpyDeviceToHost, musa_stream)); + CHECK_MOORE(musaStreamSynchronize(musa_stream)); + __mt_bfloat16 bf_result = __float2bfloat16(result_f); + CHECK_MOORE(musaMemcpyAsync(out, &bf_result, sizeof(__mt_bfloat16), + musaMemcpyHostToDevice, musa_stream)); + } + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::vdot::moore + diff --git a/src/infiniop/ops/vdot/nvidia/vdot_nvidia.cu b/src/infiniop/ops/vdot/nvidia/vdot_nvidia.cu new file mode 100644 index 000000000..1b9061bac --- /dev/null +++ b/src/infiniop/ops/vdot/nvidia/vdot_nvidia.cu @@ -0,0 +1,166 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "vdot_nvidia.cuh" + +namespace op::vdot::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + + auto handle = reinterpret_cast(handle_); + auto in_dtype = a_desc->dtype(); + auto b_dtype = b_desc->dtype(); + auto out_dtype = out_desc->dtype(); + + // Inputs must be 1D vectors with same length + if (a_desc->ndim() != 1 || b_desc->ndim() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (a_desc->numel() != b_desc->numel()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // Input dtypes must match and be in supported set + CHECK_OR_RETURN(in_dtype == b_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(in_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + INFINI_DTYPE_BF16); + + // Output dtype equals input dtype + CHECK_OR_RETURN(out_dtype == in_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + size_t length = a_desc->numel(); + ptrdiff_t a_stride = a_desc->stride(0); + ptrdiff_t b_stride = b_desc->stride(0); + + *desc_ptr = new Descriptor(in_dtype, out_dtype, length, a_stride, b_stride, + handle->internal(), handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *out, const void *a, const void *b, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + + // For FP16/BF16, use CUDA kernel instead of cuBLAS + if (_in_dtype == INFINI_DTYPE_F16 || _in_dtype == INFINI_DTYPE_BF16) { + switch (_in_dtype) { + case INFINI_DTYPE_F16: { + if (workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + float *tmp_out = reinterpret_cast(workspace); + const __half *a_h = reinterpret_cast(a); + const __half *b_h = reinterpret_cast(b); + constexpr unsigned int BLOCK_SIZE = 256; + op::vdot::cuda::vdotKernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>(tmp_out, a_h, b_h, _length, + _a_stride, _b_stride); + 
CHECK_CUDA(cudaGetLastError()); + float result_f; + CHECK_CUDA(cudaMemcpyAsync(&result_f, tmp_out, sizeof(float), + cudaMemcpyDeviceToHost, cuda_stream)); + CHECK_CUDA(cudaStreamSynchronize(cuda_stream)); + __half h_result = __float2half(result_f); + CHECK_CUDA(cudaMemcpyAsync(out, &h_result, sizeof(__half), + cudaMemcpyHostToDevice, cuda_stream)); + break; + } + case INFINI_DTYPE_BF16: { + if (workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + float *tmp_out = reinterpret_cast(workspace); + const __nv_bfloat16 *a_bf = reinterpret_cast(a); + const __nv_bfloat16 *b_bf = reinterpret_cast(b); + constexpr unsigned int BLOCK_SIZE = 256; + op::vdot::cuda::vdotKernel + <<<1, BLOCK_SIZE, 0, cuda_stream>>>(tmp_out, a_bf, b_bf, _length, + _a_stride, _b_stride); + CHECK_CUDA(cudaGetLastError()); + float result_f; + CHECK_CUDA(cudaMemcpyAsync(&result_f, tmp_out, sizeof(float), + cudaMemcpyDeviceToHost, cuda_stream)); + CHECK_CUDA(cudaStreamSynchronize(cuda_stream)); + __nv_bfloat16 bf_result = __float2bfloat16(result_f); + CHECK_CUDA(cudaMemcpyAsync(out, &bf_result, sizeof(__nv_bfloat16), + cudaMemcpyHostToDevice, cuda_stream)); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + // Use cuBLAS for F32/F64 + CHECK_STATUS(_internal->useCublas(cuda_stream, [&](cublasHandle_t handle) { + switch (_in_dtype) { + case INFINI_DTYPE_F32: { + if (_a_stride == 1 && _b_stride == 1) { + // Contiguous case: use cublasSdot + float result; + CHECK_CUBLAS(cublasSdot(handle, static_cast(_length), + reinterpret_cast(a), + static_cast(_a_stride), + reinterpret_cast(b), + static_cast(_b_stride), &result)); + CHECK_CUDA(cudaMemcpyAsync(out, &result, sizeof(float), + cudaMemcpyHostToDevice, cuda_stream)); + } else { + // Strided case: use cublasDotEx + float result; + CHECK_CUBLAS(cublasDotEx(handle, static_cast(_length), + reinterpret_cast(a), CUDA_R_32F, + static_cast(_a_stride), + reinterpret_cast(b), CUDA_R_32F, + static_cast(_b_stride), &result, + CUDA_R_32F, CUDA_R_32F)); + CHECK_CUDA(cudaMemcpyAsync(out, &result, sizeof(float), + cudaMemcpyHostToDevice, cuda_stream)); + } + break; + } + case INFINI_DTYPE_F64: { + if (_a_stride == 1 && _b_stride == 1) { + // Contiguous case: use cublasDdot + double result; + CHECK_CUBLAS(cublasDdot(handle, static_cast(_length), + reinterpret_cast(a), + static_cast(_a_stride), + reinterpret_cast(b), + static_cast(_b_stride), &result)); + CHECK_CUDA(cudaMemcpyAsync(out, &result, sizeof(double), + cudaMemcpyHostToDevice, cuda_stream)); + } else { + // Strided case: use cublasDotEx + double result; + CHECK_CUBLAS(cublasDotEx(handle, static_cast(_length), + reinterpret_cast(a), + CUDA_R_64F, static_cast(_a_stride), + reinterpret_cast(b), + CUDA_R_64F, static_cast(_b_stride), + &result, CUDA_R_64F, CUDA_R_64F)); + CHECK_CUDA(cudaMemcpyAsync(out, &result, sizeof(double), + cudaMemcpyHostToDevice, cuda_stream)); + } + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::vdot::nvidia diff --git a/src/infiniop/ops/vdot/nvidia/vdot_nvidia.cuh b/src/infiniop/ops/vdot/nvidia/vdot_nvidia.cuh new file mode 100644 index 000000000..d4d082705 --- /dev/null +++ b/src/infiniop/ops/vdot/nvidia/vdot_nvidia.cuh @@ -0,0 +1,47 @@ +#ifndef __VDOT_NVIDIA_CUH__ +#define __VDOT_NVIDIA_CUH__ + +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../operator.h" +#include "../../../tensor.h" + 
+namespace op::vdot::nvidia { + +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _in_dtype; + infiniDtype_t _out_dtype; + size_t _length; + ptrdiff_t _a_stride; + ptrdiff_t _b_stride; + std::shared_ptr _internal; + +public: + Descriptor(infiniDtype_t in_dtype, infiniDtype_t out_dtype, size_t length, + ptrdiff_t a_stride, ptrdiff_t b_stride, + std::shared_ptr internal, + infiniDevice_t device_type, int device_id) + : InfiniopDescriptor{device_type, device_id}, _in_dtype(in_dtype), + _out_dtype(out_dtype), _length(length), _a_stride(a_stride), + _b_stride(b_stride), _internal(internal) {} + + ~Descriptor(); + + size_t workspaceSize() const { + // Need workspace for FP16/BF16 to accumulate in float + return (_in_dtype == INFINI_DTYPE_F16 || _in_dtype == INFINI_DTYPE_BF16) + ? sizeof(float) + : 0; + } + + static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc); + + infiniStatus_t calculate(void *workspace, size_t workspace_size, void *out, + const void *a, const void *b, void *stream) const; +}; + +} // namespace op::vdot::nvidia + +#endif // __VDOT_NVIDIA_CUH__ diff --git a/src/infiniop/ops/vdot/operator.cc b/src/infiniop/ops/vdot/operator.cc new file mode 100644 index 000000000..ca25c18a4 --- /dev/null +++ b/src/infiniop/ops/vdot/operator.cc @@ -0,0 +1,165 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/vdot.h" + +#ifdef ENABLE_CPU_API +#include "cpu/vdot_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/vdot_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/vdot_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/vdot_moore.h" +#endif + +__C infiniStatus_t infiniopCreateVdotDescriptor( + infiniopHandle_t handle, + infiniopVdotDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::vdot::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, \ + a_desc, b_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetVdotWorkspaceSize( + infiniopVdotDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopVdot( + infiniopVdotDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, out, a, b, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyVdotDescriptor(infiniopVdotDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/where/cpu/where_cpu.cc b/src/infiniop/ops/where/cpu/where_cpu.cc new file mode 100644 index 000000000..3bf58e55f --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.cc @@ -0,0 +1,90 @@ +#include "where_cpu.h" +#include "../../../devices/cpu/common_cpu.h" + +namespace op::where::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + // Expected inputs: 0: cond, 1: x, 2: y + if (input_desc_vec.size() != 3) { + return INFINI_STATUS_BAD_PARAM; + } + + auto out_dtype = out_desc->dtype(); + + const auto &cond_desc = input_desc_vec.at(0); + const auto &x_desc = input_desc_vec.at(1); + const auto &y_desc = input_desc_vec.at(2); + + const auto &out_shape = out_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + const auto &x_shape = x_desc->shape(); + const auto &y_shape = y_desc->shape(); + + // cond must be bool + CHECK_DTYPE(cond_desc->dtype(), INFINI_DTYPE_BOOL); + + // x, y and output must share the same dtype + auto x_dtype = x_desc->dtype(); + auto y_dtype = y_desc->dtype(); + CHECK_OR_RETURN(x_dtype == y_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(x_dtype == out_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE); + + // Supported value dtypes (extend if needed) + CHECK_DTYPE( + out_dtype, + INFINI_DTYPE_F16, + INFINI_DTYPE_F32, + INFINI_DTYPE_F64, + INFINI_DTYPE_BF16, + INFINI_DTYPE_I32, + INFINI_DTYPE_I64, + INFINI_DTYPE_U8); + + // For now, require all shapes to match (no broadcasting) + CHECK_SAME_SHAPE(out_shape, cond_shape, x_shape, y_shape); + + // Create CPU elementwise descriptor with output dtype + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, out_dtype, out_desc, input_desc_vec); + + 
return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::where::cpu diff --git a/src/infiniop/ops/where/cpu/where_cpu.h b/src/infiniop/ops/where/cpu/where_cpu.h new file mode 100644 index 000000000..c706e1edb --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.h @@ -0,0 +1,31 @@ +#ifndef __WHERE_CPU_H__ +#define __WHERE_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +// Define Descriptor: op::where::cpu::Descriptor +ELEMENTWISE_DESCRIPTOR(where, cpu) + +namespace op::where::cpu { + +struct WhereOp { +public: + // Three inputs: cond, x, y + static constexpr size_t num_inputs = 3; + + // Homogeneous version: cond is already bool, x/y have same type as output + template + T operator()(const bool &cond, const T &x, const T &y) const { + return cond ? x : y; + } + + // Heterogeneous version: support non-bool cond or explicit Tout + template + Tout operator()(const Tcond &cond, const Tx &x, const Ty &y) const { + return static_cast(cond) ? 
static_cast(x) : static_cast(y); + } +}; + +} // namespace op::where::cpu + +#endif // __WHERE_CPU_H__ diff --git a/src/infiniop/ops/where/cpu/where_indices_cpu.cc b/src/infiniop/ops/where/cpu/where_indices_cpu.cc new file mode 100644 index 000000000..864dab738 --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_indices_cpu.cc @@ -0,0 +1,86 @@ +#include "where_indices_cpu.h" +#include +#include +#include + +namespace op::where::cpu { + +infiniStatus_t IndicesDescriptor::create( + infiniopHandle_t handle_, + IndicesDescriptor **desc_ptr, + infiniopTensorDescriptor_t cond_desc) { + + // 检查条件必须是 bool 类型 + if (cond_desc->dtype() != INFINI_DTYPE_BOOL) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + size_t numel = cond_desc->numel(); + int ndim = static_cast(cond_desc->ndim()); + + std::vector shape(ndim); + std::vector strides(ndim); + for (int i = 0; i < ndim; ++i) { + shape[i] = cond_desc->shape()[i]; + strides[i] = cond_desc->stride(i); + } + + *desc_ptr = new IndicesDescriptor( + numel, ndim, shape.data(), strides.data(), + INFINI_DEVICE_CPU, 0); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t IndicesDescriptor::calculate( + void *workspace, + size_t workspace_size, + void **outputs, + const void *cond, + void *stream, + size_t *num_true) const { + + const bool *cond_ptr = reinterpret_cast(cond); + int64_t **output_ptrs = new int64_t *[_ndim]; + for (int i = 0; i < _ndim; ++i) { + output_ptrs[i] = reinterpret_cast(outputs[i]); + } + + // 使用递归函数遍历所有多维索引,正确处理 strided tensor + std::vector indices(_ndim, 0); + size_t output_idx = 0; + + // 递归函数来遍历所有多维索引 + std::function traverse = [&](int dim) { + if (dim == _ndim) { + // 计算内存偏移(考虑 stride) + size_t offset = 0; + for (int i = 0; i < _ndim; ++i) { + offset += indices[i] * static_cast(_strides[i]); + } + + // 检查条件是否为 True + if (cond_ptr[offset]) { + // 记录多维索引 + for (int i = 0; i < _ndim; ++i) { + output_ptrs[i][output_idx] = static_cast(indices[i]); + } + output_idx++; + } + } else { + // 递归遍历当前维度的所有可能值 + for (size_t i = 0; i < _shape[dim]; ++i) { + indices[dim] = i; + traverse(dim + 1); + } + } + }; + + traverse(0); + + *num_true = output_idx; + delete[] output_ptrs; + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::where::cpu diff --git a/src/infiniop/ops/where/cpu/where_indices_cpu.h b/src/infiniop/ops/where/cpu/where_indices_cpu.h new file mode 100644 index 000000000..3712533f7 --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_indices_cpu.h @@ -0,0 +1,57 @@ +#ifndef __WHERE_INDICES_CPU_H__ +#define __WHERE_INDICES_CPU_H__ + +#include "../../../operator.h" +#include "../../../tensor.h" + +namespace op::where::cpu { + +class IndicesDescriptor final : public InfiniopDescriptor { + size_t _numel; + int _ndim; + size_t *_shape; + ptrdiff_t *_strides; + +public: + IndicesDescriptor( + size_t numel, + int ndim, + const size_t *shape, + const ptrdiff_t *strides, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _numel(numel), + _ndim(ndim) { + _shape = new size_t[ndim]; + _strides = new ptrdiff_t[ndim]; + for (int i = 0; i < ndim; ++i) { + _shape[i] = shape[i]; + _strides[i] = strides[i]; + } + } + + ~IndicesDescriptor() { + delete[] _shape; + delete[] _strides; + } + + size_t workspaceSize() const { return 0; } + + static infiniStatus_t create( + infiniopHandle_t handle, + IndicesDescriptor **desc_ptr, + infiniopTensorDescriptor_t cond_desc); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void **outputs, + const void *cond, + void *stream, + size_t 
*num_true) const; +}; + +} // namespace op::where::cpu + +#endif // __WHERE_INDICES_CPU_H__ diff --git a/src/infiniop/ops/where/cuda/where_indices_kernel.cuh b/src/infiniop/ops/where/cuda/where_indices_kernel.cuh new file mode 100644 index 000000000..834aeab80 --- /dev/null +++ b/src/infiniop/ops/where/cuda/where_indices_kernel.cuh @@ -0,0 +1,74 @@ +#ifndef __WHERE_INDICES_KERNEL_CUH__ +#define __WHERE_INDICES_KERNEL_CUH__ + +#include +// 不在这里包含 cuda_runtime.h,让各个平台的头文件提供必要的定义 + +namespace op::where::cuda { + +// 阶段1: 标记 True 元素 (将 bool 转换为 int64_t: 1 或 0) +// 支持 strided tensor:使用线性索引转换为多维索引,然后使用 stride 计算内存偏移 +template +__global__ void markTrueElements( + Tidx *flags, // 输出:每个元素是否为 True (1/0) + const bool *cond, // 输入:条件张量 + const size_t *shape, // 输入:张量形状 + const ptrdiff_t *strides, // 输入:张量 stride + size_t numel, + int ndim) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < numel) { + // 将线性索引转换为多维索引 + size_t remaining = idx; + size_t offset = 0; + for (int dim = ndim - 1; dim >= 0; --dim) { + size_t dim_idx = remaining % shape[dim]; + offset += dim_idx * static_cast(strides[dim]); + remaining /= shape[dim]; + } + flags[idx] = cond[offset] ? static_cast(1) : static_cast(0); + } +} + +// 阶段2: 收集每个维度的索引 +// 对于 N 维张量,需要为每个维度收集索引 +template +__global__ void collectIndices( + Tidx **outputs, // 输出:NDIM 个索引张量的指针数组(在设备上) + const Tidx *flags, // 输入:标记数组(前缀和后) + const bool *cond, // 输入:条件张量(未使用,可为 nullptr) + const size_t *shape, // 输入:张量形状(在设备上) + const ptrdiff_t *strides, // 输入:张量 stride(在设备上,未在此 kernel 中使用) + size_t numel, + int ndim) { + (void)cond; + (void)strides; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < numel) { + // 通过 flags 判断该线性索引是否对应 True 元素: + // 对于 True 元素,flags[idx] 会比前一个位置的值大 1 + Tidx curr = flags[idx]; + Tidx prev = (idx == 0) ? 
static_cast(0) : flags[idx - 1]; + if (curr == prev) { + return; + } + + // 计算当前元素在输出中的位置(使用前缀和结果) + // flags[idx] 是 inclusive sum,所以 flags[idx] - 1 是当前元素在输出中的位置 + // 对于第一个元素(idx=0),如果它是 True,flags[0] = 1,所以 output_idx = 0 + Tidx output_idx = curr - 1; + + // 线性索引 -> 多维索引 + size_t remaining = idx; + for (int dim = ndim - 1; dim >= 0; --dim) { + size_t dim_idx = remaining % shape[dim]; + outputs[dim][output_idx] = static_cast(dim_idx); + remaining /= shape[dim]; + } + } +} + +} // namespace op::where::cuda + +#endif // __WHERE_INDICES_KERNEL_CUH__ diff --git a/src/infiniop/ops/where/metax/where_indices_metax.h b/src/infiniop/ops/where/metax/where_indices_metax.h new file mode 100644 index 000000000..42c606153 --- /dev/null +++ b/src/infiniop/ops/where/metax/where_indices_metax.h @@ -0,0 +1,58 @@ +#ifndef __WHERE_INDICES_METAX_H__ +#define __WHERE_INDICES_METAX_H__ + +#include "../../../devices/metax/metax_handle.h" +#include "../../../operator.h" +#include "../../../tensor.h" + +namespace op::where::metax { + +class IndicesDescriptor final : public InfiniopDescriptor { + size_t _numel; + int _ndim; + size_t *_shape; + ptrdiff_t *_strides; + +public: + IndicesDescriptor( + size_t numel, + int ndim, + const size_t *shape, + const ptrdiff_t *strides, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _numel(numel), + _ndim(ndim) { + _shape = new size_t[ndim]; + _strides = new ptrdiff_t[ndim]; + for (int i = 0; i < ndim; ++i) { + _shape[i] = shape[i]; + _strides[i] = strides[i]; + } + } + + ~IndicesDescriptor() { + delete[] _shape; + delete[] _strides; + } + + size_t workspaceSize() const; + + static infiniStatus_t create( + infiniopHandle_t handle, + IndicesDescriptor **desc_ptr, + infiniopTensorDescriptor_t cond_desc); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void **outputs, + const void *cond, + void *stream, + size_t *num_true) const; +}; + +} // namespace op::where::metax + +#endif // __WHERE_INDICES_METAX_H__ diff --git a/src/infiniop/ops/where/metax/where_indices_metax.maca b/src/infiniop/ops/where/metax/where_indices_metax.maca new file mode 100644 index 000000000..01a43b10f --- /dev/null +++ b/src/infiniop/ops/where/metax/where_indices_metax.maca @@ -0,0 +1,146 @@ +#include "../../../devices/metax/metax_common.h" +#include "where_indices_metax.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../cuda/where_indices_kernel.cuh" +#include + +namespace op::where::metax { + +// 封装 CUB InclusiveSum +template +static hcError_t inclusiveSum(void *workspace_ptr, size_t &workspace_len, + T *data, int n, hcStream_t stream) { + return cub::DeviceScan::InclusiveSum(workspace_ptr, workspace_len, data, data, + n, stream); +} + +// 地址对齐到 256 +static constexpr size_t align256(size_t size) { return (size + 255) & (~255); } + +size_t IndicesDescriptor::workspaceSize() const { + const auto n = static_cast(_numel); + + // flags 数组大小 + size_t flags_size = align256(sizeof(int64_t) * _numel); + + // CUB scan workspace + size_t scan_workspace = 0; + int64_t *dummy = nullptr; + CHECK_METAX(inclusiveSum(nullptr, scan_workspace, dummy, n, nullptr)); + + return flags_size + scan_workspace; +} + +infiniStatus_t IndicesDescriptor::create(infiniopHandle_t handle_, + IndicesDescriptor **desc_ptr, + infiniopTensorDescriptor_t cond_desc) { + + auto handle = reinterpret_cast(handle_); + + // 检查条件必须是 bool 类型 + if (cond_desc->dtype() != INFINI_DTYPE_BOOL) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + size_t numel = 
cond_desc->numel(); + int ndim = static_cast(cond_desc->ndim()); + + std::vector shape(ndim); + std::vector strides(ndim); + for (int i = 0; i < ndim; ++i) { + shape[i] = cond_desc->shape()[i]; + strides[i] = cond_desc->stride(i); + } + + *desc_ptr = new IndicesDescriptor(numel, ndim, shape.data(), strides.data(), + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t IndicesDescriptor::calculate(void *workspace, + size_t workspace_size, + void **outputs, const void *cond, + void *stream, + size_t *num_true) const { + + auto hc_stream = reinterpret_cast(stream); + const bool *cond_ptr = reinterpret_cast(cond); + + // 分配 workspace 中的内存 + int64_t *flags = reinterpret_cast(workspace); + size_t flags_size = align256(sizeof(int64_t) * _numel); + void *scan_workspace = static_cast(workspace) + flags_size; + size_t scan_workspace_size = workspace_size - flags_size; + + // 复制 shape 和 strides 到设备(用于 markTrueElements) + size_t *d_shape; + ptrdiff_t *d_strides; + void *d_shape_void; + void *d_strides_void; + CHECK_METAX(hcMallocAsync(&d_shape_void, sizeof(size_t) * _ndim, hc_stream)); + CHECK_METAX( + hcMallocAsync(&d_strides_void, sizeof(ptrdiff_t) * _ndim, hc_stream)); + d_shape = reinterpret_cast(d_shape_void); + d_strides = reinterpret_cast(d_strides_void); + CHECK_METAX(hcMemcpyAsync(d_shape, _shape, sizeof(size_t) * _ndim, + hcMemcpyHostToDevice, hc_stream)); + CHECK_METAX(hcMemcpyAsync(d_strides, _strides, sizeof(ptrdiff_t) * _ndim, + hcMemcpyHostToDevice, hc_stream)); + + // 阶段1: 标记 True 元素 + constexpr int BLOCK_SIZE = 256; + int grid_size = static_cast((_numel + BLOCK_SIZE - 1) / BLOCK_SIZE); + op::where::cuda::markTrueElements + <<>>(flags, cond_ptr, d_shape, + d_strides, _numel, _ndim); + + // 阶段2: 计算前缀和(inclusive scan) + size_t temp_workspace_size = scan_workspace_size; + CHECK_METAX(inclusiveSum(scan_workspace, temp_workspace_size, flags, + static_cast(_numel), hc_stream)); + + // 获取 True 元素的总数 + int64_t num_true_val = 0; + CHECK_METAX(hcMemcpyAsync(&num_true_val, flags + _numel - 1, sizeof(int64_t), + hcMemcpyDeviceToHost, hc_stream)); + CHECK_METAX(hcStreamSynchronize(hc_stream)); + *num_true = static_cast(num_true_val); + + // 阶段3: 收集每个维度的索引 + int64_t **output_ptrs = new int64_t *[_ndim]; + for (int i = 0; i < _ndim; ++i) { + output_ptrs[i] = reinterpret_cast(outputs[i]); + } + + // d_shape / d_strides 已经在阶段1中分配和复制了,这里直接使用 + + // 复制 output_ptrs 到设备 + int64_t **d_output_ptrs; + void *d_output_ptrs_void; + CHECK_METAX( + hcMallocAsync(&d_output_ptrs_void, sizeof(int64_t *) * _ndim, hc_stream)); + d_output_ptrs = reinterpret_cast(d_output_ptrs_void); + CHECK_METAX(hcMemcpyAsync(d_output_ptrs, output_ptrs, + sizeof(int64_t *) * _ndim, hcMemcpyHostToDevice, + hc_stream)); + + // 启动收集索引的 kernel + op::where::cuda::collectIndices + <<>>( + d_output_ptrs, flags, cond_ptr, d_shape, d_strides, _numel, _ndim); + + // 同步流,确保所有异步操作(包括 collectIndices kernel)完成 + CHECK_METAX(hcStreamSynchronize(hc_stream)); + + // 清理 + CHECK_METAX(hcFreeAsync(d_shape_void, hc_stream)); + CHECK_METAX(hcFreeAsync(d_strides_void, hc_stream)); + CHECK_METAX(hcFreeAsync(d_output_ptrs_void, hc_stream)); + delete[] output_ptrs; + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::where::metax + diff --git a/src/infiniop/ops/where/moore/where_indices_moore.h b/src/infiniop/ops/where/moore/where_indices_moore.h new file mode 100644 index 000000000..ccb126f31 --- /dev/null +++ b/src/infiniop/ops/where/moore/where_indices_moore.h @@ -0,0 +1,58 @@ +#ifndef __WHERE_INDICES_MOORE_H__ 
+#define __WHERE_INDICES_MOORE_H__ + +#include "../../../devices/moore/moore_handle.h" +#include "../../../operator.h" +#include "../../../tensor.h" + +namespace op::where::moore { + +class IndicesDescriptor final : public InfiniopDescriptor { + size_t _numel; + int _ndim; + size_t *_shape; + ptrdiff_t *_strides; + +public: + IndicesDescriptor( + size_t numel, + int ndim, + const size_t *shape, + const ptrdiff_t *strides, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _numel(numel), + _ndim(ndim) { + _shape = new size_t[ndim]; + _strides = new ptrdiff_t[ndim]; + for (int i = 0; i < ndim; ++i) { + _shape[i] = shape[i]; + _strides[i] = strides[i]; + } + } + + ~IndicesDescriptor() { + delete[] _shape; + delete[] _strides; + } + + size_t workspaceSize() const; + + static infiniStatus_t create( + infiniopHandle_t handle, + IndicesDescriptor **desc_ptr, + infiniopTensorDescriptor_t cond_desc); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void **outputs, + const void *cond, + void *stream, + size_t *num_true) const; +}; + +} // namespace op::where::moore + +#endif // __WHERE_INDICES_MOORE_H__ diff --git a/src/infiniop/ops/where/moore/where_indices_moore.mu b/src/infiniop/ops/where/moore/where_indices_moore.mu new file mode 100644 index 000000000..ab7d5b59d --- /dev/null +++ b/src/infiniop/ops/where/moore/where_indices_moore.mu @@ -0,0 +1,154 @@ +#include "../../../devices/moore/moore_common.h" +#include "where_indices_moore.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "../cuda/where_indices_kernel.cuh" +#include + +namespace op::where::moore { + +// 封装 CUB InclusiveSum +template +static musaError_t inclusiveSum( + void *workspace_ptr, size_t &workspace_len, + T *data, int n, + musaStream_t stream) { + return cub::DeviceScan::InclusiveSum( + workspace_ptr, workspace_len, + data, data, n, + stream); +} + +// 地址对齐到 256 +static constexpr size_t align256(size_t size) { + return (size + 255) & (~255); +} + +size_t IndicesDescriptor::workspaceSize() const { + const auto n = static_cast(_numel); + + // flags 数组大小 + size_t flags_size = align256(sizeof(int64_t) * _numel); + + // CUB scan workspace + size_t scan_workspace = 0; + int64_t *dummy = nullptr; + CHECK_MOORE(inclusiveSum( + nullptr, scan_workspace, + dummy, n, + nullptr)); + + return flags_size + scan_workspace; +} + +infiniStatus_t IndicesDescriptor::create( + infiniopHandle_t handle_, + IndicesDescriptor **desc_ptr, + infiniopTensorDescriptor_t cond_desc) { + + auto handle = reinterpret_cast(handle_); + + // 检查条件必须是 bool 类型 + if (cond_desc->dtype() != INFINI_DTYPE_BOOL) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + size_t numel = cond_desc->numel(); + int ndim = static_cast(cond_desc->ndim()); + + std::vector shape(ndim); + std::vector strides(ndim); + for (int i = 0; i < ndim; ++i) { + shape[i] = cond_desc->shape()[i]; + strides[i] = cond_desc->stride(i); + } + + *desc_ptr = new IndicesDescriptor( + numel, ndim, shape.data(), strides.data(), + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t IndicesDescriptor::calculate( + void *workspace, + size_t workspace_size, + void **outputs, + const void *cond, + void *stream, + size_t *num_true) const { + + auto musa_stream = reinterpret_cast(stream); + const bool *cond_ptr = reinterpret_cast(cond); + + // 分配 workspace 中的内存 + int64_t *flags = reinterpret_cast(workspace); + size_t flags_size = align256(sizeof(int64_t) * _numel); + void *scan_workspace = 
static_cast(workspace) + flags_size; + size_t scan_workspace_size = workspace_size - flags_size; + + // 复制 shape 和 strides 到设备(用于 markTrueElements) + size_t *d_shape; + ptrdiff_t *d_strides; + CHECK_MOORE(musaMalloc(&d_shape, sizeof(size_t) * _ndim)); + CHECK_MOORE(musaMalloc(&d_strides, sizeof(ptrdiff_t) * _ndim)); + CHECK_MOORE(musaMemcpyAsync( + d_shape, _shape, sizeof(size_t) * _ndim, + musaMemcpyHostToDevice, musa_stream)); + CHECK_MOORE(musaMemcpyAsync( + d_strides, _strides, sizeof(ptrdiff_t) * _ndim, + musaMemcpyHostToDevice, musa_stream)); + + // 阶段1: 标记 True 元素 + constexpr int BLOCK_SIZE = 256; + int grid_size = static_cast((_numel + BLOCK_SIZE - 1) / BLOCK_SIZE); + op::where::cuda::markTrueElements<<>>( + flags, cond_ptr, d_shape, d_strides, _numel, _ndim); + + // 阶段2: 计算前缀和(inclusive scan) + size_t temp_workspace_size = scan_workspace_size; + CHECK_MOORE(inclusiveSum( + scan_workspace, temp_workspace_size, + flags, static_cast(_numel), + musa_stream)); + + // 获取 True 元素的总数 + int64_t num_true_val = 0; + CHECK_MOORE(musaMemcpyAsync( + &num_true_val, flags + _numel - 1, sizeof(int64_t), + musaMemcpyDeviceToHost, musa_stream)); + CHECK_MOORE(musaStreamSynchronize(musa_stream)); + *num_true = static_cast(num_true_val); + + // 阶段3: 收集每个维度的索引 + int64_t **output_ptrs = new int64_t *[_ndim]; + for (int i = 0; i < _ndim; ++i) { + output_ptrs[i] = reinterpret_cast(outputs[i]); + } + + // d_shape 已经在阶段1中分配和复制了,这里直接使用 + + // 复制 output_ptrs 到设备 + int64_t **d_output_ptrs; + CHECK_MOORE(musaMalloc(&d_output_ptrs, sizeof(int64_t *) * _ndim)); + CHECK_MOORE(musaMemcpyAsync( + d_output_ptrs, output_ptrs, sizeof(int64_t *) * _ndim, + musaMemcpyHostToDevice, musa_stream)); + + // 启动收集索引的 kernel + op::where::cuda::collectIndices<<>>( + d_output_ptrs, flags, cond_ptr, d_shape, d_strides, _numel, _ndim); + + // 同步流,确保所有异步操作(包括 collectIndices kernel)完成 + CHECK_MOORE(musaStreamSynchronize(musa_stream)); + + // 清理 + CHECK_MOORE(musaFree(d_shape)); + CHECK_MOORE(musaFree(d_strides)); + CHECK_MOORE(musaFree(d_output_ptrs)); + delete[] output_ptrs; + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::where::moore + diff --git a/src/infiniop/ops/where/nvidia/where_indices_nvidia.cu b/src/infiniop/ops/where/nvidia/where_indices_nvidia.cu new file mode 100644 index 000000000..ea4aeee09 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_indices_nvidia.cu @@ -0,0 +1,159 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/where_indices_kernel.cuh" +#include "where_indices_nvidia.h" +#include +#include +#include + +namespace op::where::nvidia { + +// 封装 CUB InclusiveSum +template +static cudaError inclusiveSum( + void *workspace_ptr, size_t &workspace_len, + T *data, int n, + cudaStream_t stream) { + return cub::DeviceScan::InclusiveSum( + workspace_ptr, workspace_len, + data, data, n, + stream); +} + +// 地址对齐到 256 +static constexpr size_t align256(size_t size) { + return (size + 255) & (~255); +} + +size_t IndicesDescriptor::workspaceSize() const { + const auto n = static_cast(_numel); + + // flags 数组大小 + size_t flags_size = align256(sizeof(int64_t) * _numel); + + // CUB scan workspace + size_t scan_workspace = 0; + int64_t *dummy = nullptr; + CHECK_CUDA(inclusiveSum( + nullptr, scan_workspace, + dummy, n, + nullptr)); + + return flags_size + scan_workspace; +} + +infiniStatus_t IndicesDescriptor::create( + infiniopHandle_t handle_, + IndicesDescriptor **desc_ptr, + infiniopTensorDescriptor_t cond_desc) { + + auto 
handle = reinterpret_cast(handle_); + + // 检查条件必须是 bool 类型 + if (cond_desc->dtype() != INFINI_DTYPE_BOOL) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + size_t numel = cond_desc->numel(); + int ndim = static_cast(cond_desc->ndim()); + + std::vector shape(ndim); + std::vector strides(ndim); + for (int i = 0; i < ndim; ++i) { + shape[i] = cond_desc->shape()[i]; + strides[i] = cond_desc->stride(i); + } + + *desc_ptr = new IndicesDescriptor( + numel, ndim, shape.data(), strides.data(), + handle->internal(), + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t IndicesDescriptor::calculate( + void *workspace, + size_t workspace_size, + void **outputs, // outputs[i] 是第 i 维的索引张量 + const void *cond, + void *stream, + size_t *num_true) const { + + auto cuda_stream = reinterpret_cast(stream); + const bool *cond_ptr = reinterpret_cast(cond); + + // 分配 workspace 中的内存 + int64_t *flags = reinterpret_cast(workspace); + size_t flags_size = align256(sizeof(int64_t) * _numel); + void *scan_workspace = static_cast(workspace) + flags_size; + size_t scan_workspace_size = workspace_size - flags_size; + + // 复制 shape 和 strides 到设备(用于 markTrueElements / collectIndices) + size_t *d_shape; + ptrdiff_t *d_strides; + CHECK_CUDA(cudaMallocAsync(&d_shape, sizeof(size_t) * _ndim, cuda_stream)); + CHECK_CUDA(cudaMallocAsync(&d_strides, sizeof(ptrdiff_t) * _ndim, cuda_stream)); + CHECK_CUDA(cudaMemcpyAsync( + d_shape, _shape, sizeof(size_t) * _ndim, + cudaMemcpyHostToDevice, cuda_stream)); + CHECK_CUDA(cudaMemcpyAsync( + d_strides, _strides, sizeof(ptrdiff_t) * _ndim, + cudaMemcpyHostToDevice, cuda_stream)); + + // 阶段1: 标记 True 元素 + constexpr int BLOCK_SIZE = 256; + int grid_size = static_cast((_numel + BLOCK_SIZE - 1) / BLOCK_SIZE); + op::where::cuda::markTrueElements<<>>( + flags, cond_ptr, d_shape, d_strides, _numel, _ndim); + CHECK_CUDA(cudaGetLastError()); + + // 阶段2: 计算前缀和(inclusive scan) + size_t temp_workspace_size = scan_workspace_size; + CHECK_CUDA(inclusiveSum( + scan_workspace, temp_workspace_size, + flags, static_cast(_numel), + cuda_stream)); + + // 获取 True 元素的总数(最后一个元素的前缀和值) + int64_t num_true_val = 0; + CHECK_CUDA(cudaMemcpyAsync( + &num_true_val, flags + _numel - 1, sizeof(int64_t), + cudaMemcpyDeviceToHost, cuda_stream)); + CHECK_CUDA(cudaStreamSynchronize(cuda_stream)); + *num_true = static_cast(num_true_val); + + // 阶段3: 收集每个维度的索引 + // 将 outputs 转换为 int64_t* 数组并复制到设备 + int64_t **output_ptrs = new int64_t *[_ndim]; + for (int i = 0; i < _ndim; ++i) { + output_ptrs[i] = reinterpret_cast(outputs[i]); + } + + // d_shape / d_strides 已经在阶段1中分配和复制了,这里直接使用 + + // 复制 output_ptrs 到设备 + int64_t **d_output_ptrs; + CHECK_CUDA(cudaMallocAsync(&d_output_ptrs, sizeof(int64_t *) * _ndim, cuda_stream)); + CHECK_CUDA(cudaMemcpyAsync( + d_output_ptrs, output_ptrs, sizeof(int64_t *) * _ndim, + cudaMemcpyHostToDevice, cuda_stream)); + + // 启动收集索引的 kernel + op::where::cuda::collectIndices<<>>( + d_output_ptrs, flags, cond_ptr, d_shape, d_strides, _numel, _ndim); + CHECK_CUDA(cudaGetLastError()); + + // 同步流,确保所有异步操作(包括 collectIndices kernel)完成 + CHECK_CUDA(cudaStreamSynchronize(cuda_stream)); + + // 清理 + CHECK_CUDA(cudaFreeAsync(d_shape, cuda_stream)); + CHECK_CUDA(cudaFreeAsync(d_strides, cuda_stream)); + CHECK_CUDA(cudaFreeAsync(d_output_ptrs, cuda_stream)); + delete[] output_ptrs; + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::where::nvidia diff --git a/src/infiniop/ops/where/nvidia/where_indices_nvidia.h b/src/infiniop/ops/where/nvidia/where_indices_nvidia.h new file 
mode 100644 index 000000000..a07f6e4b0 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_indices_nvidia.h @@ -0,0 +1,61 @@ +#ifndef __WHERE_INDICES_NVIDIA_H__ +#define __WHERE_INDICES_NVIDIA_H__ + +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../operator.h" +#include "../../../tensor.h" + +namespace op::where::nvidia { + +class IndicesDescriptor final : public InfiniopDescriptor { + size_t _numel; + int _ndim; + size_t *_shape; + ptrdiff_t *_strides; + std::shared_ptr _internal; + +public: + IndicesDescriptor( + size_t numel, + int ndim, + const size_t *shape, + const ptrdiff_t *strides, + std::shared_ptr internal, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _numel(numel), + _ndim(ndim), + _internal(internal) { + _shape = new size_t[ndim]; + _strides = new ptrdiff_t[ndim]; + for (int i = 0; i < ndim; ++i) { + _shape[i] = shape[i]; + _strides[i] = strides[i]; + } + } + + ~IndicesDescriptor() { + delete[] _shape; + delete[] _strides; + } + + size_t workspaceSize() const; + + static infiniStatus_t create( + infiniopHandle_t handle, + IndicesDescriptor **desc_ptr, + infiniopTensorDescriptor_t cond_desc); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void **outputs, // NDIM 个输出张量的指针数组 + const void *cond, + void *stream, + size_t *num_true) const; // 输出:True 元素的数量 +}; + +} // namespace op::where::nvidia + +#endif // __WHERE_INDICES_NVIDIA_H__ diff --git a/src/infiniop/ops/where/operator.cc b/src/infiniop/ops/where/operator.cc new file mode 100644 index 000000000..e13dbd35d --- /dev/null +++ b/src/infiniop/ops/where/operator.cc @@ -0,0 +1,264 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/where.h" + +#ifdef ENABLE_CPU_API +#include "cpu/where_cpu.h" +#include "cpu/where_indices_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/where_indices_nvidia.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/where_indices_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/where_indices_moore.h" +#endif + +__C infiniStatus_t infiniopCreateWhereDescriptor( + infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t cond_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t y_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::where::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, \ + {cond_desc, x_desc, y_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetWhereWorkspaceSize( + infiniopWhereDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopWhere( + infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *cond, + const void *x, + const void *y, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ 
+ ->calculate(workspace, workspace_size, out, {cond, x, y}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} + +// where(cond) -> indices tuple +__C infiniStatus_t infiniopCreateWhereIndicesDescriptor( + infiniopHandle_t handle, + infiniopWhereIndicesDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t cond_desc) { + +#define CREATE_INDICES(CASE, NAMESPACE) \ + case CASE: \ + return op::where::NAMESPACE::IndicesDescriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + cond_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE_INDICES(INFINI_DEVICE_CPU, cpu); +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) + CREATE_INDICES(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE_INDICES(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE_INDICES(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE_INDICES(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE_INDICES(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE_INDICES +} + +__C infiniStatus_t infiniopGetWhereIndicesWorkspaceSize( + infiniopWhereIndicesDescriptor_t desc, + size_t *size) { + +#define GET_INDICES(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET_INDICES(INFINI_DEVICE_CPU, cpu) +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) + GET_INDICES(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET_INDICES(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET_INDICES(INFINI_DEVICE_QY, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET_INDICES(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_MOORE_API + GET_INDICES(INFINI_DEVICE_MOORE, moore) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET_INDICES + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopWhereIndices( + infiniopWhereIndicesDescriptor_t desc, + void *workspace, + size_t workspace_size, + void **outputs, + const void *cond, + void *stream, + size_t *num_true) { + +#define CALCULATE_INDICES(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, outputs, cond, stream, num_true) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE_INDICES(INFINI_DEVICE_CPU, cpu); +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) + CALCULATE_INDICES(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE_INDICES(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE_INDICES(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE_INDICES(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + 
CALCULATE_INDICES(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE_INDICES +} + +__C infiniStatus_t infiniopDestroyWhereIndicesDescriptor(infiniopWhereIndicesDescriptor_t desc) { + +#define DELETE_INDICES(CASE, NAMESPACE) \ + case CASE: \ + if (desc != nullptr) { \ + delete reinterpret_cast( \ + const_cast(reinterpret_cast(desc))); \ + } \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE_INDICES(INFINI_DEVICE_CPU, cpu); +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) + DELETE_INDICES(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE_INDICES(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE_INDICES(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE_INDICES(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE_INDICES(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE_INDICES +} diff --git a/test/infinicore/framework/benchmark.py b/test/infinicore/framework/benchmark.py index 74466feea..c4b105b93 100644 --- a/test/infinicore/framework/benchmark.py +++ b/test/infinicore/framework/benchmark.py @@ -117,10 +117,10 @@ def _clear_cache(): if infinicore.use_ntops: import triton - cache = triton.runtime.driver.active.get_empty_cache_for_benchmark() + # cache = triton.runtime.driver.active.get_empty_cache_for_benchmark() - def _clear_cache(): - triton.runtime.driver.active.clear_cache(cache) + # def _clear_cache(): + # triton.runtime.driver.active.clear_cache(cache) # Create pairs of DeviceEvents for each iteration start_events = [ diff --git a/test/infinicore/framework/utils.py b/test/infinicore/framework/utils.py index 8f30c4c78..a1ac154ea 100644 --- a/test/infinicore/framework/utils.py +++ b/test/infinicore/framework/utils.py @@ -28,14 +28,21 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True): elif actual.dtype == torch.bfloat16 or desired.dtype == torch.bfloat16: actual = actual.to(torch.float32) desired = desired.to(torch.float32) + # Note: bool tensors are handled inside print_discrepancy print_discrepancy(actual, desired, atol, rtol, equal_nan, verbose) import numpy as np - np.testing.assert_allclose( - actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True - ) + # For bool tensors, use assert_equal instead of assert_allclose + if actual.dtype == torch.bool or desired.dtype == torch.bool: + np.testing.assert_equal( + actual.cpu().numpy(), desired.cpu().numpy() + ) + else: + np.testing.assert_allclose( + actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True + ) def print_discrepancy( @@ -49,19 +56,40 @@ def print_discrepancy( import sys is_terminal = sys.stdout.isatty() + + # Handle bool tensors specially - PyTorch doesn't support subtraction for bool + is_bool = actual.dtype == torch.bool or expected.dtype == torch.bool + + if is_bool: + # For bool tensors, convert to int8 for comparison operations + actual_for_calc = actual.to(torch.int8) if actual.dtype == torch.bool else actual + expected_for_calc = expected.to(torch.int8) if expected.dtype == torch.bool else expected + else: + actual_for_calc = actual + expected_for_calc = expected - actual_isnan = torch.isnan(actual) - expected_isnan = torch.isnan(expected) + actual_isnan = torch.isnan(actual_for_calc) + expected_isnan = torch.isnan(expected_for_calc) # Calculate difference 
mask - nan_mismatch = ( - actual_isnan ^ expected_isnan if equal_nan else actual_isnan | expected_isnan - ) - diff_mask = nan_mismatch | ( - torch.abs(actual - expected) > (atol + rtol * torch.abs(expected)) - ) + if is_bool: + # For bool tensors, just check equality + diff_mask = actual != expected + else: + nan_mismatch = ( + actual_isnan ^ expected_isnan if equal_nan else actual_isnan | expected_isnan + ) + diff_mask = nan_mismatch | ( + torch.abs(actual_for_calc - expected_for_calc) > (atol + rtol * torch.abs(expected_for_calc)) + ) + diff_indices = torch.nonzero(diff_mask, as_tuple=False) - delta = actual - expected + + # Calculate delta (difference) - convert bool to int if needed + if is_bool: + delta = (actual.to(torch.int8) - expected.to(torch.int8)) + else: + delta = actual_for_calc - expected_for_calc # Display formatting col_width = [18, 20, 20, 20] @@ -77,11 +105,20 @@ def add_color(text, color_code): if verbose: for idx in diff_indices: index_tuple = tuple(idx.tolist()) - actual_str = f"{actual[index_tuple]:<{col_width[1]}.{decimal_places[1]}f}" - expected_str = ( - f"{expected[index_tuple]:<{col_width[2]}.{decimal_places[2]}f}" - ) - delta_str = f"{delta[index_tuple]:<{col_width[3]}.{decimal_places[3]}f}" + if is_bool: + # For bool, display as True/False + actual_val = actual[index_tuple].item() + expected_val = expected[index_tuple].item() + actual_str = f"{str(actual_val):<{col_width[1]}}" + expected_str = f"{str(expected_val):<{col_width[2]}}" + delta_val = delta[index_tuple].item() + delta_str = f"{delta_val:<{col_width[3]}}" + else: + actual_str = f"{actual[index_tuple]:<{col_width[1]}.{decimal_places[1]}f}" + expected_str = ( + f"{expected[index_tuple]:<{col_width[2]}.{decimal_places[2]}f}" + ) + delta_str = f"{delta[index_tuple]:<{col_width[3]}.{decimal_places[3]}f}" print( f" > Index: {str(index_tuple):<{col_width[0]}}" f"actual: {add_color(actual_str, 31)}" @@ -97,15 +134,17 @@ def add_color(text, color_code): print( f" - Mismatched elements: {len(diff_indices)} / {actual.numel()} ({len(diff_indices) / actual.numel() * 100}%)" ) - print( - f" - Min(actual) : {torch.min(actual):<{col_width[1]}} | Max(actual) : {torch.max(actual):<{col_width[2]}}" - ) - print( - f" - Min(desired): {torch.min(expected):<{col_width[1]}} | Max(desired): {torch.max(expected):<{col_width[2]}}" - ) - print( - f" - Min(delta) : {torch.min(delta):<{col_width[1]}} | Max(delta) : {torch.max(delta):<{col_width[2]}}" - ) + + if not is_bool: + print( + f" - Min(actual) : {torch.min(actual):<{col_width[1]}} | Max(actual) : {torch.max(actual):<{col_width[2]}}" + ) + print( + f" - Min(desired): {torch.min(expected):<{col_width[1]}} | Max(desired): {torch.max(expected):<{col_width[2]}}" + ) + print( + f" - Min(delta) : {torch.min(delta):<{col_width[1]}} | Max(delta) : {torch.max(delta):<{col_width[2]}}" + ) print("-" * total_width) return diff_indices diff --git a/test/infinicore/ops/logical_or.py b/test/infinicore/ops/logical_or.py index a51b7384f..5fc39cc77 100644 --- a/test/infinicore/ops/logical_or.py +++ b/test/infinicore/ops/logical_or.py @@ -107,9 +107,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.logical_or(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.logical_or(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.logical_or(*args, **kwargs) def 
main(): diff --git a/test/infinicore/ops/logical_xor.py b/test/infinicore/ops/logical_xor.py index 4f82cc613..f6e2ac92d 100644 --- a/test/infinicore/ops/logical_xor.py +++ b/test/infinicore/ops/logical_xor.py @@ -107,9 +107,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.logical_xor(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.logical_xor(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.logical_xor(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/logsigmoid.py b/test/infinicore/ops/logsigmoid.py index 0c9131323..44212ec80 100644 --- a/test/infinicore/ops/logsigmoid.py +++ b/test/infinicore/ops/logsigmoid.py @@ -68,9 +68,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.logsigmoid(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.logsigmoid(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.nn.functional.logsigmoid(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/vdot.py b/test/infinicore/ops/vdot.py index 2baf715f0..e294bc6dc 100644 --- a/test/infinicore/ops/vdot.py +++ b/test/infinicore/ops/vdot.py @@ -62,9 +62,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.vdot(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.vdot(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.vdot(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/where.py b/test/infinicore/ops/where.py index bc0013bd6..d1fc6fd50 100644 --- a/test/infinicore/ops/where.py +++ b/test/infinicore/ops/where.py @@ -71,9 +71,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.where(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.where(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation.""" + return infinicore.where(*args, **kwargs) def main():
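
For reference, the element-wise path of the new where C API follows the usual InfiniOp descriptor lifecycle: create a descriptor from the tensor descriptors, query the workspace size, run the kernel, and destroy the descriptor. The sketch below is a minimal illustration of that flow using only the entry points introduced in this patch; the handle, tensor descriptors, data buffers, and stream are assumed to be prepared by the caller, run_where is a hypothetical helper name (not part of the patch), and the host-side malloc for the workspace is only appropriate for the CPU backend (device backends would need a device-side allocation).

// Hypothetical usage sketch (not part of the patch): exercises the where descriptor lifecycle.
#include <stdlib.h>
#include "infiniop.h"

static infiniStatus_t run_where(infiniopHandle_t handle,
                                infiniopTensorDescriptor_t out_desc,
                                infiniopTensorDescriptor_t cond_desc,
                                infiniopTensorDescriptor_t x_desc,
                                infiniopTensorDescriptor_t y_desc,
                                void *out, const void *cond, const void *x, const void *y,
                                void *stream) {
    infiniopWhereDescriptor_t desc = NULL;
    infiniStatus_t status = infiniopCreateWhereDescriptor(handle, &desc, out_desc, cond_desc, x_desc, y_desc);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    size_t workspace_size = 0;
    status = infiniopGetWhereWorkspaceSize(desc, &workspace_size);
    if (status == INFINI_STATUS_SUCCESS) {
        // Host allocation is only valid for the CPU backend; GPU backends need device memory here.
        void *workspace = workspace_size ? malloc(workspace_size) : NULL;
        status = infiniopWhere(desc, workspace, workspace_size, out, cond, x, y, stream);
        free(workspace);
    }

    infiniopDestroyWhereDescriptor(desc);
    return status;
}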