diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp
index a7249ec9d..83ea4e894 100644
--- a/include/infinicore/ops.hpp
+++ b/include/infinicore/ops.hpp
@@ -1,9 +1,11 @@
 #pragma once
 
 #include "ops/add.hpp"
+#include "ops/asin.hpp"
 #include "ops/add_rms_norm.hpp"
 #include "ops/attention.hpp"
 #include "ops/causal_softmax.hpp"
+#include "ops/fmin.hpp"
 #include "ops/matmul.hpp"
 #include "ops/ones.hpp"
 #include "ops/paged_attention.hpp"
@@ -14,4 +16,4 @@
 #include "ops/rms_norm.hpp"
 #include "ops/rope.hpp"
 #include "ops/silu.hpp"
-#include "ops/swiglu.hpp"
+#include "ops/swiglu.hpp"
\ No newline at end of file
diff --git a/include/infinicore/ops/adaptive_avg_pool3d.hpp b/include/infinicore/ops/adaptive_avg_pool3d.hpp
new file mode 100644
index 000000000..ab1dc9253
--- /dev/null
+++ b/include/infinicore/ops/adaptive_avg_pool3d.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class AdaptiveAvgPool3D {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor y, Tensor x);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor adaptive_avg_pool3d(Tensor x, std::vector<size_t> output_size);
+void adaptive_avg_pool3d_(Tensor y, Tensor x);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/addr.hpp b/include/infinicore/ops/addr.hpp
new file mode 100644
index 000000000..92d97fae8
--- /dev/null
+++ b/include/infinicore/ops/addr.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Addr {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, float, float);
+    static void execute(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta = 1.0f, float alpha = 1.0f);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor addr(Tensor input, Tensor vec1, Tensor vec2, float beta = 1.0f, float alpha = 1.0f);
+void addr_(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta = 1.0f, float alpha = 1.0f);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/argwhere.hpp b/include/infinicore/ops/argwhere.hpp
new file mode 100644
index 000000000..f0c629adb
--- /dev/null
+++ b/include/infinicore/ops/argwhere.hpp
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Argwhere {
+public:
+    using schema = void (*)(void **, size_t *, Tensor);
+    static void execute(void **, size_t *count, Tensor x);
+    static common::OpDispatcher &dispatcher();
+};
+Tensor argwhere(Tensor x);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/asin.hpp b/include/infinicore/ops/asin.hpp
new file mode 100644
index 000000000..e8667b037
--- /dev/null
+++ b/include/infinicore/ops/asin.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Asin {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor asin(Tensor input);
+void asin_(Tensor output, Tensor input);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/fmin.hpp b/include/infinicore/ops/fmin.hpp
new file mode 100644
index 000000000..4ea00787f
--- /dev/null
+++ b/include/infinicore/ops/fmin.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Fmin {
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor);
+    static void execute(Tensor c, Tensor a, Tensor b);
+    static common::OpDispatcher &dispatcher();
+};
+
+Tensor fmin(Tensor a, Tensor b);
+void fmin_(Tensor c, Tensor a, Tensor b);
+
+} // namespace infinicore::op
diff --git a/include/infiniop.h b/include/infiniop.h
index c0a09fcb4..2cd81e9ea 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -2,13 +2,17 @@
 #define __INFINIOP_API_H__
 
 #include "infiniop/handle.h"
+#include "infiniop/ops/adaptive_avg_pool3d.h"
 #include "infiniop/ops/add.h"
+#include "infiniop/ops/addr.h"
 #include "infiniop/ops/add_rms_norm.h"
+#include "infiniop/ops/asin.h"
 #include "infiniop/ops/attention.h"
 #include "infiniop/ops/causal_softmax.h"
 #include "infiniop/ops/clip.h"
 #include "infiniop/ops/conv.h"
 #include "infiniop/ops/dequantize_awq.h"
+#include "infiniop/ops/fmin.h"
 #include "infiniop/ops/gelu.h"
 #include "infiniop/ops/gemm.h"
 #include "infiniop/ops/layer_norm.h"
diff --git a/include/infiniop/ops/adaptive_avg_pool3d.h b/include/infiniop/ops/adaptive_avg_pool3d.h
new file mode 100644
index 000000000..178a509e1
--- /dev/null
+++ b/include/infiniop/ops/adaptive_avg_pool3d.h
@@ -0,0 +1,30 @@
+#ifndef INFINIOP_ADAPTIVE_AVG_POOL3D_H_
+#define INFINIOP_ADAPTIVE_AVG_POOL3D_H_
+
+#include "../operator_descriptor.h"
+#include <stddef.h>
+
+typedef struct InfiniopDescriptor *infiniopAdaptiveAvgPool3DDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAdaptiveAvgPool3DDescriptor(
+    infiniopHandle_t handle,
+    infiniopAdaptiveAvgPool3DDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y,
+    infiniopTensorDescriptor_t x,
+    size_t *output_size);
+
+__C __export infiniStatus_t infiniopGetAdaptiveAvgPool3DWorkspaceSize(
+    infiniopAdaptiveAvgPool3DDescriptor_t desc,
+    size_t *size);
+
+__C __export infiniStatus_t infiniopAdaptiveAvgPool3D(
+    infiniopAdaptiveAvgPool3DDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAdaptiveAvgPool3DDescriptor(infiniopAdaptiveAvgPool3DDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/addr.h b/include/infiniop/ops/addr.h
new file mode 100644
index 000000000..6cae92d97
--- /dev/null
+++ b/include/infiniop/ops/addr.h
@@ -0,0 +1,30 @@
+#ifndef __INFINIOP_ADDR_API_H__
+#define __INFINIOP_ADDR_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAddrDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAddrDescriptor(infiniopHandle_t handle,
+                                                         infiniopAddrDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t out,
+                                                         infiniopTensorDescriptor_t input,
+                                                         infiniopTensorDescriptor_t vec1,
+                                                         infiniopTensorDescriptor_t vec2,
+                                                         float beta,
+                                                         float alpha);
+
+__C __export infiniStatus_t infiniopGetAddrWorkspaceSize(infiniopAddrDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAddr(infiniopAddrDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *out,
+                                         const void *input,
+                                         const void *vec1,
+                                         const void *vec2,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAddrDescriptor(infiniopAddrDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/argwhere.h b/include/infiniop/ops/argwhere.h
new file mode 100644
index 000000000..653cbba94
--- /dev/null
+++ b/include/infiniop/ops/argwhere.h
@@ -0,0 +1,29 @@
+#ifndef __INFINIOP_ARGWHERE_API_H__
+#define __INFINIOP_ARGWHERE_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopArgwhereDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateArgwhereDescriptor(
+    infiniopHandle_t handle,
+    infiniopArgwhereDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t input_desc);
+
+__C __export infiniStatus_t infiniopGetArgwhereWorkspaceSize(
+    infiniopArgwhereDescriptor_t desc,
+    size_t *size);
+
+__C __export infiniStatus_t infiniopArgwhere(
+    infiniopArgwhereDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void **output,
+    size_t *count,
+    const void *input,
+    void *stream);
+
+__C __export infiniStatus_t infiniopDestroyArgwhereDescriptor(
+    infiniopArgwhereDescriptor_t desc);
+
+#endif // __INFINIOP_ARGWHERE_API_H__
\ No newline at end of file
diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h
new file mode 100644
index 000000000..ee40514d0
--- /dev/null
+++ b/include/infiniop/ops/asin.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_ASIN_API_H__
+#define __INFINIOP_ASIN_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAsinDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAsinDescriptor(infiniopHandle_t handle,
+                                                         infiniopAsinDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t output,
+                                                         infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAsin(infiniopAsinDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *output,
+                                         const void *input,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/fmin.h b/include/infiniop/ops/fmin.h
new file mode 100644
index 000000000..274e06a46
--- /dev/null
+++ b/include/infiniop/ops/fmin.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_FMIN_H__
+#define __INFINIOP_FMIN_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopFminDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateFminDescriptor(infiniopHandle_t handle,
+                                                         infiniopFminDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t c,
+                                                         infiniopTensorDescriptor_t a,
+                                                         infiniopTensorDescriptor_t b);
+
+__C __export infiniStatus_t infiniopGetFminWorkspaceSize(infiniopFminDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopFmin(infiniopFminDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *c,
+                                         const void *a,
+                                         const void *b,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyFminDescriptor(infiniopFminDescriptor_t desc);
+
+#endif
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index b7288f3ac..a4ce58c9d 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -40,6 +40,8 @@
     uint8,
 )
 from infinicore.ops.add import add
+from infinicore.ops.addr import addr
+from infinicore.ops.asin import asin
 from infinicore.ops.add_rms_norm import add_rms_norm, add_rms_norm_
 from infinicore.ops.attention import attention
 from infinicore.ops.matmul import matmul
@@ -49,8 +51,11 @@
 from infinicore.ops.paged_attention_prefill import paged_attention_prefill
 from infinicore.ops.paged_caching import paged_caching
 from infinicore.ops.rearrange import rearrange
+from infinicore.ops.argwhere import argwhere
+from infinicore.ops.fmin import fmin
 from infinicore.ops.squeeze import squeeze
 from infinicore.ops.unsqueeze import unsqueeze
+
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -106,8 +111,11 @@
     "uint8",
     # Operations.
     "add",
+    "addr",
     "add_rms_norm",
     "add_rms_norm_",
+    "argwhere",
+    "asin",
     "attention",
     "matmul",
     "mul",
@@ -121,6 +129,7 @@
     "from_list",
     "from_numpy",
     "from_torch",
+    "fmin",
     "paged_caching",
     "paged_attention",
     "paged_attention_prefill",
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..61dbab685 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -6,6 +6,8 @@
 from .rope import RopeAlgo, rope
 from .silu import silu
 from .swiglu import swiglu
+from .adaptive_avg_pool3d import adaptive_avg_pool3d
+
 
 __all__ = [
     "causal_softmax",
@@ -17,4 +19,5 @@
     "embedding",
     "rope",
     "RopeAlgo",
+    "adaptive_avg_pool3d",
 ]
diff --git a/python/infinicore/nn/functional/adaptive_avg_pool3d.py b/python/infinicore/nn/functional/adaptive_avg_pool3d.py
new file mode 100644
index 000000000..9f8a3a4a6
--- /dev/null
+++ b/python/infinicore/nn/functional/adaptive_avg_pool3d.py
@@ -0,0 +1,16 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+from typing import List
+
+
+def adaptive_avg_pool3d(x: Tensor, output_size: List[int] = [1, 1, 1]) -> Tensor:
+    r"""Applies a 3D adaptive average pooling over an input signal composed of several input planes.
+
+    Args:
+        x (Tensor): The input tensor of shape (N, C, D, H, W)
+        output_size (List[int]): The target output size of the form (d, h, w)
+
+    Returns:
+        Tensor: The pooled output tensor
+    """
+    return Tensor(_infinicore.adaptive_avg_pool3d(x._underlying, output_size))
diff --git a/python/infinicore/ops/addr.py b/python/infinicore/ops/addr.py
new file mode 100644
index 000000000..f9a607961
--- /dev/null
+++ b/python/infinicore/ops/addr.py
@@ -0,0 +1,28 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def addr(
+    input: Tensor,
+    vec1: Tensor,
+    vec2: Tensor,
+    beta: float = 1.0,
+    alpha: float = 1.0,
+    out=None,
+) -> Tensor:
+    if out is None:
+        return Tensor(
+            _infinicore.addr(
+                input._underlying, vec1._underlying, vec2._underlying, beta, alpha
+            )
+        )
+
+    _infinicore.addr_(
+        out._underlying,
+        input._underlying,
+        vec1._underlying,
+        vec2._underlying,
+        beta,
+        alpha,
+    )
+    return out
diff --git a/python/infinicore/ops/argwhere.py b/python/infinicore/ops/argwhere.py
new file mode 100644
index 000000000..1e558c898
--- /dev/null
+++ b/python/infinicore/ops/argwhere.py
@@ -0,0 +1,6 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def argwhere(x: Tensor) -> Tensor:
+    return Tensor(_infinicore.argwhere(x._underlying))
diff --git a/python/infinicore/ops/asin.py b/python/infinicore/ops/asin.py
new file mode 100644
index 000000000..fbb230737
--- /dev/null
+++ b/python/infinicore/ops/asin.py
@@ -0,0 +1,11 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def asin(input: Tensor, *, out=None):
+    """Arcsin activation function."""
+    if out is None:
+        return Tensor(_infinicore.asin(input._underlying))
+
+    _infinicore.asin_(out._underlying, input._underlying)
+    return out
diff --git a/python/infinicore/ops/fmin.py b/python/infinicore/ops/fmin.py
new file mode 100644
index 000000000..56e274a10
--- /dev/null
+++ b/python/infinicore/ops/fmin.py
@@ -0,0 +1,10 @@
+from infinicore.lib import _infinicore
+from
infinicore.tensor import Tensor + + +def fmin(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.fmin(input._underlying, other._underlying)) + + _infinicore.fmin_(out._underlying, input._underlying, other._underlying) + return out diff --git a/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc new file mode 100644 index 000000000..61d2cecfa --- /dev/null +++ b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.cc @@ -0,0 +1,36 @@ +#include "infinicore/ops/adaptive_avg_pool3d.hpp" +#include +#include +namespace infinicore::op { + +common::OpDispatcher &AdaptiveAvgPool3D::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void AdaptiveAvgPool3D::execute(Tensor y, Tensor x) { + auto device_type = context::getDevice().getType(); + auto func = dispatcher().lookup(device_type); + if (func == nullptr) { + throw std::runtime_error("No AdaptiveAvgPool3D implementation found for device type: " + std::to_string(static_cast(device_type))); + } + func(y, x); +} + +Tensor adaptive_avg_pool3d(Tensor x, std::vector output_size) { + + // Create output tensor shap + Shape y_shape = x->shape(); + y_shape[2] = output_size[0]; // D dimension + y_shape[3] = output_size[1]; // H dimension + y_shape[4] = output_size[2]; // W dimension + + auto y = Tensor::empty(y_shape, x->dtype(), x->device()); + adaptive_avg_pool3d_(y, x); + return y; +} + +void adaptive_avg_pool3d_(Tensor y, Tensor x) { + AdaptiveAvgPool3D::execute(y, x); +} +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc new file mode 100644 index 000000000..1f4ef3211 --- /dev/null +++ b/src/infinicore/ops/adaptive_avg_pool3d/adaptive_avg_pool3d_infiniop.cc @@ -0,0 +1,62 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/adaptive_avg_pool3d.hpp" +#include "infinicore/ops/common/cache.hpp" + +namespace infinicore::op::adaptive_avg_pool3d_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopAdaptiveAvgPool3DDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAdaptiveAvgPool3DDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopAdaptiveAvgPool3DDescriptor_t desc = nullptr; + + if (!desc_opt) { + // Convert vector to array for output_size + std::vector output_size_vec = {output->size(2), output->size(3), output->size(4)}; + + INFINICORE_CHECK_ERROR(infiniopCreateAdaptiveAvgPool3DDescriptor( + context::getInfiniopHandle(output->device()), + &desc, + output->desc(), + input->desc(), + output_size_vec.data())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + // Get workspace size and allocate if needed + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAdaptiveAvgPool3DWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR( + infiniopAdaptiveAvgPool3D( + desc, + workspace->data(), workspace_size, + output->data(), + input->data(), + 
context::getStream())); +} + +static bool registered = []() { + AdaptiveAvgPool3D::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::adaptive_avg_pool3d_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/addr/addr.cc b/src/infinicore/ops/addr/addr.cc new file mode 100644 index 000000000..af7493b6f --- /dev/null +++ b/src/infinicore/ops/addr/addr.cc @@ -0,0 +1,32 @@ +#include "infinicore/ops/addr.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Addr::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Addr::execute(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta, float alpha) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, vec1, vec2); + infinicore::context::setDevice(out->device()); + dispatcher().lookup(out->device().getType())(out, input, vec1, vec2, beta, alpha); +} + +Tensor addr(Tensor input, Tensor vec1, Tensor vec2, float beta, float alpha) { + + size_t n = vec1->shape()[0]; + size_t m = vec2->shape()[0]; + + // Create output tensor + Tensor out = Tensor::empty({n, m}, input->dtype(), input->device()); + addr_(out, input, vec1, vec2, beta, alpha); + return out; +} + +void addr_(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta, float alpha) { + Addr::execute(out, input, vec1, vec2, beta, alpha); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/addr/addr_infiniop.cc b/src/infinicore/ops/addr/addr_infiniop.cc new file mode 100644 index 000000000..a215a94bf --- /dev/null +++ b/src/infinicore/ops/addr/addr_infiniop.cc @@ -0,0 +1,53 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/addr.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::addr_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopAddrDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAddrDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor input, Tensor vec1, Tensor vec2, float beta, float alpha) { + // Hash the inputs including beta and alpha + size_t seed = hash_combine(out, input, vec1, vec2, beta, alpha); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopAddrDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateAddrDescriptor( + context::getInfiniopHandle(out->device()), &desc, + out->desc(), input->desc(), vec1->desc(), vec2->desc(), beta, alpha)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAddrWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopAddr( + desc, workspace->data(), workspace_size, + out->data(), input->data(), vec1->data(), vec2->data(), context::getStream())); +} + +static bool registered = []() { + Addr::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::addr_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/argwhere/argwhere.cc b/src/infinicore/ops/argwhere/argwhere.cc new file mode 100644 index 000000000..a9d45de48 --- /dev/null +++ 
b/src/infinicore/ops/argwhere/argwhere.cc @@ -0,0 +1,33 @@ +#include "infinicore/ops/argwhere.hpp" +#include "../../utils.hpp" +#include "infinicore.h" +#include "infinicore/dtype.hpp" +#include "infinicore/tensor.hpp" +#include + +namespace infinicore::op { + +common::OpDispatcher &Argwhere::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Argwhere::execute(void **y, size_t *count, Tensor x) { + auto device_type = context::getDevice().getType(); + + auto func = dispatcher().lookup(device_type); + if (func == nullptr) { + throw std::runtime_error("Argwhere op not implemented for device type " + std::to_string(static_cast(device_type))); + } + func(y, count, x); +} +Tensor argwhere(Tensor x) { + void *y = nullptr; + size_t count = 0; + Argwhere::execute(&y, &count, x); + auto result = Tensor::from_blob(y, Shape{count, x->ndim()}, DataType::I64, Device::cpu()); + result = result->to(x->device()); + return result; +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/argwhere/argwhere_infiniop.cc b/src/infinicore/ops/argwhere/argwhere_infiniop.cc new file mode 100644 index 000000000..bc6c919db --- /dev/null +++ b/src/infinicore/ops/argwhere/argwhere_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/argwhere.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infiniop/ops/argwhere.h" + +namespace infinicore::op::argwhere_impl::infiniop { +thread_local common::OpCache caches( + 100, // capacity + [](infiniopArgwhereDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyArgwhereDescriptor(desc)); + desc = nullptr; + } + }); +void calculate(void **y, size_t *count, Tensor x) { + size_t seed = hash_combine(x); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopArgwhereDescriptor_t desc = nullptr; + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateArgwhereDescriptor( + context::getInfiniopHandle(x->device()), + &desc, + x->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetArgwhereWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopArgwhere( + desc, + workspace->data(), workspace_size, + y, + count, + x->data(), + context::getStream())); +} +static bool registered = []() { + Argwhere::dispatcher().registerAll(&calculate, false); + return true; +}(); +} // namespace infinicore::op::argwhere_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/asin/asin.cc b/src/infinicore/ops/asin/asin.cc new file mode 100644 index 000000000..f3a8efb7c --- /dev/null +++ b/src/infinicore/ops/asin/asin.cc @@ -0,0 +1,36 @@ +#include "infinicore/ops/asin.hpp" + +#include "../../utils.hpp" + +#include + +namespace infinicore::op { + +common::OpDispatcher &Asin::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; +void Asin::execute(Tensor output, Tensor input) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input); + infinicore::context::setDevice(output->device()); + auto device_type = output->device().getType(); + auto func = dispatcher().lookup(device_type); + + if (func == nullptr) { + throw std::runtime_error("No Asin 
implementation found for device type: " + std::to_string(static_cast(device_type))); + } + + func(output, input); +} + +Tensor asin(Tensor input) { + Shape shape = input->shape(); + auto output = Tensor::empty(shape, input->dtype(), input->device()); + asin_(output, input); + return output; +} + +void asin_(Tensor output, Tensor input) { + Asin::execute(output, input); +} +} // namespace infinicore::op diff --git a/src/infinicore/ops/asin/asin_infiniop.cc b/src/infinicore/ops/asin/asin_infiniop.cc new file mode 100644 index 000000000..4dec4c220 --- /dev/null +++ b/src/infinicore/ops/asin/asin_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/asin.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::asin_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopAsinDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAsinDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor output, Tensor input) { + size_t seed = hash_combine(output, input); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopAsinDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateAsinDescriptor( + context::getInfiniopHandle(output->device()), &desc, + output->desc(), input->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAsinWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopAsin( + desc, workspace->data(), workspace_size, + output->data(), input->data(), context::getStream())); +} + +static bool registered = []() { + Asin::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::asin_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/fmin/fmin.cc b/src/infinicore/ops/fmin/fmin.cc new file mode 100644 index 000000000..54d056c63 --- /dev/null +++ b/src/infinicore/ops/fmin/fmin.cc @@ -0,0 +1,27 @@ +#include "infinicore/ops/fmin.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Fmin::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +} + +void Fmin::execute(Tensor c, Tensor a, Tensor b) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b); + infinicore::context::setDevice(c->device()); + dispatcher().lookup(c->device().getType())(c, a, b); +} + +Tensor fmin(Tensor a, Tensor b) { + auto c = Tensor::empty(a->shape(), a->dtype(), a->device()); + fmin_(c, a, b); + return c; +} + +void fmin_(Tensor c, Tensor a, Tensor b) { + Fmin::execute(c, a, b); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/fmin/fmin_infiniop.cc b/src/infinicore/ops/fmin/fmin_infiniop.cc new file mode 100644 index 000000000..c9560479a --- /dev/null +++ b/src/infinicore/ops/fmin/fmin_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/fmin.hpp" +#include + +namespace infinicore::op::fmin_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopFminDescriptor_t &desc) { + if 
(desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyFminDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor c, Tensor a, Tensor b) { + size_t seed = hash_combine(c, b, a); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopFminDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateFminDescriptor( + context::getInfiniopHandle(c->device()), &desc, + c->desc(), a->desc(), b->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetFminWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopFmin( + desc, workspace->data(), workspace_size, + c->data(), a->data(), b->data(), context::getStream())); +} + +static bool registered = []() { + Fmin::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::fmin_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 431c3a37b..2563efb00 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -2,11 +2,16 @@ #include +#include "ops/adaptive_avg_pool3d.hpp" #include "ops/add.hpp" +#include "ops/addr.hpp" +#include "ops/argwhere.hpp" +#include "ops/asin.hpp" #include "ops/add_rms_norm.hpp" #include "ops/attention.hpp" #include "ops/causal_softmax.hpp" #include "ops/embedding.hpp" +#include "ops/fmin.hpp" #include "ops/linear.hpp" #include "ops/matmul.hpp" #include "ops/mul.hpp" @@ -25,9 +30,14 @@ namespace infinicore::ops { inline void bind(py::module &m) { bind_add(m); + bind_adaptive_avg_pool3d(m); + bind_argwhere(m); + bind_addr(m); + bind_asin(m); bind_add_rms_norm(m); bind_attention(m); bind_causal_softmax(m); + bind_fmin(m); bind_random_sample(m); bind_linear(m); bind_matmul(m); @@ -42,4 +52,4 @@ inline void bind(py::module &m) { bind_embedding(m); } -} // namespace infinicore::ops +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp b/src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp new file mode 100644 index 000000000..3b2b8fdc7 --- /dev/null +++ b/src/infinicore/pybind11/ops/adaptive_avg_pool3d.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include "infinicore/ops/adaptive_avg_pool3d.hpp" + +namespace py = pybind11; +namespace infinicore::ops { +inline void bind_adaptive_avg_pool3d(py::module &m) { + m.def("adaptive_avg_pool3d", + &op::adaptive_avg_pool3d, + py::arg("x"), + py::arg("output_size"), + R"doc( Adaptive Average Pooling 3D.)doc"); + + m.def("adaptive_avg_pool3d_", + &op::adaptive_avg_pool3d_, + py::arg("y"), + py::arg("x"), + R"doc(In-place, Adaptive Average Pooling 3D.)doc"); +} +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/addr.hpp b/src/infinicore/pybind11/ops/addr.hpp new file mode 100644 index 000000000..2777fc513 --- /dev/null +++ b/src/infinicore/pybind11/ops/addr.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include "infinicore/ops/addr.hpp" +#include + +namespace py = pybind11; +namespace infinicore::ops { +inline void bind_addr(py::module_ &m) { + m.def( + "addr", + &op::addr, + py::arg("input"), + py::arg("vec1"), + py::arg("vec2"), + py::arg("alpha"), + py::arg("beta"), + R"doc(Addr.)doc"); + 
m.def( + "addr_", + &op::addr_, + py::arg("out"), + py::arg("input"), + py::arg("vec1"), + py::arg("vec2"), + py::arg("beta"), + py::arg("alpha"), + R"doc(Addr.)doc"); +} +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/argwhere.hpp b/src/infinicore/pybind11/ops/argwhere.hpp new file mode 100644 index 000000000..0e6e4a4af --- /dev/null +++ b/src/infinicore/pybind11/ops/argwhere.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "infinicore/ops/argwhere.hpp" +#include +namespace py = pybind11; +namespace infinicore::ops { +inline void bind_argwhere(py::module &m) { + m.def("argwhere", + &op::argwhere, + py::arg("x"), + R"doc(Argwhere.)doc"); +} +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/asin.hpp b/src/infinicore/pybind11/ops/asin.hpp new file mode 100644 index 000000000..ebd14adbc --- /dev/null +++ b/src/infinicore/pybind11/ops/asin.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include "infinicore/ops/asin.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_asin(py::module &m) { + m.def("asin", + &op::asin, + py::arg("input"), + R"doc(Arcsin activation function.)doc"); + + m.def("asin_", + &op::asin_, + py::arg("output"), + py::arg("input"), + R"doc(In-place arcsin activation function.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infinicore/pybind11/ops/fmin.hpp b/src/infinicore/pybind11/ops/fmin.hpp new file mode 100644 index 000000000..afdbe4b33 --- /dev/null +++ b/src/infinicore/pybind11/ops/fmin.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "infinicore/ops/fmin.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_fmin(py::module &m) { + m.def("fmin", + &op::fmin, + py::arg("a"), + py::arg("b"), + R"doc(fmin of two tensors.)doc"); + + m.def("fmin_", + &op::fmin_, + py::arg("c"), + py::arg("a"), + py::arg("b"), + R"doc(In-place tensor fmin.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h b/src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h new file mode 100644 index 000000000..feea6e701 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/adaptive_avg_pool3d.h @@ -0,0 +1,90 @@ +#ifndef INFINIOP_ADAPTIVE_AVG_POOL3D_DESCRIPTOR_H_ +#define INFINIOP_ADAPTIVE_AVG_POOL3D_DESCRIPTOR_H_ +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "infiniop/ops/adaptive_avg_pool3d.h" +#include + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::adaptive_avg_pool3d::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + AdaptiveAvgPool3DInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + AdaptiveAvgPool3DInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + size_t *output_size); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) const; \ + }; \ + } + +class AdaptiveAvgPool3DInfo { 
+private: + AdaptiveAvgPool3DInfo() = default; + +public: + infiniDtype_t dtype; + size_t x_d, x_h, x_w; + size_t y_d, y_h, y_w; + size_t N, C; + std::vector x_strides; + std::vector y_strides; + + static utils::Result + create( + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t *output_size) { + CHECK_OR_RETURN(x_desc != nullptr && output_size != nullptr, + INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t data_type = x_desc->dtype(); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + const size_t ndim = x_desc->ndim(); + CHECK_OR_RETURN(ndim == 5, INFINI_STATUS_BAD_TENSOR_SHAPE); + + return utils::Result(AdaptiveAvgPool3DInfo{ + data_type, + x_desc->dim(2), + x_desc->dim(3), + x_desc->dim(4), + output_size[0], + output_size[1], + output_size[2], + x_desc->dim(0), + x_desc->dim(1), + x_desc->strides(), + y_desc->strides()}); + } +}; + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc new file mode 100644 index 000000000..45b53a973 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.cc @@ -0,0 +1,113 @@ +#include "adaptive_avg_pool3d_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +namespace op::adaptive_avg_pool3d::cpu { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t *output_size) { + + auto handle = reinterpret_cast(handle_); + + auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateAdaptiveAvgPool3D( + const AdaptiveAvgPool3DInfo &info, + Tdata *y, + const Tdata *x) { + std::array y_strides; + y_strides[size_t(4)] = 1; + y_strides[size_t(3)] = info.y_w * y_strides[size_t(4)]; + y_strides[size_t(2)] = info.y_h * y_strides[size_t(3)]; + y_strides[size_t(1)] = info.y_d * y_strides[size_t(2)]; + y_strides[size_t(0)] = info.C * y_strides[size_t(1)]; + { +#pragma omp for collapse(5) + for (ptrdiff_t n = 0; n < ptrdiff_t(info.N); n++) { + for (ptrdiff_t c = 0; c < ptrdiff_t(info.C); c++) { + for (ptrdiff_t od = 0; od < ptrdiff_t(info.y_d); od++) { + for (ptrdiff_t oh = 0; oh < ptrdiff_t(info.y_h); oh++) { + for (ptrdiff_t ow = 0; ow < ptrdiff_t(info.y_w); ow++) { + size_t x_start_d = od * info.x_d / info.y_d; + size_t x_end_d = ((od + 1) * info.x_d + info.y_d - 1) / info.y_d; + size_t x_start_h = oh * info.x_h / info.y_h; + size_t x_end_h = ((oh + 1) * info.x_h + info.y_h - 1) / info.y_h; + size_t x_start_w = ow * info.x_w / info.y_w; + size_t x_end_w = ((ow + 1) * info.x_w + info.y_w - 1) / info.y_w; + size_t count = (x_end_d - x_start_d) * (x_end_h - x_start_h) * (x_end_w - x_start_w); + + // Handle floating point types with casting + if constexpr (std::is_same::value || std::is_same::value) { + float sum = 0.0f; + for (size_t id = x_start_d; id < x_end_d; id++) { + for (size_t ih = x_start_h; ih < x_end_h; ih++) { + for (size_t iw = x_start_w; iw < x_end_w; iw++) { + size_t x_offset = n * info.x_strides[0] + c * info.x_strides[1] + id * info.x_strides[2] + ih * info.x_strides[3] + iw * info.x_strides[4]; + sum += utils::cast(x[x_offset]); + } + } + } 
+ size_t y_offset = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4]; + y[y_offset] = utils::cast(sum / static_cast(count)); + } else { + Tdata sum = (Tdata)0; + for (size_t id = x_start_d; id < x_end_d; id++) { + for (size_t ih = x_start_h; ih < x_end_h; ih++) { + for (size_t iw = x_start_w; iw < x_end_w; iw++) { + size_t x_offset = n * info.x_strides[0] + c * info.x_strides[1] + id * info.x_strides[2] + ih * info.x_strides[3] + iw * info.x_strides[4]; + sum += x[x_offset]; + } + } + } + size_t y_offset = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4]; + // For integer types, we might want to handle division differently + y[y_offset] = sum / static_cast(count); + } + } + } + } + } + } + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F16: + return calculateAdaptiveAvgPool3D(_info, reinterpret_cast(y), reinterpret_cast(x)); + case INFINI_DTYPE_F32: + return calculateAdaptiveAvgPool3D(_info, reinterpret_cast(y), reinterpret_cast(x)); + case INFINI_DTYPE_F64: + return calculateAdaptiveAvgPool3D(_info, reinterpret_cast(y), reinterpret_cast(x)); + case INFINI_DTYPE_BF16: + return calculateAdaptiveAvgPool3D(_info, reinterpret_cast(y), reinterpret_cast(x)); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::adaptive_avg_pool3d::cpu \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h new file mode 100644 index 000000000..94753979a --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/cpu/adaptive_avg_pool3d_cpu.h @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_ADAPTIVE_AVG_POOL3D_CPU_H__ +#define __INFINIOP_ADAPTIVE_AVG_POOL3D_CPU_H__ + +#include "../adaptive_avg_pool3d.h" + +DESCRIPTOR(cpu) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh b/src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh new file mode 100644 index 000000000..a348c5e30 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/cuda/kernel.cuh @@ -0,0 +1,44 @@ +#ifndef __ADAPTIVE_AVG_POOL3D_CUDA_H__ +#define __ADAPTIVE_AVG_POOL3D_CUDA_H__ + +#include + +template +__device__ void adaptiveAvgPool3DKernel(T *y, const T *x, size_t N, size_t C, + size_t x_d, size_t x_h, size_t x_w, + size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, + const ptrdiff_t *y_strides) { + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = N * C * y_d * y_h * y_w; + size_t n, c, od, oh, ow; + n = index / (C * y_d * y_h * y_w); + c = (index / (y_d * y_h * y_w)) % C; + od = (index / (y_h * y_w)) % y_d; + oh = (index / y_w) % y_h; + ow = index % y_w; + + if (index < total) { + size_t x_d_start = (od * x_d) / y_d; + size_t x_d_end = ((od + 1) * x_d + y_d - 1) / y_d; + size_t x_h_start = (oh * x_h) / y_h; + size_t x_h_end = ((oh + 1) * x_h + y_h - 1) / y_h; + size_t x_w_start = (ow * x_w) / y_w; + size_t x_w_end = ((ow + 1) * x_w + y_w - 1) / y_w; + + T sum = static_cast(0); + size_t count = (x_d_end - x_d_start) * (x_h_end - x_h_start) * (x_w_end - x_w_start); + for (size_t id = x_d_start; id < x_d_end; ++id) { + for (size_t ih = x_h_start; ih < x_h_end; ++ih) { + for (size_t iw = x_w_start; iw < x_w_end; ++iw) { + size_t x_index 
= n * x_strides[0] + c * x_strides[1] + id * x_strides[2] + ih * x_strides[3] + iw * x_strides[4]; + sum += x[x_index]; + } + } + } + size_t y_index = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4]; + y[y_index] = sum / static_cast(static_cast(count)); + } +} +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h new file mode 100644 index 000000000..9421aef35 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.h @@ -0,0 +1,8 @@ +#ifndef INFINIOP_OPS_ADAPTIVE_AVG_POOL3D_METAX_ADAPTIVE_AVG_POOL3D_METAX_H_ +#define INFINIOP_OPS_ADAPTIVE_AVG_POOL3D_METAX_ADAPTIVE_AVG_POOL3D_METAX_H_ + +#include "../adaptive_avg_pool3d.h" + +DESCRIPTOR(metax) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca new file mode 100644 index 000000000..48794fe17 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/metax/adaptive_avg_pool3d_metax.maca @@ -0,0 +1,88 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../cuda/kernel.cuh" +#include "adaptive_avg_pool3d_metax.h" + +template +INFINIOP_METAX_KERNEL +adaptiveAvgPool3D(T *y, const T *x, size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, const ptrdiff_t *y_strides) { + adaptiveAvgPool3DKernel(y, x, N, C, x_d, x_h, x_w, y_d, y_h, + y_w, x_strides, y_strides); +} + +namespace op::adaptive_avg_pool3d::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t *output_size) { + auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(info); + size_t workspace_size = 10 * sizeof(ptrdiff_t); // for x_strides and y_strides + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, + const ptrdiff_t *y_strides, hcStream_t stream) { + size_t num_blocks = (N * C * y_d * y_h * y_w + BLOCK_SIZE - 1) / BLOCK_SIZE; + if (dtype == INFINI_DTYPE_F16) { + adaptiveAvgPool3D<<>>( + (half *)y, (const half *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else if (dtype == INFINI_DTYPE_F32) { + adaptiveAvgPool3D<<>>( + (float *)y, (const float *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, const void *x, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + + // Prepare strides + ptrdiff_t *x_strides = (ptrdiff_t *)workspace; + ptrdiff_t *y_strides = (ptrdiff_t *)workspace + _info.x_strides.size(); + hcMemcpyAsync(x_strides, _info.x_strides.data(), + 
_info.x_strides.size() * sizeof(ptrdiff_t), + hcMemcpyHostToDevice, stream); + hcMemcpyAsync(y_strides, _info.y_strides.data(), + _info.y_strides.size() * sizeof(ptrdiff_t), + hcMemcpyHostToDevice, stream); + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::adaptive_avg_pool3d::metax \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_kernel.h b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_kernel.h new file mode 100644 index 000000000..a99c02483 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_kernel.h @@ -0,0 +1,42 @@ +#ifndef ADAPTIVE_AVG_POOL3D_KERNEL_H_ +#define ADAPTIVE_AVG_POOL3D_KERNEL_H_ + +template +__device__ void adaptiveAvgPool3DKernel(T *y, const T *x, size_t N, size_t C, + size_t x_d, size_t x_h, size_t x_w, + size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, + const ptrdiff_t *y_strides) { + + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = N * C * y_d * y_h * y_w; + size_t n, c, od, oh, ow; + n = index / (C * y_d * y_h * y_w); + c = (index / (y_d * y_h * y_w)) % C; + od = (index / (y_h * y_w)) % y_d; + oh = (index / y_w) % y_h; + ow = index % y_w; + + if (index < total) { + size_t x_d_start = (od * x_d) / y_d; + size_t x_d_end = ((od + 1) * x_d + y_d - 1) / y_d; + size_t x_h_start = (oh * x_h) / y_h; + size_t x_h_end = ((oh + 1) * x_h + y_h - 1) / y_h; + size_t x_w_start = (ow * x_w) / y_w; + size_t x_w_end = ((ow + 1) * x_w + y_w - 1) / y_w; + + T sum = static_cast(0); + size_t count = (x_d_end - x_d_start) * (x_h_end - x_h_start) * (x_w_end - x_w_start); + for (size_t id = x_d_start; id < x_d_end; ++id) { + for (size_t ih = x_h_start; ih < x_h_end; ++ih) { + for (size_t iw = x_w_start; iw < x_w_end; ++iw) { + size_t x_index = n * x_strides[0] + c * x_strides[1] + id * x_strides[2] + ih * x_strides[3] + iw * x_strides[4]; + sum += x[x_index]; + } + } + } + size_t y_index = n * y_strides[0] + c * y_strides[1] + od * y_strides[2] + oh * y_strides[3] + ow * y_strides[4]; + y[y_index] = sum / static_cast(count); + } +} +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.h b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.h new file mode 100644 index 000000000..37967bf16 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.h @@ -0,0 +1,7 @@ +#ifndef ADAPTIVE_AVG_POOL3D_MOORE_H_ +#define ADAPTIVE_AVG_POOL3D_MOORE_H_ +#include "../adaptive_avg_pool3d.h" + +DESCRIPTOR(moore) + +#endif // ADAPTIVE_AVG_POOL3D_MOORE_H_ diff --git a/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu new file mode 100644 index 000000000..9dd2259bf --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/moore/adaptive_avg_pool3d_moore.mu @@ -0,0 +1,89 @@ +#include 
"../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "adaptive_avg_pool3d_moore.h" +#include "adaptive_avg_pool3d_kernel.h" + +template +INFINIOP_MOORE_KERNEL +adaptiveAvgPool3D(T *y, const T *x, size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, const ptrdiff_t *y_strides) { + adaptiveAvgPool3DKernel(y, x, N, C, x_d, x_h, x_w, y_d, y_h, + y_w, x_strides, y_strides); +} +namespace op::adaptive_avg_pool3d::moore { +struct Descriptor::Opaque { + std::shared_ptr internal; +}; +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t *output_size) { + auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(info); + size_t workspace_size = 10 * sizeof(ptrdiff_t); // for x_strides and y_strides + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{ + reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, + const ptrdiff_t *y_strides, musaStream_t stream) { + size_t num_blocks = (N * C * y_d * y_h * y_w + BLOCK_SIZE - 1) / BLOCK_SIZE; + if (dtype == INFINI_DTYPE_F16) { + adaptiveAvgPool3D<<>>( + (half *)y, (const half *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else if (dtype == INFINI_DTYPE_F32) { + adaptiveAvgPool3D<<>>( + (float *)y, (const float *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, const void *x, + void *stream_) const { + musaStream_t stream = (musaStream_t)stream_; + + // Prepare strides + ptrdiff_t *x_strides = (ptrdiff_t *)workspace; + ptrdiff_t *y_strides = (ptrdiff_t *)workspace + _info.x_strides.size(); + musaMemcpyAsync(x_strides, _info.x_strides.data(), + _info.x_strides.size() * sizeof(ptrdiff_t), + musaMemcpyHostToDevice, stream); + musaMemcpyAsync(y_strides, _info.y_strides.data(), + _info.y_strides.size() * sizeof(ptrdiff_t), + musaMemcpyHostToDevice, stream); + + if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else if(_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_2048){ + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + }else{ + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} +} \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu 
b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu new file mode 100644 index 000000000..b522b2da3 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cu @@ -0,0 +1,93 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "adaptive_avg_pool3d_nvidia.cuh" +#include + +template +INFINIOP_CUDA_KERNEL +adaptiveAvgPool3D(T *y, const T *x, size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, const ptrdiff_t *y_strides) { + adaptiveAvgPool3DKernel(y, x, N, C, x_d, x_h, x_w, y_d, y_h, + y_w, x_strides, y_strides); +} + +namespace op::adaptive_avg_pool3d::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t *output_size) { + auto info = AdaptiveAvgPool3DInfo::create(y_desc, x_desc, output_size); + CHECK_RESULT(info); + size_t workspace_size = 10 * sizeof(ptrdiff_t); // for x_strides and y_strides + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{ + reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t N, size_t C, size_t x_d, size_t x_h, + size_t x_w, size_t y_d, size_t y_h, size_t y_w, + const ptrdiff_t *x_strides, + const ptrdiff_t *y_strides, cudaStream_t stream) { + size_t num_blocks = (N * C * y_d * y_h * y_w + BLOCK_SIZE - 1) / BLOCK_SIZE; + if (dtype == INFINI_DTYPE_F16) { + adaptiveAvgPool3D<<>>( + (half *)y, (const half *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else if (dtype == INFINI_DTYPE_F32) { + adaptiveAvgPool3D<<>>( + (float *)y, (const float *)x, N, C, x_d, x_h, x_w, y_d, y_h, y_w, + x_strides, y_strides); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, const void *x, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + + // Prepare strides + ptrdiff_t *x_strides = (ptrdiff_t *)workspace; + ptrdiff_t *y_strides = (ptrdiff_t *)workspace + _info.x_strides.size(); + cudaMemcpyAsync(x_strides, _info.x_strides.data(), + _info.x_strides.size() * sizeof(ptrdiff_t), + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(y_strides, _info.y_strides.data(), + _info.y_strides.size() * sizeof(ptrdiff_t), + cudaMemcpyHostToDevice, stream); + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.N, _info.C, _info.x_d, _info.x_h, _info.x_w, + _info.y_d, _info.y_h, _info.y_w, x_strides, y_strides, stream)); + } else { + 
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::adaptive_avg_pool3d::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh new file mode 100644 index 000000000..973a0a54a --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/nvidia/adaptive_avg_pool3d_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ADAPTIVE_AVG_POOL3D_NVIDIA_H__ +#define __ADAPTIVE_AVG_POOL3D_NVIDIA_H__ + +#include "../adaptive_avg_pool3d.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/adaptive_avg_pool3d/operator.cc b/src/infiniop/ops/adaptive_avg_pool3d/operator.cc new file mode 100644 index 000000000..738402862 --- /dev/null +++ b/src/infiniop/ops/adaptive_avg_pool3d/operator.cc @@ -0,0 +1,216 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/adaptive_avg_pool3d.h" + +#ifdef ENABLE_CPU_API +#include "cpu/adaptive_avg_pool3d_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/adaptive_avg_pool3d_nvidia.cuh" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/adaptive_avg_pool3d_ascend.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/adaptive_avg_pool3d_bang.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/adaptive_avg_pool3d_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/adaptive_avg_pool3d_kunlun.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/adaptive_avg_pool3d_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAdaptiveAvgPool3DDescriptor( + infiniopHandle_t handle, + infiniopAdaptiveAvgPool3DDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + size_t *output_size) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::adaptive_avg_pool3d::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y, \ + x, \ + output_size) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ASCEND_API + CREATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + } + +#undef CREATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetAdaptiveAvgPool3DWorkspaceSize( + infiniopAdaptiveAvgPool3DDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + 
GET(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif + } + +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAdaptiveAvgPool3D( + infiniopAdaptiveAvgPool3DDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, stream); + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ASCEND_API + CALCULATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + } +#undef CALCULATE + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyAdaptiveAvgPool3DDescriptor(infiniopAdaptiveAvgPool3DDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + DELETE(INFINI_DEVICE_ASCEND, ascend); +#endif + } +#undef DELETE + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} \ No newline at end of file diff --git a/src/infiniop/ops/addr/addr.h b/src/infiniop/ops/addr/addr.h new file mode 100644 index 000000000..6cf5b20fc --- /dev/null +++ b/src/infiniop/ops/addr/addr.h @@ -0,0 +1,95 @@ +#ifndef INFINIOP_ADDR_DESCRIPTOR_H_ +#define INFINIOP_ADDR_DESCRIPTOR_H_ +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "infiniop/ops/addr.h" +#include + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::addr::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + AddrInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + AddrInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), 
\ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t vec1_desc, \ + infiniopTensorDescriptor_t vec2_desc, \ + float beta, \ + float alpha); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *out, \ + const void *input, \ + const void *vec1, \ + const void *vec2, \ + void *stream) const; \ + }; \ + } + +struct AddrInfo { + infiniDtype_t dtype; + size_t vec1_size; + size_t vec2_size; + float beta; + float alpha; + ptrdiff_t input_stride0, input_stride1; + ptrdiff_t output_stride0, output_stride1; + ptrdiff_t vec1_stride; + ptrdiff_t vec2_stride; + + static utils::Result + create(infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t vec1_desc, + infiniopTensorDescriptor_t vec2_desc, + float beta = 1.0f, float alpha = 1.0f) { + CHECK_OR_RETURN(input_desc->ndim() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(vec1_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(vec2_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(input_desc->dim(0) == vec1_desc->dim(0) && input_desc->dim(1) == vec2_desc->dim(0), INFINI_STATUS_BAD_TENSOR_SHAPE); + const infiniDtype_t data_type = input_desc->dtype(); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + return utils::Result(AddrInfo{ + data_type, + vec1_desc->dim(0), + vec2_desc->dim(0), + beta, + alpha, + input_desc->stride(0), + input_desc->stride(1), + output_desc->stride(0), + output_desc->stride(1), + vec1_desc->stride(0), + vec2_desc->stride(0), + }); + } +}; + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/addr/cpu/addr_cpu.cc b/src/infiniop/ops/addr/cpu/addr_cpu.cc new file mode 100644 index 000000000..52339164c --- /dev/null +++ b/src/infiniop/ops/addr/cpu/addr_cpu.cc @@ -0,0 +1,93 @@ +#include "addr_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +namespace op::addr::cpu { +Descriptor::~Descriptor() = default; + +// Template function to handle different data types +template +infiniStatus_t addr_impl(Tdata *out, + const Tdata *input, + const Tdata *vec1, + const Tdata *vec2, + const AddrInfo &info, + void *workspace, + size_t workspace_size) { + size_t n = info.vec1_size; + size_t m = info.vec2_size; + float beta = info.beta; + float alpha = info.alpha; + +#pragma omp parallel for collapse(2) + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < m; ++j) { + if constexpr (std::is_same::value || std::is_same::value) { + float a = utils::cast(vec1[i * info.vec1_stride]); + float b = utils::cast(vec2[j * info.vec2_stride]); + float c = utils::cast(input[i * info.input_stride0 + j * info.input_stride1]); + out[i * info.output_stride0 + j * info.output_stride1] = utils::cast(alpha * a * b + beta * c); + } else { + float a = vec1[i * info.vec1_stride], b = vec2[j * info.vec2_stride], c = input[i * info.input_stride0 + j * info.input_stride1]; + out[i * info.output_stride0 + j * info.output_stride1] = utils::cast(alpha * a * b + beta * c); + } + } + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + 
infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t vec1_desc, + infiniopTensorDescriptor_t vec2_desc, + float beta, + float alpha) { + + auto handle = reinterpret_cast(handle_); + auto info = AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor(info.take(), 0, nullptr, + INFINI_DEVICE_CPU, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *out, + const void *input, + const void *vec1, + const void *vec2, + void *stream) const { + + switch (_info.dtype) { + case INFINI_DTYPE_F32: + return addr_impl(reinterpret_cast(out), + reinterpret_cast(input), + reinterpret_cast(vec1), + reinterpret_cast(vec2), + _info, workspace, workspace_size); + break; + case INFINI_DTYPE_F16: + return addr_impl(reinterpret_cast(out), + reinterpret_cast(input), + reinterpret_cast(vec1), + reinterpret_cast(vec2), + _info, workspace, workspace_size); + case INFINI_DTYPE_BF16: + return addr_impl(reinterpret_cast(out), + reinterpret_cast(input), + reinterpret_cast(vec1), + reinterpret_cast(vec2), + _info, workspace, workspace_size); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::addr::cpu \ No newline at end of file diff --git a/src/infiniop/ops/addr/cpu/addr_cpu.h b/src/infiniop/ops/addr/cpu/addr_cpu.h new file mode 100644 index 000000000..e606c1816 --- /dev/null +++ b/src/infiniop/ops/addr/cpu/addr_cpu.h @@ -0,0 +1,7 @@ +#ifndef INFINIOP_ADDR_CPU_H_ +#define INFINIOP_ADDR_CPU_H_ +#include "../addr.h" + +DESCRIPTOR(cpu) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/addr/cuda/kernel.cu b/src/infiniop/ops/addr/cuda/kernel.cu new file mode 100644 index 000000000..b6af54e05 --- /dev/null +++ b/src/infiniop/ops/addr/cuda/kernel.cu @@ -0,0 +1,38 @@ +#ifndef __ADDR_CUDA_H__ +#define __ADDR_CUDA_H__ +#include +#include + +template +__device__ void addr_kernel(T *out, const T *input, const T *vec1, + const T *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + size_t j = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= n || j >= m) { + return; + } + size_t out_idx = i * out_stride_0 + j * out_stride_1; + size_t in_idx = i * in_stride_0 + j * in_stride_1; + size_t vec1_idx = i * stride1; + size_t vec2_idx = j * stride2; + T in_val = input[in_idx]; + T vec1_val = vec1[vec1_idx]; + T vec2_val = vec2[vec2_idx]; + T out_val; + if constexpr (std::is_same_v) { + out_val = __hadd(__hmul(__hmul(vec1_val, vec2_val), __float2half(alpha)), + __hmul(__float2half(beta), in_val)); + } else if constexpr (std::is_same_v) { + out_val = __hadd(__hmul(__hmul(vec1_val, vec2_val), __float2bfloat16(alpha)), + __hmul(__float2bfloat16(beta), in_val)); + } else { + out_val = beta * in_val + alpha * vec1_val * vec2_val; + } + + out[out_idx] = out_val; + __syncthreads(); +} +#endif \ No newline at end of file diff --git a/src/infiniop/ops/addr/metax/addr_metax.h b/src/infiniop/ops/addr/metax/addr_metax.h new file mode 100644 index 000000000..9a8acdbe2 --- /dev/null +++ b/src/infiniop/ops/addr/metax/addr_metax.h @@ -0,0 +1,8 @@ +#ifndef INFINIOP_ADDR_METAX_H_ +#define INFINIOP_ADDR_METAX_H_ + +#include "../addr.h" + +DESCRIPTOR(metax) + +#endif \ No newline at end of file diff --git 
a/src/infiniop/ops/addr/metax/addr_nvidia.maca b/src/infiniop/ops/addr/metax/addr_nvidia.maca new file mode 100644 index 000000000..443d59cff --- /dev/null +++ b/src/infiniop/ops/addr/metax/addr_nvidia.maca @@ -0,0 +1,108 @@ + +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../cuda/kernel.cu" +#include "addr_metax.h" +#include "infinicore.h" +#include + +template +INFINIOP_METAX_KERNEL Addr(Tdata *output, const Tdata *input, const Tdata *vec1, + const Tdata *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) { + addr_kernel(output, input, vec1, vec2, n, m, beta, alpha, stride1, + stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); +} +namespace op::addr::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t vec1_desc, + infiniopTensorDescriptor_t vec2_desc, + float beta, float alpha) { + + auto info = + AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha); + CHECK_RESULT(info); + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t +launchKernel(infiniDtype_t dtype, void *output, const void *input, + const void *vec1, const void *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1, hcStream_t stream) { + unsigned int dn = std::min((unsigned int)n, BLOCK_SIZE), + dm = std::min((unsigned int)m, BLOCK_SIZE); + dim3 grid = {(unsigned int)((n + dn - 1) / dn), + (unsigned int)((m + dm - 1) / dm)}; + dim3 block = {dn, dm}; + switch (dtype) { + case INFINI_DTYPE_F32: + Addr<<>>( + (float *)output, (const float *)input, (const float *)vec1, + (const float *)vec2, n, m, beta, alpha, stride1, stride2, out_stride_0, + out_stride_1, in_stride_0, in_stride_1); + break; + case INFINI_DTYPE_F16: + Addr<__half><<>>( + (__half *)output, (__half *)input, (__half *)vec1, (__half *)vec2, n, m, + beta, alpha, stride1, stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); + break; + case INFINI_DTYPE_BF16: + Addr<__hpcc_bfloat16><<>>( + (__hpcc_bfloat16 *)output, (__hpcc_bfloat16 *)input, + (__hpcc_bfloat16 *)vec1, (__hpcc_bfloat16 *)vec2, n, m, beta, alpha, + stride1, stride2, out_stride_0, out_stride_1, in_stride_0, in_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *out, const void *input, + const void *vec1, const void *vec2, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } 
else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::addr::metax \ No newline at end of file diff --git a/src/infiniop/ops/addr/moore/addr_kernel.h b/src/infiniop/ops/addr/moore/addr_kernel.h new file mode 100644 index 000000000..49e83aced --- /dev/null +++ b/src/infiniop/ops/addr/moore/addr_kernel.h @@ -0,0 +1,36 @@ +#ifndef __ADDR_MOORE_H__ +#define __ADDR_MOORE_H__ + +template +__device__ void addr_kernel(T *out, const T *input, const T *vec1, + const T *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + size_t j = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= n || j >= m) { + return; + } + size_t out_idx = i * out_stride_0 + j * out_stride_1; + size_t in_idx = i * in_stride_0 + j * in_stride_1; + size_t vec1_idx = i * stride1; + size_t vec2_idx = j * stride2; + T in_val = input[in_idx]; + T vec1_val = vec1[vec1_idx]; + T vec2_val = vec2[vec2_idx]; + T out_val; + if constexpr (std::is_same_v) { + out_val = __hadd(__hmul(__hmul(vec1_val, vec2_val), __float2half(alpha)), + __hmul(__float2half(beta), in_val)); + } else if constexpr (std::is_same_v) { + float a = __bfloat162float(vec1_val), b = __bfloat162float(vec2_val), in = __bfloat162float(in_val); + out_val = __float2bfloat16_rn(a * b * alpha + in * beta); + } else { + out_val = beta * in_val + alpha * vec1_val * vec2_val; + } + + out[out_idx] = out_val; + __syncthreads(); +} +#endif \ No newline at end of file diff --git a/src/infiniop/ops/addr/moore/addr_moore.h b/src/infiniop/ops/addr/moore/addr_moore.h new file mode 100644 index 000000000..d32182597 --- /dev/null +++ b/src/infiniop/ops/addr/moore/addr_moore.h @@ -0,0 +1,8 @@ +#ifndef INFINIOP_ADDR_MOORE_H_ +#define INFINIOP_ADDR_MOORE_H_ + +#include "../addr.h" + +DESCRIPTOR(moore) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/addr/moore/addr_moore.mu b/src/infiniop/ops/addr/moore/addr_moore.mu new file mode 100644 index 000000000..95626044b --- /dev/null +++ b/src/infiniop/ops/addr/moore/addr_moore.mu @@ -0,0 +1,108 @@ +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "addr_kernel.h" +#include "addr_moore.h" +#include "infinicore.h" +#include + +template +INFINIOP_MOORE_KERNEL Addr(Tdata *output, const Tdata *input, const Tdata *vec1, + const Tdata *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) { + addr_kernel(output, input, vec1, vec2, n, m, beta, alpha, stride1, + stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); +} +namespace op::addr::moore { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + 
infiniopTensorDescriptor_t vec1_desc, + infiniopTensorDescriptor_t vec2_desc, + float beta, float alpha) { + + auto info = + AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha); + CHECK_RESULT(info); + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{ + reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(infiniDtype_t dtype, void *output, + const void *input, const void *vec1, + const void *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1, + musaStream_t stream) { + unsigned int dn = std::min((unsigned int)n, BLOCK_SIZE), + dm = std::min((unsigned int)m, BLOCK_SIZE); + dim3 grid = {(unsigned int)((n + dn - 1) / dn), + (unsigned int)((m + dm - 1) / dm)}; + dim3 block = {dn, dm}; + switch (dtype) { + case INFINI_DTYPE_F32: + Addr<<>>( + (float *)output, (const float *)input, (const float *)vec1, + (const float *)vec2, n, m, beta, alpha, stride1, stride2, out_stride_0, + out_stride_1, in_stride_0, in_stride_1); + break; + case INFINI_DTYPE_F16: + Addr<__half><<>>( + (__half *)output, (__half *)input, (__half *)vec1, (__half *)vec2, n, m, + beta, alpha, stride1, stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); + break; + case INFINI_DTYPE_BF16: + Addr<<>>( + (cuda_bfloat16 *)output, (cuda_bfloat16 *)input, (cuda_bfloat16 *)vec1, + (cuda_bfloat16 *)vec2, n, m, beta, alpha, stride1, stride2, + out_stride_0, out_stride_1, in_stride_0, in_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *out, const void *input, + const void *vec1, const void *vec2, + void *stream_) const { + musaStream_t stream = (musaStream_t)stream_; + + if (_opaque->internal->maxThreadsPerBlock() ==MOORE_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::addr::moore \ No newline at end of file diff --git a/src/infiniop/ops/addr/nvidia/addr_nvidia.cu b/src/infiniop/ops/addr/nvidia/addr_nvidia.cu new file mode 100644 index 000000000..cbd5279ce --- /dev/null +++ b/src/infiniop/ops/addr/nvidia/addr_nvidia.cu @@ -0,0 +1,114 @@ + +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cu" +#include "addr_nvidia.cuh" +#include "infinicore.h" +#include + +template +INFINIOP_CUDA_KERNEL Addr(Tdata *output, const Tdata *input, const Tdata *vec1, + const Tdata *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t 
out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1) { + addr_kernel(output, input, vec1, vec2, n, m, beta, alpha, stride1, + stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); +} +namespace op::addr::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t vec1_desc, + infiniopTensorDescriptor_t vec2_desc, + float beta, float alpha) { + + auto info = AddrInfo::create(input_desc, out_desc, vec1_desc, vec2_desc, beta, alpha); + CHECK_RESULT(info); + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{ + reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(infiniDtype_t dtype, void *output, + const void *input, const void *vec1, + const void *vec2, size_t n, size_t m, float beta, + float alpha, ptrdiff_t stride1, ptrdiff_t stride2, + ptrdiff_t out_stride_0, ptrdiff_t out_stride_1, + ptrdiff_t in_stride_0, ptrdiff_t in_stride_1, + cudaStream_t stream) { + unsigned int dn = std::min((unsigned int)n, BLOCK_SIZE), + dm = std::min((unsigned int)m, BLOCK_SIZE); + dim3 grid = {(unsigned int)((n + dn - 1) / dn), + (unsigned int)((m + dm - 1) / dm)}; + dim3 block = {dn, dm}; + switch (dtype) { + case INFINI_DTYPE_F32: + Addr<<>>( + (float *)output, (const float *)input, (const float *)vec1, + (const float *)vec2, n, m, beta, alpha, stride1, stride2, out_stride_0, + out_stride_1, in_stride_0, in_stride_1); + break; + case INFINI_DTYPE_F16: + Addr<__half><<>>( + (__half *)output, (__half *)input, (__half *)vec1, (__half *)vec2, n, m, + beta, alpha, stride1, stride2, out_stride_0, out_stride_1, in_stride_0, + in_stride_1); + break; + case INFINI_DTYPE_BF16: + Addr<__nv_bfloat16><<>>( + (__nv_bfloat16 *)output, (__nv_bfloat16 *)input, (__nv_bfloat16 *)vec1, + (__nv_bfloat16 *)vec2, n, m, beta, alpha, stride1, stride2, + out_stride_0, out_stride_1, in_stride_0, in_stride_1); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *out, const void *input, + const void *vec1, const void *vec2, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + _info.input_stride1, stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel( + _info.dtype, out, input, vec1, vec2, _info.vec1_size, _info.vec2_size, + _info.beta, _info.alpha, _info.vec1_stride, _info.vec2_stride, + _info.output_stride0, _info.output_stride1, _info.input_stride0, + 
_info.input_stride1, stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::addr::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/addr/nvidia/addr_nvidia.cuh b/src/infiniop/ops/addr/nvidia/addr_nvidia.cuh new file mode 100644 index 000000000..9ebe7f2c3 --- /dev/null +++ b/src/infiniop/ops/addr/nvidia/addr_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef INFINIOP_ADDR_NVIDIA_H_ +#define INFINIOP_ADDR_NVIDIA_H_ +#include "../addr.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/addr/operator.cc b/src/infiniop/ops/addr/operator.cc new file mode 100644 index 000000000..6e3724743 --- /dev/null +++ b/src/infiniop/ops/addr/operator.cc @@ -0,0 +1,222 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/addr.h" + +#ifdef ENABLE_CPU_API +#include "cpu/addr_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/addr_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/addr_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/addr_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAddrDescriptor( + infiniopHandle_t handle, + infiniopAddrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t vec1, + infiniopTensorDescriptor_t vec2, + float beta, + float alpha) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::addr::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out, \ + input, \ + vec1, \ + vec2, \ + beta, \ + alpha) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ASCEND_API + CREATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + } + +#undef CREATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetAddrWorkspaceSize( + infiniopAddrDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif + } + +#undef GET 
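+    // addr computes out[i][j] = beta * input[i][j] + alpha * vec1[i] * vec2[j] (see the CPU reference implementation above).
+    // Illustrative end-to-end call sequence for this operator (a sketch only; the handle, tensor
+    // descriptors, device buffers, workspace and stream are assumed to be set up elsewhere):
+    //   infiniopAddrDescriptor_t addr_desc;
+    //   infiniopCreateAddrDescriptor(handle, &addr_desc, out_desc, in_desc, v1_desc, v2_desc, /*beta=*/1.0f, /*alpha=*/1.0f);
+    //   size_t ws_size = 0;
+    //   infiniopGetAddrWorkspaceSize(addr_desc, &ws_size);
+    //   infiniopAddr(addr_desc, workspace, ws_size, out, in, v1, v2, stream);
+    //   infiniopDestroyAddrDescriptor(addr_desc);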
+ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAddr( + infiniopAddrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *input, + const void *vec1, + const void *vec2, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, out, input, vec1, vec2, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ASCEND_API + CALCULATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + } + +#undef CALCULATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyAddrDescriptor(infiniopAddrDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + DELETE(INFINI_DEVICE_ASCEND, ascend); +#endif + } + +#undef DELETE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/argwhere.h b/src/infiniop/ops/argwhere/argwhere.h new file mode 100644 index 000000000..afb067576 --- /dev/null +++ b/src/infiniop/ops/argwhere/argwhere.h @@ -0,0 +1,75 @@ +#ifndef __INFINIOP_ARGWHERE_H__ +#define __INFINIOP_ARGWHERE_H__ +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "infinicore.h" +#include + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::argwhere::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ArgwhereInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + ArgwhereInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t 
workspace_size, \ + void **y, \ + size_t *count, \ + const void *x, \ + void *stream) const; \ + }; \ + } + +class ArgwhereInfo { +private: + ArgwhereInfo() = default; + +public: + infiniDtype_t dtype; + std::vector strides; + std::vector shapes; + size_t num_elements; + + static utils::Result + create( + infiniopTensorDescriptor_t x_desc) { + CHECK_OR_RETURN(x_desc != nullptr, + INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t data_type = x_desc->dtype(); + CHECK_DTYPE(data_type, INFINI_DTYPE_F32); + + return utils::Result(ArgwhereInfo{ + data_type, + x_desc->strides(), + x_desc->shape(), + x_desc->numel()}); + } +}; +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc new file mode 100644 index 000000000..6a92047a2 --- /dev/null +++ b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.cc @@ -0,0 +1,80 @@ +#include "argwhere_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "infinicore.h" +#include +#include +#include +namespace op::argwhere::cpu { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc) { + + auto handle = reinterpret_cast(handle_); + + auto info = ArgwhereInfo::create(x_desc); + CHECK_RESULT(info); + + *desc_ptr = new Descriptor( + info.take(), + 0, + nullptr, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateArgWhere( + const ArgwhereInfo &info, + void *workspace, + size_t workspace_size, + void **y, + size_t *count, + const void *x) { + + const Tdata *x_data = reinterpret_cast(x); + // int64_t *y_data = reinterpret_cast(y); + std::vector positions; + // #pragma omp parallel for + for (size_t i = 0; i < info.num_elements; i++) { + size_t pos = 0, tem = i; + std::vector position(info.strides.size()); + for (int j = info.strides.size() - 1; j >= 0; j--) { + position[j] = tem % info.shapes[j]; + tem /= info.shapes[j]; + pos += position[j] * info.strides[j]; + } + if (fabs(x_data[pos] - 0.0f) > 1e-5) { + for (auto p : position) { + positions.push_back(p); + } + } + } + + *y = new int64_t[positions.size()]; + memcpy(*y, positions.data(), positions.size() * sizeof(int64_t)); + *count = positions.size() / info.strides.size(); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void **y, + size_t *count, + const void *x, + void *stream) const { + switch (_info.dtype) { + case INFINI_DTYPE_F32: + return calculateArgWhere(_info, workspace, workspace_size, y, count, x); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::argwhere::cpu \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/cpu/argwhere_cpu.h b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.h new file mode 100644 index 000000000..1e6461e75 --- /dev/null +++ b/src/infiniop/ops/argwhere/cpu/argwhere_cpu.h @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_ARGWHERE_CPU_API_H__ +#define __INFINIOP_ARGWHERE_CPU_API_H__ + +#include "../argwhere.h" + +DESCRIPTOR(cpu) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/cuda/kernel.cuh b/src/infiniop/ops/argwhere/cuda/kernel.cuh new file mode 100644 index 000000000..ff4aae126 --- /dev/null +++ b/src/infiniop/ops/argwhere/cuda/kernel.cuh @@ -0,0 +1,215 @@ +#ifndef ARGWHERE_NVIDIA_KERNEL_H +#define ARGWHERE_NVIDIA_KERNEL_H +#include +#include + +__device__ void 
Index2Pos(size_t index, size_t ndim, const size_t *shapes, + size_t *pos) { + for (int i = ndim - 1; i >= 0; i--) { + pos[i] = index % shapes[i]; + index /= shapes[i]; + } +} +__device__ size_t pos2dest(size_t *pos, size_t ndim, const ptrdiff_t *strides) { + size_t dest = 0; + for (size_t i = 0; i < ndim; i++) { + dest += pos[i] * strides[i]; + } + return dest; +} + +template <typename T> +__global__ void parallel_block_argwhere_kernel(T *data, int64_t *results, + size_t N, const size_t *shapes, const ptrdiff_t *strides, size_t ndim, size_t *count) { + extern __shared__ size_t tmp[]; + size_t pos1[5], pos2[5]; // multi-dimensional indices of the two elements handled by this thread + bool is_zero1 = false, is_zero2 = false; + int tid = threadIdx.x; + int leaf_num = blockDim.x * 2; // equals the length of tmp + + if (tid * 2 < N) { + Index2Pos(tid * 2, ndim, shapes, pos1); + is_zero1 = fabs(data[pos2dest(pos1, ndim, strides)]) <= 1e-5; + tmp[tid * 2] = !is_zero1; + } + if (tid * 2 + 1 < N) { + Index2Pos(tid * 2 + 1, ndim, shapes, pos2); + is_zero2 = fabs(data[pos2dest(pos2, ndim, strides)]) <= 1e-5; + tmp[tid * 2 + 1] = !is_zero2; + } + + __syncthreads(); + + // up-sweep (reduce) phase of the block-level exclusive scan + int offset = 1; + for (int d = leaf_num >> 1; d > 0; d >>= 1) { + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + tmp[bi] += tmp[ai]; + } + offset *= 2; + __syncthreads(); + } + + if (tid == 0) { + tmp[leaf_num - 1] = 0; + } + __syncthreads(); + + // down-sweep phase: turn the reduction tree into an exclusive prefix sum + for (int d = 1; d < leaf_num; d *= 2) { + offset >>= 1; + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + + int v = tmp[ai]; + tmp[ai] = tmp[bi]; + tmp[bi] += v; + } + __syncthreads(); + } + + // write the final results + if (!is_zero1 && tid * 2 < N) { + for (int i = 0; i < ndim; i++) { + results[tmp[2 * tid] * ndim + i] = pos1[i]; + } + } + if (!is_zero2 && tid * 2 + 1 < N) { + for (int i = 0; i < ndim; i++) { + results[tmp[2 * tid + 1] * ndim + i] = pos2[i]; + } + } + if (tid == blockDim.x - 1) { + // printf("blockIdxDim = %d\n", blockDim.x); + *count = tmp[N - 1] + (N == blockDim.x * 2 ? 1 : tmp[N] != tmp[N - 1]); + // printf("finally: count = %d\n", tmp[leaf_num - 1]); + } +} + +template <typename T> +__global__ void parallel_block_scan_kernel(size_t N, int64_t *pre_sum) { + // single block scan + extern __shared__ int64_t tmp[]; + int tid = threadIdx.x; + int leaf_num = blockDim.x * 2; // equals the length of tmp + + tmp[tid * 2] = tid * 2 < N - 1 ? pre_sum[tid * 2] : 0; + tmp[tid * 2 + 1] = tid * 2 + 1 < N - 1 ? 
pre_sum[tid * 2 + 1] : 0; + __syncthreads(); + + int offset = 1; + for (int d = leaf_num >> 1; d > 0; d >>= 1) { + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + tmp[bi] += tmp[ai]; + } + offset *= 2; + __syncthreads(); + } + if (tid == 0) { + tmp[leaf_num - 1] = 0; + } + __syncthreads(); + for (int d = 1; d < leaf_num; d <<= 1) { + offset >>= 1; + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + + int v = tmp[ai]; + tmp[ai] = tmp[bi]; + tmp[bi] += v; + } + __syncthreads(); + } + if (tid * 2 < N) { + pre_sum[tid * 2] = tmp[tid * 2]; + } + if (tid * 2 + 1 < N) { + pre_sum[tid * 2 + 1] = tmp[tid * 2 + 1]; + } +} + +template +__global__ void +parallel_large_argwhere_kernel(const T *data, int64_t *block_sum, + int64_t *results, size_t N, const size_t *shapes, + const ptrdiff_t *strides, size_t ndim) { + // To be implemented for large N + extern __shared__ int64_t tmp_argwhere[]; + int bid = blockIdx.x, tid = threadIdx.x; + size_t pos1[5], pos2[5]; // 两个数的在tensor中的索引 + bool is_zero1 = false, is_zero2 = false; + int block_offset = bid * blockDim.x * 2, leaf_num = blockDim.x * 2; + tmp_argwhere[2 * tid] = tmp_argwhere[2 * tid + 1] = 0; + if (block_offset + tid * 2 < N) { + Index2Pos(block_offset + tid * 2, ndim, shapes, pos1); + is_zero1 = fabs(data[pos2dest(pos1, ndim, strides)]) <= 1e-9; + tmp_argwhere[2 * tid] = !is_zero1; + } + if (block_offset + tid * 2 + 1 < N) { + Index2Pos(block_offset + tid * 2 + 1, ndim, shapes, pos2); + is_zero2 = fabs(data[pos2dest(pos2, ndim, strides)]) <= 1e-9; + tmp_argwhere[2 * tid + 1] = !is_zero2; + } + __syncthreads(); + int offset = 1; + for (int d = leaf_num >> 1; d > 0; d >>= 1) { + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + tmp_argwhere[bi] += tmp_argwhere[ai]; + } + offset *= 2; + __syncthreads(); + } + if (tid == 0) { + block_sum[bid] = tmp_argwhere[leaf_num - 1]; + // printf("tmp_argwhere[%d] = %lld\n", leaf_num - 1, + // tmp_argwhere[leaf_num - 1]); + // printf("block_sum[%d] = %lld\n", bid, block_sum[bid]); + tmp_argwhere[leaf_num - 1] = 0; + } + __syncthreads(); + for (int d = 1; d < leaf_num; d <<= 1) { + offset >>= 1; + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + int v = tmp_argwhere[ai]; + tmp_argwhere[ai] = tmp_argwhere[bi]; + tmp_argwhere[bi] += v; + } + __syncthreads(); + } + + if (!is_zero1 && block_offset + tid * 2 < N) { + for (int i = 0; i < ndim; i++) { + results[(tmp_argwhere[2 * tid] + block_offset) * ndim + i] = pos1[i]; + } + } + if (!is_zero2 && block_offset + tid * 2 + 1 < N) { + for (int i = 0; i < ndim; i++) { + results[(tmp_argwhere[2 * tid + 1] + block_offset) * ndim + i] = pos2[i]; + } + } +} + +__global__ void add_block_offset_kernel(int64_t *results, int64_t *tmp, + int64_t *block_sums, size_t ndim) { + int bid = blockIdx.x, tid = threadIdx.x; + size_t block_offset = block_sums[bid], origin_offset = bid * blockDim.x * 2; + if (2 * tid < block_sums[bid + 1] - block_sums[bid]) { + for (int i = 0; i < ndim; i++) { + results[(block_offset + 2 * tid) * ndim + i] = tmp[(origin_offset + tid * 2) * ndim + i]; + } + } + if (2 * tid + 1 < block_sums[bid + 1] - block_sums[bid]) { + for (int i = 0; i < ndim; i++) { + results[(block_offset + 2 * tid + 1) * ndim + i] = tmp[(origin_offset + tid * 2 + 1) * ndim + i]; + } + } +} +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/metax/argwhere_metax.h 
b/src/infiniop/ops/argwhere/metax/argwhere_metax.h new file mode 100644 index 000000000..4e85f870c --- /dev/null +++ b/src/infiniop/ops/argwhere/metax/argwhere_metax.h @@ -0,0 +1,8 @@ +#ifndef ARGWHERE_METAX_H +#define ARGWHERE_METAX_H + +#include "../argwhere.h" + +DESCRIPTOR(metax) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/metax/argwhere_metax.maca b/src/infiniop/ops/argwhere/metax/argwhere_metax.maca new file mode 100644 index 000000000..7f3c5fb4a --- /dev/null +++ b/src/infiniop/ops/argwhere/metax/argwhere_metax.maca @@ -0,0 +1,94 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../cuda/kernel.cuh" +#include "argwhere_metax.h" +#include "infinicore.h" +#include +#include + +infiniStatus_t launchKernel(const void *data, int64_t *results, size_t N, + size_t M, const size_t *shapes, + const ptrdiff_t *strides, size_t ndim, + infiniDtype_t dtype, size_t *count) { + + if (dtype == INFINI_DTYPE_F32) { + parallel_block_argwhere_kernel<<<1, M / 2, M * sizeof(size_t)>>>( + (float *)data, results, N, shapes, strides, ndim, count); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +namespace op::argwhere::metax { +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc) { + auto info = ArgwhereInfo::create(x_desc); + CHECK_RESULT(info); + size_t workspace_size = x_desc->ndim() * sizeof(size_t) * 2 + x_desc->ndim() * sizeof(int64_t) * x_desc->numel() + sizeof(size_t); + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +int nextPowerOfTwo(int x) { + int power = 1; + while (power < x) { + power *= 2; + } + return power; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void **y, size_t *count, const void *x, + void *stream) const { + hcStream_t hc_stream = static_cast<hcStream_t>(stream); + size_t ndim = _info.strides.size(); + ptrdiff_t *strides = static_cast<ptrdiff_t *>(workspace); + size_t *shapes = reinterpret_cast<size_t *>(strides + ndim); + int64_t *result = reinterpret_cast<int64_t *>(shapes + ndim); + size_t *count_cuda = reinterpret_cast<size_t *>(result + _info.num_elements * ndim); + + hcMemcpyAsync(shapes, _info.shapes.data(), + _info.shapes.size() * sizeof(size_t), hcMemcpyHostToDevice, + hc_stream); + hcMemcpyAsync(strides, _info.strides.data(), + _info.strides.size() * sizeof(ptrdiff_t), hcMemcpyHostToDevice, + hc_stream); + // hcStreamSynchronize(hc_stream); + size_t M = nextPowerOfTwo(_info.num_elements); + CHECK_STATUS(launchKernel(x, result, _info.num_elements, M, shapes, strides, + ndim, INFINI_DTYPE_F32, count_cuda)); + // read the match count (count_cuda) back from device memory on the same stream + hcMemcpyAsync(count, count_cuda, sizeof(size_t), hcMemcpyDeviceToHost, hc_stream); + hcStreamSynchronize(hc_stream); + + // copy the index results back into a host-allocated buffer + *y = new int64_t[(*count) * ndim]; + // cudaStreamSynchronize(cuda_stream); + + // spdlog::debug("count_cuda:{}", *count_cuda); + // spdlog::debug("N:{}", N); + + hcMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim, + hcMemcpyDeviceToHost, hc_stream); + + // hcStreamSynchronize(hc_stream); + // for (size_t i = 0; i < (*count) * ndim; i++) { + // spdlog::debug("(*y)[{}]:{}", i, static_cast(*y)[i]); + // } + // hcFreeAsync(result, hc_stream); + // hcFreeAsync(count_cuda, 
hc_stream); + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::argwhere::metax \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/moore/argwhere_kernel.h b/src/infiniop/ops/argwhere/moore/argwhere_kernel.h new file mode 100644 index 000000000..dd67c877c --- /dev/null +++ b/src/infiniop/ops/argwhere/moore/argwhere_kernel.h @@ -0,0 +1,91 @@ +#ifndef ARGWHERE_MOORE_KERNEL_H +#define ARGWHERE_MOORE_KERNEL_H + +__device__ void Index2Pos(size_t index, size_t ndim, const size_t *shapes, + size_t *pos) { + for (int i = ndim - 1; i >= 0; i--) { + pos[i] = index % shapes[i]; + index /= shapes[i]; + } +} +__device__ size_t pos2dest(size_t *pos, size_t ndim, const size_t *shapes, + const ptrdiff_t *strides) { + size_t dest = 0; + for (size_t i = 0; i < ndim; i++) { + dest += pos[i] * strides[i]; + } + return dest; +} +template +__global__ void parallel_block_argwhere_kernel(T *data, int64_t *results, + size_t N, const size_t *shapes, + const ptrdiff_t *strides, + size_t ndim, size_t *count) { + extern __shared__ size_t tmp[]; + size_t pos1[5], pos2[5]; // 两个数的在tensor中的索引 + bool is_zero1 = false, is_zero2 = false; + int tid = threadIdx.x; + int leaf_num = blockDim.x * 2; + + if (tid * 2 < N) { + Index2Pos(tid * 2, ndim, shapes, pos1); + is_zero1 = fabs(data[pos2dest(pos1, ndim, shapes, strides)]) <= 1e-5; + tmp[tid * 2] = !is_zero1; + } else { + tmp[tid * 2] = 0; + } + + if (tid * 2 + 1 < N) { + Index2Pos(tid * 2 + 1, ndim, shapes, pos2); + is_zero2 = fabs(data[pos2dest(pos2, ndim, shapes, strides)]) <= 1e-5; + tmp[tid * 2 + 1] = !is_zero2; + } else { + tmp[tid * 2 + 1] = 0; + } + + __syncthreads(); + + int offset = 1; + for (int d = leaf_num >> 1; d > 0; d >>= 1) { + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + tmp[bi] += tmp[ai]; + } + offset *= 2; + __syncthreads(); + } + + if (tid == 0) { + tmp[leaf_num - 1] = 0; + } + __syncthreads(); + + for (int d = 1; d < leaf_num; d *= 2) { + offset >>= 1; + if (tid < d) { + int ai = offset * (2 * tid + 1) - 1; + int bi = offset * (2 * tid + 2) - 1; + + int v = tmp[ai]; + tmp[ai] = tmp[bi]; + tmp[bi] += v; + } + __syncthreads(); + } + if (!is_zero1 && tid * 2 < N) { + for (int i = 0; i < ndim; i++) { + results[tmp[2 * tid] * ndim + i] = pos1[i]; + } + } + if (!is_zero2 && tid * 2 + 1 < N) { + for (int i = 0; i < ndim; i++) { + results[tmp[2 * tid + 1] * ndim + i] = pos2[i]; + } + } + if (tid == blockDim.x - 1) { + *count = tmp[N - 1] + (N == blockDim.x * 2 ? 
1 : tmp[N] != tmp[N - 1]); + } +} + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/moore/argwhere_moore.h b/src/infiniop/ops/argwhere/moore/argwhere_moore.h new file mode 100644 index 000000000..63149291a --- /dev/null +++ b/src/infiniop/ops/argwhere/moore/argwhere_moore.h @@ -0,0 +1,8 @@ +#ifndef ARGWHERE_MOORE_H +#define ARGWHERE_MOORE_H + +#include "../argwhere.h" + +DESCRIPTOR(moore) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/moore/argwhere_moore.mu b/src/infiniop/ops/argwhere/moore/argwhere_moore.mu new file mode 100644 index 000000000..4d5007731 --- /dev/null +++ b/src/infiniop/ops/argwhere/moore/argwhere_moore.mu @@ -0,0 +1,107 @@ +#include "../../../devices/moore/moore_common.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "argwhere_kernel.h" +#include "argwhere_moore.h" +#include "infinicore.h" +#include + +// template +// INFINIOP_MOORE_KERNEL parallel_block_argwhere(T *data, int64_t *results, size_t N, +// size_t M, const size_t *shapes, +// const ptrdiff_t *strides, size_t ndim, +// size_t *count) { +// parallel_block_argwhere_kernel<<<1, M / 2, M>>>( +// data, results, N, shapes, strides, ndim, count); +// } + + +infiniStatus_t launchKernel(const void *data, int64_t *results, size_t N, + size_t M, const size_t *shapes, + const ptrdiff_t *strides, size_t ndim, + infiniDtype_t dtype, size_t *count) { + + if (dtype == INFINI_DTYPE_F32) { + parallel_block_argwhere_kernel<<<1, M / 2, M*sizeof(size_t)>>>( + (float *)data, results, N, shapes, strides, ndim, count); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +namespace op::argwhere::moore { +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc) { + auto info = ArgwhereInfo::create(x_desc); + CHECK_RESULT(info); + size_t workspace_size = x_desc->ndim() * sizeof(size_t) * 2 + + x_desc->ndim() * sizeof(int64_t) * x_desc->numel() + + sizeof(size_t); + *desc_ptr = new Descriptor( + info.take(), workspace_size, + new Opaque{ + reinterpret_cast(handle)->internal()}, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +int nextPowerOfTwo(int x) { + int power = 1; + while (power < x) { + power *= 2; + } + return power; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void **y, size_t *count, const void *x, + void *stream) const { + musaStream_t moore_stream = static_cast(stream); + size_t ndim = _info.strides.size(); + ptrdiff_t *strides = static_cast(workspace); + size_t *shapes = reinterpret_cast(strides + ndim); + int64_t *result = reinterpret_cast(shapes + ndim); + size_t *count_cuda = + reinterpret_cast(result + _info.num_elements * ndim); + + musaMemcpyAsync(shapes, _info.shapes.data(), + _info.shapes.size() * sizeof(size_t), musaMemcpyHostToDevice, + moore_stream); + musaMemcpyAsync(strides, _info.strides.data(), + _info.strides.size() * sizeof(ptrdiff_t), + musaMemcpyHostToDevice, moore_stream); + // musaStreamSynchronize(moore_stream); + size_t M = nextPowerOfTwo(_info.num_elements); +// musaStreamSynchronize(moore_stream); + CHECK_STATUS(launchKernel(x, result, _info.num_elements, M, shapes, strides, + ndim, INFINI_DTYPE_F32, count_cuda)); + // 从设备内存中读取 count_cuda 的值 + musaMemcpyAsync(count, count_cuda, sizeof(size_t), musaMemcpyDeviceToHost, + moore_stream); + 
musaStreamSynchronize(moore_stream); + + //写回结果 + *y = new int64_t[(*count) * ndim]; + // cudaStreamSynchronize(cuda_stream); + + + musaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim, + musaMemcpyDeviceToHost, moore_stream); + + // cudaStreamSynchronize(cuda_stream); + // for (size_t i = 0; i < (*count) * ndim; i++) { + // spdlog::debug("(*y)[{}]:{}", i, static_cast(*y)[i]); + // } + // cudaFreeAsync(result, cuda_stream); + // cudaFreeAsync(count_cuda, cuda_stream); + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::argwhere::moore \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cu b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cu new file mode 100644 index 000000000..20e46f7bc --- /dev/null +++ b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cu @@ -0,0 +1,107 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "argwhere_nvidia.cuh" +#include "infinicore.h" +#include +#include + +int nextPowerOfTwo(int x) { + int power = 1; + while (power < x) { + power *= 2; + } + return power; +} + +infiniStatus_t launchKernel(const void *data, int64_t *tmp, int64_t *results, + int64_t *block_sum, size_t N, const size_t *shapes, + const ptrdiff_t *strides, size_t ndim, + infiniDtype_t dtype, size_t *count, + int maxThreadsPerBlock, cudaStream_t stream) { + + if (dtype == INFINI_DTYPE_F32) { + int block_size = maxThreadsPerBlock * 2; + int num_blocks = (N + block_size - 1) / block_size; + parallel_large_argwhere_kernel + <<>>( + static_cast(data), block_sum, tmp, N, shapes, + strides, ndim); + if (num_blocks > 1) { + // 计算前缀和 + parallel_block_scan_kernel + <<<1, maxThreadsPerBlock, sizeof(int64_t) * block_size>>>( + num_blocks + 1, block_sum); + // 重新整理结果 + add_block_offset_kernel<<>>( + results, tmp, block_sum, ndim); + cudaMemcpyAsync(count, &block_sum[num_blocks], sizeof(size_t), + cudaMemcpyDeviceToHost, stream); + } else { + cudaMemcpyAsync(count, &block_sum[0], sizeof(size_t), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(results, tmp, sizeof(int64_t) * (*count) * ndim, + cudaMemcpyDeviceToDevice, stream); + } + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +namespace op::argwhere::nvidia { +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc) { + auto info = ArgwhereInfo::create(x_desc); + CHECK_RESULT(info); + Opaque *opaque = new Opaque{ + reinterpret_cast(handle)->internal()}; + size_t workspace_size = x_desc->ndim() * sizeof(size_t) * 2 + x_desc->ndim() * sizeof(int64_t) * x_desc->numel() * 2 + sizeof(int64_t) * opaque->internal->maxThreadsPerBlock(); + *desc_ptr = new Descriptor(info.take(), workspace_size, opaque, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void **y, size_t *count, const void *x, + void *stream) const { + cudaStream_t cuda_stream = static_cast(stream); + size_t ndim = _info.strides.size(); + ptrdiff_t *strides = static_cast(workspace); + size_t *shapes = reinterpret_cast(strides + ndim); + int64_t *tmp = reinterpret_cast(shapes + ndim); + int64_t *result = reinterpret_cast(tmp + _info.num_elements * ndim); + int64_t *block_sum = reinterpret_cast(result + 
_info.num_elements * ndim); + + cudaMemcpyAsync(shapes, _info.shapes.data(), + _info.shapes.size() * sizeof(size_t), cudaMemcpyHostToDevice, + cuda_stream); + cudaMemcpyAsync(strides, _info.strides.data(), + _info.strides.size() * sizeof(ptrdiff_t), + cudaMemcpyHostToDevice, cuda_stream); + + CHECK_STATUS(launchKernel(x, tmp, result, block_sum, _info.num_elements, + shapes, strides, ndim, INFINI_DTYPE_F32, count, + _opaque->internal->maxThreadsPerBlock(), + cuda_stream)); + // 从设备内存中读取 count_cuda 的值 + + cudaStreamSynchronize(cuda_stream); + + // 写回结果 + *y = new int64_t[(*count) * ndim]; + + cudaMemcpyAsync(*y, result, sizeof(int64_t) * (*count) * ndim, + cudaMemcpyDeviceToHost, cuda_stream); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::argwhere::nvidia diff --git a/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh new file mode 100644 index 000000000..fdc1dd8a5 --- /dev/null +++ b/src/infiniop/ops/argwhere/nvidia/argwhere_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef ARGWHERE_NVIDIA_H +#define ARGWHERE_NVIDIA_H + +#include "../argwhere.h" + +DESCRIPTOR(nvidia) + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/argwhere/operator.cc b/src/infiniop/ops/argwhere/operator.cc new file mode 100644 index 000000000..d6c5dfa7f --- /dev/null +++ b/src/infiniop/ops/argwhere/operator.cc @@ -0,0 +1,200 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/argwhere.h" + +#ifdef ENABLE_CPU_API +#include "cpu/argwhere_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/argwhere_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/argwhere_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/argwhere_kunlun.cuh" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "cambricon/argwhere_cambricon.cuh" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/argwhere_moore.h" +#endif + +__C infiniStatus_t infiniopCreateArgwhereDescriptor( + infiniopHandle_t handle, + infiniopArgwhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::argwhere::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + input_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetArgwhereWorkspaceSize( + infiniopArgwhereDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif 
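+    // The workspace whose size is reported here is what the GPU backends use in calculate():
+    // it stages the shape/stride arrays, the device-side index buffers, and the match counter
+    // (see the nvidia, metax and moore implementations above).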
+#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopArgwhere( + infiniopArgwhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void **output, + size_t *count, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, count, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyArgwhereDescriptor(infiniopArgwhereDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.cc b/src/infiniop/ops/asin/cpu/asin_cpu.cc new file mode 100644 index 000000000..c95eb3c24 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.cc @@ -0,0 +1,52 @@ +#include "asin_cpu.h" + +namespace op::asin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + 
void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::cpu \ No newline at end of file diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.h b/src/infiniop/ops/asin/cpu/asin_cpu.h new file mode 100644 index 000000000..a07404598 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.h @@ -0,0 +1,21 @@ +#ifndef __ASIN_CPU_H__ +#define __ASIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(asin, cpu) + +namespace op::asin::cpu { +typedef struct AsinOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::asin(input); + } +} AsinOp; +} // namespace op::asin::cpu + +#endif // __ASIN_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/asin/cuda/kernel.cuh b/src/infiniop/ops/asin/cuda/kernel.cuh new file mode 100644 index 000000000..2d4160ca6 --- /dev/null +++ b/src/infiniop/ops/asin/cuda/kernel.cuh @@ -0,0 +1,42 @@ +#ifndef __ASIN_CUDA_H__ +#define __ASIN_CUDA_H__ + +namespace op::asin::cuda { +typedef struct AsinOp { + static constexpr size_t num_inputs = 1; + + __device__ __forceinline__ float asin_f32_func(float x) const { + return asinf(x); + } + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(asin_f32_func(vf.x), asin_f32_func(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = asin_f32_func(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + float r0 = asin_f32_func(f0); + float r1 = asin_f32_func(f1); + return __floats2bfloat162_rn(r0, r1); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + float rf = asin_f32_func(xf); + return __float2bfloat16_rn(rf); + } else if constexpr (std::is_same_v) { + return asin_f32_func(input); + } else if constexpr (std::is_same_v) { + return std::asin(input); + } else { + return std::asin(input); + } + } +} AsinOp; +} // namespace op::asin::cuda + +#endif // __ASIN_CUDA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/asin/metax/asin_metax.h b/src/infiniop/ops/asin/metax/asin_metax.h new file mode 100644 index 000000000..e1805508d --- /dev/null +++ b/src/infiniop/ops/asin/metax/asin_metax.h @@ -0,0 +1,8 @@ +#ifndef __ASIN_METAX_API_H__ +#define __ASIN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(asin, metax) + +#endif // __ASIN_METAX_API_H__ diff --git a/src/infiniop/ops/asin/metax/asin_metax.maca b/src/infiniop/ops/asin/metax/asin_metax.maca new file mode 100644 index 000000000..89567ac01 --- /dev/null +++ b/src/infiniop/ops/asin/metax/asin_metax.maca @@ -0,0 +1,61 @@ +#include "../../../elementwise/metax/elementwise_metax.h" +#include "asin_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::asin::metax { + +Descriptor::~Descriptor() 
= default; + +infiniStatus_t +Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinOp, half>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::AsinOp, cuda_bfloat16>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinOp, float>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::AsinOp, double>( + _info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::metax \ No newline at end of file diff --git a/src/infiniop/ops/asin/moore/asin_moore.h b/src/infiniop/ops/asin/moore/asin_moore.h new file mode 100644 index 000000000..49394ea9d --- /dev/null +++ b/src/infiniop/ops/asin/moore/asin_moore.h @@ -0,0 +1,8 @@ +#ifndef __ASIN_MOORE_API_H__ +#define __ASIN_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(asin, moore) + +#endif // __ASIN_MOORE_API_H__ diff --git a/src/infiniop/ops/asin/moore/asin_moore.mu b/src/infiniop/ops/asin/moore/asin_moore.mu new file mode 100644 index 000000000..b5bbd0eb9 --- /dev/null +++ b/src/infiniop/ops/asin/moore/asin_moore.mu @@ -0,0 +1,60 @@ +#include "asin_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "asin_moore_kernel.h" + +namespace op::asin::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::AsinOp, cuda_bfloat16>(_info, workspace, 
output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::AsinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::AsinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::AsinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::moore diff --git a/src/infiniop/ops/asin/moore/asin_moore_kernel.h b/src/infiniop/ops/asin/moore/asin_moore_kernel.h new file mode 100644 index 000000000..0401417f2 --- /dev/null +++ b/src/infiniop/ops/asin/moore/asin_moore_kernel.h @@ -0,0 +1,42 @@ +#ifndef __ASIN_MOORE_KERNEL_H__ +#define __ASIN_MOORE_KERNEL_H__ + +namespace op::asin::moore { +typedef struct AsinOp { + static constexpr size_t num_inputs = 1; + + __device__ __forceinline__ float asin_f32_func(float x) const { + return asinf(x); + } + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(asin_f32_func(vf.x), asin_f32_func(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = asin_f32_func(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + float r0 = asin_f32_func(f0); + float r1 = asin_f32_func(f1); + return __floats2bfloat162_rn(r0, r1); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + float rf = asin_f32_func(xf); + return __float2bfloat16_rn(rf); + } else if constexpr (std::is_same_v) { + return asin_f32_func(input); + } else if constexpr (std::is_same_v) { + return std::asin(input); + } else { + return std::asin(input); + } + } +} AsinOp; +} // namespace op::asin::moore + +#endif // __ASIN_MOORE_KERNEL_H__ diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu new file mode 100644 index 000000000..206c277b1 --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "asin_nvidia.cuh" + +namespace op::asin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t +Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinOp, half>( + _info, 
workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::AsinOp, cuda_bfloat16>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinOp, float>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::AsinOp, double>( + _info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh new file mode 100644 index 000000000..681879341 --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ASIN_CUDA_API_H__ +#define __ASIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(asin, nvidia) + +#endif // __ASIN_CUDA_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc new file mode 100644 index 000000000..41d2e7897 --- /dev/null +++ b/src/infiniop/ops/asin/operator.cc @@ -0,0 +1,157 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/asin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/asin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/asin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/asin_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/asin_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAsinDescriptor( + infiniopHandle_t handle, + infiniopAsinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::asin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAsin( + infiniopAsinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, 
workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/fmin/cpu/fmin_cpu.cc b/src/infiniop/ops/fmin/cpu/fmin_cpu.cc new file mode 100644 index 000000000..b9afb2eb4 --- /dev/null +++ b/src/infiniop/ops/fmin/cpu/fmin_cpu.cc @@ -0,0 +1,58 @@ +#include "fmin_cpu.h" + +namespace op::fmin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::fmin::cpu diff --git a/src/infiniop/ops/fmin/cpu/fmin_cpu.h b/src/infiniop/ops/fmin/cpu/fmin_cpu.h new file mode 100644 index 000000000..f0010feea --- /dev/null +++ b/src/infiniop/ops/fmin/cpu/fmin_cpu.h @@ -0,0 +1,26 @@ +#ifndef __FMIN_CPU_H__ +#define __FMIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(fmin, cpu) + +namespace op::fmin::cpu { 
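(For reference: FminOp below, like AsinOp and the other functors in this patch, follows the elementwise-operator convention: a num_inputs constant plus a templated operator() that produces one output element from one element of each input. The loop below is only an illustration of how such a functor is consumed; it stands in for, and is not, the actual machinery behind ELEMENTWISE_DESCRIPTOR and the elementwise framework.)

    // Illustrative only: applying a binary elementwise functor such as FminOp.
    template <typename T, typename Op>
    void apply_elementwise(const T *a, const T *b, T *c, size_t n, Op op) {
        for (size_t i = 0; i < n; ++i) {
            c[i] = op(a[i], b[i]); // for FminOp: the elementwise minimum of a and b
        }
    }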
+typedef struct FminOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v || std::is_same_v) { + float a_f = utils::cast(a); + float b_f = utils::cast(b); + float result = std::fminf(a_f, b_f); + return utils::cast(result); + } else { + return std::fmin(a, b); + } + } +} FminOp; +} // namespace op::fmin::cpu + +#endif // __FMIN_CPU_H__ diff --git a/src/infiniop/ops/fmin/cuda/kernel.cuh b/src/infiniop/ops/fmin/cuda/kernel.cuh new file mode 100644 index 000000000..fbe385302 --- /dev/null +++ b/src/infiniop/ops/fmin/cuda/kernel.cuh @@ -0,0 +1,33 @@ +#ifndef __FMIN_CUDA_H__ +#define __FMIN_CUDA_H__ + +#include +namespace op::fmin::cuda { +typedef struct FminOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { +#if defined(ENABLE_ILUVATAR_API) + if constexpr (std::is_same_v) { + float a_f = __bfloat162float(a), b_f = __bfloat162float(b); + return __float2bfloat16(fminf(a_f, b_f)); + } else if constexpr (std::is_same_v) { + float a_f = __half2float(a), b_f = __half2float(b); + return __float2half(fminf(a_f, b_f)); + } +#elif defined(ENABLE_NVIDIA_API) + if constexpr (std::is_same_v || std::is_same_v) { + return __hmin(a, b); + } +#endif + if constexpr (std::is_same_v) { + return fminf(a, b); + } else { + return a < b ? a : b; + } + } +} FminOp; +} // namespace op::fmin::cuda + +#endif // __ADD_CUDA_H__ diff --git a/src/infiniop/ops/fmin/metax/fmin_metax.h b/src/infiniop/ops/fmin/metax/fmin_metax.h new file mode 100644 index 000000000..9348fa599 --- /dev/null +++ b/src/infiniop/ops/fmin/metax/fmin_metax.h @@ -0,0 +1,8 @@ +#ifndef __FMIN_METAX_API_H__ +#define __FMIN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(fmin, metax) + +#endif // __FMIN_METAX_API_H__ diff --git a/src/infiniop/ops/fmin/metax/fmin_metax.maca b/src/infiniop/ops/fmin/metax/fmin_metax.maca new file mode 100644 index 000000000..049b3806c --- /dev/null +++ b/src/infiniop/ops/fmin/metax/fmin_metax.maca @@ -0,0 +1,69 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "fmin_metax.h" + +namespace op::fmin::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t +Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::FminOp, half>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + 
return _device_info->calculate<256, cuda::FminOp, cuda_bfloat16>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::FminOp, float>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::FminOp, int32_t>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::FminOp, int64_t>( + _info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::FminOp, double>( + _info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::fmin::metax diff --git a/src/infiniop/ops/fmin/moore/fmin_moore.h b/src/infiniop/ops/fmin/moore/fmin_moore.h new file mode 100644 index 000000000..f73d99e7d --- /dev/null +++ b/src/infiniop/ops/fmin/moore/fmin_moore.h @@ -0,0 +1,8 @@ +#ifndef __FMIN_MOORE_API_H__ +#define __FMIN_MOORE_API_H__ + +#include "../../../elementwise/moore/elementwise_moore_api.h" + +ELEMENTWISE_DESCRIPTOR(fmin, moore) + +#endif // __FMIN_MOORE_API_H__ diff --git a/src/infiniop/ops/fmin/moore/fmin_moore.mu b/src/infiniop/ops/fmin/moore/fmin_moore.mu new file mode 100644 index 000000000..d19071e06 --- /dev/null +++ b/src/infiniop/ops/fmin/moore/fmin_moore.mu @@ -0,0 +1,66 @@ +#include "fmin_moore.h" + +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "fmin_moore_kernel.h" + +namespace op::fmin::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create MOORE elementwise descriptor + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::FminOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, moore::FminOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::FminOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::FminOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, moore::FminOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, moore::FminOp, int64_t>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace 
op::fmin::moore
diff --git a/src/infiniop/ops/fmin/moore/fmin_moore_kernel.h b/src/infiniop/ops/fmin/moore/fmin_moore_kernel.h
new file mode 100644
index 000000000..cde58f26a
--- /dev/null
+++ b/src/infiniop/ops/fmin/moore/fmin_moore_kernel.h
@@ -0,0 +1,27 @@
+#ifndef __FMIN_MOORE_KERNEL_H__
+#define __FMIN_MOORE_KERNEL_H__
+
+namespace op::fmin::moore {
+typedef struct FminOp {
+public:
+    static constexpr size_t num_inputs = 2;
+    template
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        if constexpr (std::is_same_v) {
+            return __hmin2(a, b);
+        } else if constexpr (std::is_same_v) {
+            return __hmin(a, b);
+        } else if constexpr (std::is_same_v) {
+            float a_f = __bfloat162float(a);
+            float b_f = __bfloat162float(b);
+            return fminf(a_f, b_f);
+        } else if constexpr (std::is_same_v) {
+            return fminf(a, b);
+        } else {
+            return a < b ? a : b;
+        }
+    }
+} FminOp;
+} // namespace op::fmin::moore
+
+#endif // __FMIN_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu
new file mode 100644
index 000000000..bfe77c7bc
--- /dev/null
+++ b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cu
@@ -0,0 +1,66 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "fmin_nvidia.cuh"
+
+namespace op::fmin::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t
+Descriptor::create(infiniopHandle_t handle_, Descriptor **desc_ptr,
+                   infiniopTensorDescriptor_t out_desc,
+                   std::vector input_desc_vec) {
+
+    auto handle = reinterpret_cast(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
+                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *output,
+                                     std::vector inputs,
+                                     void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::FminOp, half>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::FminOp, cuda_bfloat16>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::FminOp, float>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<256, cuda::FminOp, int32_t>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<256, cuda::FminOp, int64_t>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::FminOp, double>(
+            _info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::fmin::nvidia
diff --git a/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh
new file mode 100644
index 000000000..f9def467b
--- /dev/null
+++ b/src/infiniop/ops/fmin/nvidia/fmin_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __FMIN_CUDA_API_H__
+#define __FMIN_CUDA_API_H__
+
+#include 
"../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(fmin, nvidia) + +#endif // __FMIN_CUDA_API_H__ diff --git a/src/infiniop/ops/fmin/operator.cc b/src/infiniop/ops/fmin/operator.cc new file mode 100644 index 000000000..da17139b9 --- /dev/null +++ b/src/infiniop/ops/fmin/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/fmin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/fmin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/fmin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/fmin_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/fmin_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/fmin_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/fmin_moore.h" +#endif + +__C infiniStatus_t infiniopCreateFminDescriptor( + infiniopHandle_t handle, + infiniopFminDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::fmin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetFminWorkspaceSize(infiniopFminDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopFmin( + infiniopFminDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + 
CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyFminDescriptor(infiniopFminDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/paged_attention/operator.cc b/src/infiniop/ops/paged_attention/operator.cc index f41adb2cb..9f28984cd 100644 --- a/src/infiniop/ops/paged_attention/operator.cc +++ b/src/infiniop/ops/paged_attention/operator.cc @@ -5,9 +5,9 @@ #ifdef ENABLE_NVIDIA_API #include "nvidia/paged_attention_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/paged_attention_metax.h" -#endif +// #ifdef ENABLE_METAX_API +// #include "metax/paged_attention_metax.h" +// #endif __C infiniStatus_t infiniopCreatePagedAttentionDescriptor( infiniopHandle_t handle, @@ -34,9 +34,9 @@ __C infiniStatus_t infiniopCreatePagedAttentionDescriptor( #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// CREATE(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -54,9 +54,9 @@ __C infiniStatus_t infiniopGetPagedAttentionWorkspaceSize( #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// GET(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -78,9 +78,9 @@ __C infiniStatus_t infiniopPagedAttention( #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// CALCULATE(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -97,9 +97,9 @@ __C infiniStatus_t infiniopDestroyPagedAttentionDescriptor( #ifdef ENABLE_NVIDIA_API DESTROY(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - DESTROY(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// DESTROY(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/paged_caching/operator.cc b/src/infiniop/ops/paged_caching/operator.cc index a69b0e07e..3d3e08f33 100644 --- a/src/infiniop/ops/paged_caching/operator.cc +++ b/src/infiniop/ops/paged_caching/operator.cc 
@@ -5,9 +5,9 @@ #ifdef ENABLE_NVIDIA_API #include "nvidia/paged_caching_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/paged_caching_metax.h" -#endif +// #ifdef ENABLE_METAX_API +// #include "metax/paged_caching_metax.h" +// #endif __C infiniStatus_t infiniopCreatePagedCachingDescriptor( infiniopHandle_t handle, @@ -29,9 +29,9 @@ __C infiniStatus_t infiniopCreatePagedCachingDescriptor( #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// CREATE(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -49,9 +49,9 @@ __C infiniStatus_t infiniopGetPagedCachingWorkspaceSize( #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// GET(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -73,9 +73,9 @@ __C infiniStatus_t infiniopPagedCaching( #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// CALCULATE(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -92,9 +92,9 @@ __C infiniStatus_t infiniopDestroyPagedCachingDescriptor( #ifdef ENABLE_NVIDIA_API DESTROY(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - DESTROY(INFINI_DEVICE_METAX, metax) -#endif +// #ifdef ENABLE_METAX_API +// DESTROY(INFINI_DEVICE_METAX, metax) +// #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } diff --git a/test/infinicore/ops/adaptive_avg_pool3d.py b/test/infinicore/ops/adaptive_avg_pool3d.py index 5566f3a72..533c49d49 100644 --- a/test/infinicore/ops/adaptive_avg_pool3d.py +++ b/test/infinicore/ops/adaptive_avg_pool3d.py @@ -68,9 +68,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.nn.functional.adaptive_avg_pool3d(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.nn.functional.adaptive_avg_pool3d(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.nn.functional.adaptive_avg_pool3d(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/addr.py b/test/infinicore/ops/addr.py index e1f279349..9c28c36a7 100644 --- a/test/infinicore/ops/addr.py +++ b/test/infinicore/ops/addr.py @@ -93,9 +93,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.addr(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.addr(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation (operator not yet available).""" + return infinicore.addr(*args, **kwargs) def main(): diff --git a/test/infinicore/ops/argwhere.py b/test/infinicore/ops/argwhere.py index 83fa1e6a5..0a7d531a7 100644 --- a/test/infinicore/ops/argwhere.py +++ b/test/infinicore/ops/argwhere.py @@ -52,9 +52,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.argwhere(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return 
infinicore.argwhere(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.argwhere(*args, **kwargs)


 def main():
diff --git a/test/infinicore/ops/asin.py b/test/infinicore/ops/asin.py
index 168c1e209..373dfdfe9 100644
--- a/test/infinicore/ops/asin.py
+++ b/test/infinicore/ops/asin.py
@@ -97,9 +97,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.asin(*args, **kwargs)

-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.asin(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.asin(*args, **kwargs)


 def main():
diff --git a/test/infinicore/ops/fmin.py b/test/infinicore/ops/fmin.py
index 280d6bf4e..8ea7ec4dd 100644
--- a/test/infinicore/ops/fmin.py
+++ b/test/infinicore/ops/fmin.py
@@ -96,9 +96,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.fmin(*args, **kwargs)

-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.fmin(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore implementation."""
+        return infinicore.fmin(*args, **kwargs)


 def main():
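(For reference, closing out the fmin additions above: a minimal sketch of how the new C API is driven end to end. Only the four functions declared in this patch are used; run_fmin is a hypothetical helper, and the handle, tensor descriptors, device buffers, workspace, and stream are assumed to have been created through the library's existing APIs, which are outside this diff. infiniop.h is assumed to be on the include path.)

    #include <infiniop.h>

    // Sketch only: create -> query workspace -> compute -> destroy for fmin.
    infiniStatus_t run_fmin(infiniopHandle_t handle,
                            infiniopTensorDescriptor_t c_desc,
                            infiniopTensorDescriptor_t a_desc,
                            infiniopTensorDescriptor_t b_desc,
                            void *c, const void *a, const void *b,
                            void *workspace, size_t workspace_size,
                            void *stream) {
        infiniopFminDescriptor_t desc = nullptr;
        infiniStatus_t status = infiniopCreateFminDescriptor(handle, &desc, c_desc, a_desc, b_desc);
        if (status != INFINI_STATUS_SUCCESS) {
            return status;
        }

        size_t required = 0;
        status = infiniopGetFminWorkspaceSize(desc, &required);
        if (status == INFINI_STATUS_SUCCESS) {
            if (required > workspace_size) {
                status = INFINI_STATUS_INSUFFICIENT_WORKSPACE;
            } else {
                // c receives the elementwise minimum of a and b, computed on `stream`.
                status = infiniopFmin(desc, workspace, workspace_size, c, a, b, stream);
            }
        }

        infiniopDestroyFminDescriptor(desc);
        return status;
    }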