InfiniTensor
diff --git a/‎include/infiniop/ops/hardswish.h‎
Lines changed: 24 additions & 0 deletions b/‎include/infiniop/ops/hardswish.h‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎src/infiniop-test/src/ops/hardswish.cpp‎
Lines changed: 114 additions & 0 deletions b/‎src/infiniop-test/src/ops/hardswish.cpp‎
Lines changed: 114 additions & 0 deletions
diff --git a/‎src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc‎
Lines changed: 52 additions & 0 deletions b/‎src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎src/infiniop/ops/hardswish/cpu/hardswish_cpu.h‎
Lines changed: 30 additions & 0 deletions b/‎src/infiniop/ops/hardswish/cpu/hardswish_cpu.h‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎src/infiniop/ops/hardswish/cuda/kernel.cuh‎
Lines changed: 56 additions & 0 deletions b/‎src/infiniop/ops/hardswish/cuda/kernel.cuh‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎src/infiniop/ops/hardswish/metax/hardswish_metax.h‎
Lines changed: 8 additions & 0 deletions b/‎src/infiniop/ops/hardswish/metax/hardswish_metax.h‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/infiniop/ops/hardswish/metax/hardswish_metax.maca‎
Lines changed: 60 additions & 0 deletions b/‎src/infiniop/ops/hardswish/metax/hardswish_metax.maca‎
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_HARDSWISH_API_H__
+#define __INFINIOP_HARDSWISH_API_H__
+
+#include "../operator_descriptor.h"
+
+/** Handle type for Hardswish operator descriptors (a pointer to `struct InfiniopDescriptor`). */
+typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t;
+
+/**
+ * Creates a Hardswish operator descriptor.
+ *
+ * @param handle   Library handle the descriptor is created for.
+ * @param desc_ptr Out-parameter that receives the new descriptor.
+ * @param output   Tensor descriptor of the result.
+ * @param input    Tensor descriptor of the operand.
+ * @return Status code of the creation.
+ *
+ * NOTE(review): the CPU and MetaX backends in this change accept only
+ * F16/F32/F64/BF16 outputs and require `output` and `input` to have the same
+ * shape; the dispatch from this entry point to those backends is not part of
+ * this header, so confirm the same holds for other devices.
+ */
+__C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle,
+                                                        infiniopHardswishDescriptor_t *desc_ptr,
+                                                        infiniopTensorDescriptor_t output,
+                                                        infiniopTensorDescriptor_t input);
+
+/**
+ * Queries the workspace size for a Hardswish descriptor.
+ *
+ * @param desc Descriptor to query.
+ * @param size Out-parameter that receives the size; presumably in bytes, since
+ *             the test driver passes it straight to its allocator — confirm.
+ * @return Status code of the query.
+ */
+__C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size);
+
+/**
+ * Executes Hardswish for a previously created descriptor.
+ *
+ * @param desc           Descriptor describing the output/input tensors.
+ * @param workspace      Scratch buffer for the operator.
+ * @param workspace_size Size of `workspace`; the MetaX backend rejects a value
+ *                       smaller than the size it recorded at creation.
+ * @param output         Destination buffer.
+ * @param input          Source buffer (read-only).
+ * @param stream         Execution stream; the test driver passes nullptr.
+ * @return Status code of the execution.
+ */
+__C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc,
+                                        void *workspace,
+                                        size_t workspace_size,
+                                        void *output,
+                                        const void *input,
+                                        void *stream);
+
+/**
+ * Destroys a Hardswish descriptor.
+ *
+ * @param desc Descriptor to destroy; presumably one obtained from
+ *             infiniopCreateHardswishDescriptor — confirm against the implementation.
+ * @return Status code of the destruction.
+ */
+__C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc);
+
+#endif
@@ -0,0 +1,114 @@
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <iostream>
+
+namespace infiniop_test::hardswish {
+// Tensors that make up one Hardswish test case; filled in by Test::build().
+struct Test::Attributes {
+    // Operand; moved to the target device in run().
+    std::shared_ptr<Tensor> input;
+    // Result tensor; moved to the target device in run() and written by the operator.
+    std::shared_ptr<Tensor> output;
+    // Reference result that run() compares the output against with allClose().
+    std::shared_ptr<Tensor> ans;
+};
+
+// Builds a Hardswish test case from the supplied tensors.
+//
+// Requires the "input", "output" and "ans" tensors; throws std::runtime_error
+// ("Invalid Test") when any of them is missing. `attributes` is not read.
+//
+// NOTE(review): for BF16, F16 and F32 inputs the tolerances passed in `rtol` /
+// `atol` are replaced by fixed per-dtype values; other dtypes keep the
+// caller-supplied ones.
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
+    test->_attributes = new Attributes();
+
+    const auto input_it = tensors.find("input");
+    const auto output_it = tensors.find("output");
+    const auto ans_it = tensors.find("ans");
+    const bool has_all_tensors = input_it != tensors.end()
+                              && output_it != tensors.end()
+                              && ans_it != tensors.end();
+    if (!has_all_tensors) {
+        throw std::runtime_error("Invalid Test");
+    }
+
+    Attributes &attrs = *test->_attributes;
+    attrs.input = input_it->second;
+    attrs.output = output_it->second;
+    attrs.ans = ans_it->second;
+
+    // Pick the comparison tolerance from the element type of the input.
+    switch (attrs.input->ggml_type()) {
+    case GGML_TYPE_BF16:
+        test->_rtol = 1e-2;
+        test->_atol = 1e-2;
+        break;
+    case GGML_TYPE_F16:
+        test->_rtol = 1e-3;
+        test->_atol = 1e-3;
+        break;
+    case GGML_TYPE_F32:
+        test->_rtol = 1e-6;
+        test->_atol = 1e-6;
+        break;
+    default:
+        break;
+    }
+
+    return test;
+}
+
+// Runs the Hardswish operator once on the requested device, compares the
+// result with the reference tensor, then benchmarks repeated execution.
+//
+// handle     - operator library handle used to create the descriptor.
+// device     - device type the tensors are moved to.
+// device_id  - index of the device the tensors are moved to.
+// warm_ups   - forwarded to benchmark().
+// iterations - forwarded to benchmark().
+//
+// Returns TEST_FAILED(...) with the matching failure code when descriptor
+// creation, workspace setup, execution or the comparison fails; otherwise
+// TEST_PASSED with the time reported by benchmark().
+//
+// The operator descriptor and the workspace are released on every exit path.
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+    infiniopHardswishDescriptor_t op_desc = nullptr;
+    void *workspace = nullptr;
+
+    // Scope guard: the CHECK_OR macros return from the middle of the function,
+    // so cleanup lives in a destructor instead of being repeated per exit.
+    struct Cleanup {
+        infiniopHardswishDescriptor_t &desc;
+        void *&buffer;
+        ~Cleanup() {
+            if (buffer != nullptr) {
+                infinirtFree(buffer);
+            }
+            if (desc != nullptr) {
+                infiniopDestroyHardswishDescriptor(desc);
+            }
+        }
+    } cleanup{op_desc, workspace};
+
+    auto input = _attributes->input->to(device, device_id);
+    auto output = _attributes->output->to(device, device_id);
+    CHECK_OR(infiniopCreateHardswishDescriptor(handle, &op_desc,
+                                         output->desc(),
+                                         input->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
+    size_t workspace_size;
+    CHECK_OR(infiniopGetHardswishWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+    CHECK_OR(infiniopHardswish(op_desc, workspace, workspace_size,
+                         output->data(),
+                         input->data(),
+                         nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+
+    // allClose reports a mismatch by throwing; turn that into a test failure.
+    try {
+        allClose(output, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    // The lambda copies the handles by value, so it stays valid for the whole
+    // benchmark call; the guard above only fires after benchmark() returns.
+    const double elapsed_time = benchmark(
+        [=]() {
+            infiniopHardswish(
+                op_desc, workspace, workspace_size,
+                output->data(),
+                input->data(),
+                nullptr);
+        },
+        warm_ups, iterations);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+// Names of the non-tensor attributes of this test: none.
+std::vector<std::string> Test::attribute_names() {
+    return std::vector<std::string>{};
+}
+
+// Names of the tensors a test case must provide (the ones build() looks up).
+std::vector<std::string> Test::tensor_names() {
+    std::vector<std::string> names;
+    names.reserve(3);
+    names.emplace_back("input");
+    names.emplace_back("output");
+    names.emplace_back("ans");
+    return names;
+}
+
+// Names of the tensors the operator writes to.
+std::vector<std::string> Test::output_names() {
+    return std::vector<std::string>(1, "output");
+}
+
+// Formats the test case: operator name, input/output tensor info and the
+// tolerances (scientific notation, two digits), one item per line.
+std::string Test::toString() const {
+    std::ostringstream text;
+    text << op_name() << '\n'
+         << "- input: " << _attributes->input->info() << '\n'
+         << "- output: " << _attributes->output->info() << '\n';
+    text << std::scientific << std::setprecision(2)
+         << "- rtol=" << _rtol << ", atol=" << _atol << '\n';
+    return text.str();
+}
+
+// Releases the Attributes object that build() allocated with `new`.
+Test::~Test() {
+    delete _attributes;
+}
+}  // namespace infiniop_test::hardswish
@@ -0,0 +1,52 @@
+#include "hardswish_cpu.h"
+
+namespace op::hardswish::cpu {
+
+// Out-of-line defaulted destructor for the CPU Hardswish descriptor.
+Descriptor::~Descriptor() = default;
+
+// Validates the tensor descriptors and creates the CPU Hardswish descriptor.
+//
+// handle_        - library handle; reinterpreted as the CPU device handle.
+// desc_ptr       - out-parameter for the new descriptor (presumably assigned
+//                  inside CREATE_ELEMENTWISE_CPU_DESCRIPTOR — confirm in the macro).
+// out_desc       - descriptor of the result tensor; its dtype selects the kernel.
+// input_desc_vec - operand descriptors; only element 0 is read here
+//                  (`at(0)` throws std::out_of_range if the vector is empty).
+//
+// Returns INFINI_STATUS_BAD_TENSOR_DTYPE when the input dtype differs from the
+// output dtype, INFINI_STATUS_SUCCESS when every check passes; the CHECK_* /
+// CREATE_* macros may return other statuses on their own.
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    // calculate() reads the input buffer with the output's element type, so a
+    // different input dtype would be reinterpreted with the wrong element size.
+    if (input_desc->dtype() != dtype) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Runs Hardswish on the CPU for the dtype recorded in the descriptor.
+//
+// workspace / workspace_size - not used by the CPU path.
+// output - destination buffer.
+// inputs - source buffers, forwarded unchanged to the elementwise implementation.
+// stream - forwarded unchanged to the elementwise implementation.
+//
+// Returns the status of the elementwise calculation, or
+// INFINI_STATUS_BAD_TENSOR_DTYPE for a dtype outside F16/F32/F64/BF16.
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    // Every label returns, so no statement is needed after the switch.
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<HardswishOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<HardswishOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<HardswishOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<HardswishOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::hardswish::cpu
@@ -0,0 +1,30 @@
+#ifndef __HARDSWISH_CPU_H__
+#define __HARDSWISH_CPU_H__
+
+#include <algorithm>
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+// Presumably expands to the declaration of op::hardswish::cpu::Descriptor, whose
+// create()/calculate()/destructor are defined in hardswish_cpu.cc — the macro
+// comes from the elementwise header included above; confirm there.
+ELEMENTWISE_DESCRIPTOR(hardswish, cpu)
+
+namespace op::hardswish::cpu {
+// Elementwise Hardswish functor: f(x) = x * clamp(x + 3, 0, 6) / 6.
+struct HardswishOp {
+    // Operand count of the functor; operator() takes exactly one value.
+    static constexpr size_t num_inputs = 1;
+
+    // Applies Hardswish to one element.
+    // Non-integral T: evaluated in double, then converted back to T.
+    // Integral T: always yields 0.
+    template <typename T>
+    T operator()(const T &input) const {
+        if constexpr (!std::is_integral_v<T>) {
+            const double value = static_cast<double>(input);
+            // Gate term clamp(x + 3, 0, 6); divided by 6 before the multiply.
+            const double gate = std::clamp(value + 3.0, 0.0, 6.0);
+            return static_cast<T>(value * (gate / 6.0));
+        } else {
+            return static_cast<T>(0);
+        }
+    }
+};
+} // namespace op::hardswish::cpu
+
+#endif // __HARDSWISH_CPU_H__
@@ -0,0 +1,56 @@
+#ifndef __HARDSWISH_CUDA_H__
+#define __HARDSWISH_CUDA_H__
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cmath>
+
+namespace op::hardswish::cuda {
+
+// Device-side elementwise Hardswish functor: f(x) = x * clamp(x + 3, 0, 6) / 6.
+//
+// half / half2 / bfloat16 / bfloat162 values are widened to float, evaluated
+// with hswish_f32 and converted back with round-to-nearest; float uses
+// hswish_f32 directly; every other type is evaluated in double.
+typedef struct HardswishOp {
+  // Operand count of the functor; operator() takes exactly one value.
+  static constexpr size_t num_inputs = 1;
+
+  // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6
+  // The division by 6 is done as a multiplication by the constant 1/6.
+  __device__ __forceinline__ float hswish_f32(float x) const {
+    float y = x + 3.0f;
+    y = y < 0.0f ? 0.0f : (y > 6.0f ? 6.0f : y);
+    return x * (y * (1.0f / 6.0f));
+  }
+
+  // Applies Hardswish to one element (or one packed pair for half2 / bfloat162).
+  template <typename T>
+  __device__ __forceinline__ T operator()(const T &input) const {
+    if constexpr (std::is_same_v<T, half2>) {
+      float2 vf = __half22float2(input);
+      float2 vr = make_float2(
+        hswish_f32(vf.x),
+        hswish_f32(vf.y)
+      );
+      return __float22half2_rn(vr);
+    } else if constexpr (std::is_same_v<T, half>) {
+      float xf = __half2float(input);
+      float yf = hswish_f32(xf);
+      return __float2half_rn(yf);
+    } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+      float f0 = __bfloat162float(__low2bfloat16(input));
+      float f1 = __bfloat162float(__high2bfloat16(input));
+      return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1));
+    } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+      float xf = __bfloat162float(input);
+      // Round to nearest, like the half and packed bfloat162 paths above;
+      // round-toward-zero here would bias scalar results against packed ones.
+      return __float2bfloat16_rn(hswish_f32(xf));
+    } else if constexpr (std::is_same_v<T, float>) {
+      return hswish_f32(input);
+    } else {
+      // double and any remaining type: evaluate in double, convert back to T.
+      double xd = static_cast<double>(input);
+      double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0);
+      return static_cast<T>(yd);
+    }
+  }
+} HardswishOp;
+
+} // namespace op::hardswish::cuda
+
+#endif // __HARDSWISH_CUDA_H__
@@ -0,0 +1,8 @@
+#ifndef __HARDSWISH_METAX_API_H__
+#define __HARDSWISH_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+// Presumably expands to the declaration of op::hardswish::metax::Descriptor, whose
+// create()/calculate()/destructor are defined in hardswish_metax.maca — the macro
+// comes from the elementwise header included above; confirm there.
+ELEMENTWISE_DESCRIPTOR(hardswish, metax)
+
+#endif // __HARDSWISH_METAX_API_H__
@@ -0,0 +1,60 @@
+#include "hardswish_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::hardswish::metax {
+
+// Out-of-line defaulted destructor for the MetaX Hardswish descriptor.
+Descriptor::~Descriptor() = default;
+
+// Validates the tensor descriptors and creates the MetaX Hardswish descriptor.
+//
+// handle_        - library handle; reinterpreted as the MetaX device handle.
+// desc_ptr       - out-parameter for the new descriptor (presumably assigned
+//                  inside CREATE_ELEMENTWISE_METAX_DESCRIPTOR — confirm in the macro).
+// out_desc       - descriptor of the result tensor; its dtype selects the kernel.
+// input_desc_vec - operand descriptors; only element 0 is read here
+//                  (`at(0)` throws std::out_of_range if the vector is empty).
+//
+// Returns INFINI_STATUS_BAD_TENSOR_DTYPE when the input dtype differs from the
+// output dtype, INFINI_STATUS_SUCCESS when every check passes; the CHECK_* /
+// CREATE_* macros may return other statuses on their own.
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    // calculate() launches the kernel with a single element type taken from the
+    // output, so a different input dtype would be read with the wrong element size.
+    if (input_desc->dtype() != dtype) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create MetaX elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Launches Hardswish on the MetaX device for the dtype recorded in the descriptor.
+//
+// workspace      - scratch buffer forwarded to the elementwise implementation.
+// workspace_size - size of `workspace`; must be at least the size recorded in
+//                  the descriptor.
+// output - destination buffer.
+// inputs - source buffers, forwarded unchanged.
+// stream - forwarded unchanged to the elementwise implementation.
+//
+// Returns INFINI_STATUS_INSUFFICIENT_WORKSPACE when the workspace is too small,
+// INFINI_STATUS_BAD_TENSOR_DTYPE for a dtype outside F16/BF16/F32/F64, otherwise
+// the status of the elementwise calculation.
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // Every label returns, so no statement is needed after the switch.
+    // 256 is the first template argument of the elementwise launcher
+    // (presumably the thread-block size — confirm in elementwise_metax.h).
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::hardswish::metax