diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..df8ecc896 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -7,13 +7,22 @@ #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" +#include "infiniop/ops/crossentropyloss_backward.h" +#include "infiniop/ops/div.h" +#include "infiniop/ops/gelu.h" +#include "infiniop/ops/gelu_backward.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/logical_and.h" +#include "infiniop/ops/logical_equal.h" +#include "infiniop/ops/logical_or.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/random_sample.h" #include "infiniop/ops/rearrange.h" #include "infiniop/ops/relu.h" +#include "infiniop/ops/relu_backward.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" +#include "infiniop/ops/silu.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" #include "infiniop/tensor_descriptor.h" diff --git a/include/infiniop/ops/crossentropyloss_backward.h b/include/infiniop/ops/crossentropyloss_backward.h new file mode 100644 index 000000000..87969bc87 --- /dev/null +++ b/include/infiniop/ops/crossentropyloss_backward.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_CROSSENTROPYLOSS_BACKWARD_API_H__ +#define __INFINIOP_CROSSENTROPYLOSS_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCrossEntropyLossBackWardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCrossEntropyLossBackWardDescriptor(infiniopHandle_t handle, + infiniopCrossEntropyLossBackWardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_logits, + infiniopTensorDescriptor_t probs, + infiniopTensorDescriptor_t target); + +__C __export infiniStatus_t infiniopGetCrossEntropyLossBackWardWorkspaceSize(infiniopCrossEntropyLossBackWardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCrossEntropyLossBackWard(infiniopCrossEntropyLossBackWardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_logits, + const void *probs, + const void *target, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCrossEntropyLossBackWardDescriptor(infiniopCrossEntropyLossBackWardDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h new file mode 100644 index 000000000..235a34a15 --- /dev/null +++ b/include/infiniop/ops/div.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_DIV_API_H__ +#define __INFINIOP_DIV_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; + +__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *mode, + void *stream); + +__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/gelu.h b/include/infiniop/ops/gelu.h new file mode 100644 index 000000000..061bab14d --- /dev/null +++ b/include/infiniop/ops/gelu.h @@ -0,0 +1,22 @@ +#ifndef __INFINIOP_GELU_API_H__ +#define __INFINIOP_GELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopGeluDescriptor_t; + +__C 
__export infiniStatus_t infiniopCreateGeluDescriptor(infiniopHandle_t handle,
+                                                     infiniopGeluDescriptor_t *desc_ptr,
+                                                     infiniopTensorDescriptor_t output,
+                                                     infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGelu(infiniopGeluDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *output,
+                                         const void *input,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/gelu_backward.h b/include/infiniop/ops/gelu_backward.h
new file mode 100644
index 000000000..2cb7cbf46
--- /dev/null
+++ b/include/infiniop/ops/gelu_backward.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_GELU_BACKWARD_API_H__
+#define __INFINIOP_GELU_BACKWARD_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopGeluBackWardDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateGeluBackWardDescriptor(infiniopHandle_t handle,
+                                                                 infiniopGeluBackWardDescriptor_t *desc_ptr,
+                                                                 infiniopTensorDescriptor_t grad_input,
+                                                                 infiniopTensorDescriptor_t input,
+                                                                 infiniopTensorDescriptor_t grad_output);
+
+__C __export infiniStatus_t infiniopGeluBackWard(infiniopGeluBackWardDescriptor_t desc,
+                                                 void *workspace,
+                                                 size_t workspace_size,
+                                                 void *grad_input,
+                                                 const void *input,
+                                                 const void *grad_output,
+                                                 void *stream);
+
+__C __export infiniStatus_t infiniopDestroyGeluBackWardDescriptor(infiniopGeluBackWardDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/logical_and.h b/include/infiniop/ops/logical_and.h
new file mode 100644
index 000000000..37cbf04d4
--- /dev/null
+++ b/include/infiniop/ops/logical_and.h
@@ -0,0 +1,25 @@
+#ifndef __INFINIOP_AND_API_H__
+#define __INFINIOP_AND_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopLogicalAndDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateLogicalAndDescriptor(infiniopHandle_t handle,
+                                                               infiniopLogicalAndDescriptor_t *desc_ptr,
+                                                               infiniopTensorDescriptor_t c,
+                                                               infiniopTensorDescriptor_t a,
+                                                               infiniopTensorDescriptor_t b);
+
+__C __export infiniStatus_t infiniopGetLogicalAndWorkspaceSize(infiniopLogicalAndDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopLogicalAnd(infiniopLogicalAndDescriptor_t desc,
+                                               void *workspace,
+                                               size_t workspace_size,
+                                               void *c,
+                                               const void *a,
+                                               const void *b,
+                                               void *stream);
+
+__C __export infiniStatus_t infiniopDestroyLogicalAndDescriptor(infiniopLogicalAndDescriptor_t desc);
+#endif
diff --git a/include/infiniop/ops/logical_equal.h b/include/infiniop/ops/logical_equal.h
new file mode 100644
index 000000000..e952f1dbc
--- /dev/null
+++ b/include/infiniop/ops/logical_equal.h
@@ -0,0 +1,25 @@
+#ifndef __INFINIOP_EQUAL_API_H__
+#define __INFINIOP_EQUAL_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopLogicalEqualDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateLogicalEqualDescriptor(infiniopHandle_t handle,
+                                                                 infiniopLogicalEqualDescriptor_t *desc_ptr,
+                                                                 infiniopTensorDescriptor_t c,
+                                                                 infiniopTensorDescriptor_t a,
+                                                                 infiniopTensorDescriptor_t b);
+
+__C __export infiniStatus_t infiniopGetLogicalEqualWorkspaceSize(infiniopLogicalEqualDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopLogicalEqual(infiniopLogicalEqualDescriptor_t desc,
+                                                 void *workspace,
+                                                 size_t workspace_size,
+                                                 void *c,
+                                                 const void *a,
+                                                 const void *b,
+                                                 void *stream);
+
+__C __export infiniStatus_t infiniopDestroyLogicalEqualDescriptor(infiniopLogicalEqualDescriptor_t desc);
+#endif
diff --git a/include/infiniop/ops/logical_or.h b/include/infiniop/ops/logical_or.h
new file mode 100644
index 000000000..efe19a1c1
--- /dev/null
+++ b/include/infiniop/ops/logical_or.h
@@ -0,0 +1,25 @@
+#ifndef __INFINIOP_OR_API_H__
+#define __INFINIOP_OR_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopLogicalOrDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateLogicalOrDescriptor(infiniopHandle_t handle,
+                                                              infiniopLogicalOrDescriptor_t *desc_ptr,
+                                                              infiniopTensorDescriptor_t c,
+                                                              infiniopTensorDescriptor_t a,
+                                                              infiniopTensorDescriptor_t b);
+
+__C __export infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopLogicalOr(infiniopLogicalOrDescriptor_t desc,
+                                              void *workspace,
+                                              size_t workspace_size,
+                                              void *c,
+                                              const void *a,
+                                              const void *b,
+                                              void *stream);
+
+__C __export infiniStatus_t infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc);
+#endif
diff --git a/include/infiniop/ops/relu_backward.h b/include/infiniop/ops/relu_backward.h
new file mode 100644
index 000000000..3802da6b4
--- /dev/null
+++ b/include/infiniop/ops/relu_backward.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_RELU_BACKWARD_API_H__
+#define __INFINIOP_RELU_BACKWARD_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopReluBackWardDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateReluBackWardDescriptor(infiniopHandle_t handle,
+                                                                 infiniopReluBackWardDescriptor_t *desc_ptr,
+                                                                 infiniopTensorDescriptor_t grad_input,
+                                                                 infiniopTensorDescriptor_t input,
+                                                                 infiniopTensorDescriptor_t grad_output);
+
+__C __export infiniStatus_t infiniopGetReluBackWardWorkspaceSize(infiniopReluBackWardDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopReluBackWard(infiniopReluBackWardDescriptor_t desc,
+                                                 void *workspace,
+                                                 size_t workspace_size,
+                                                 void *grad_input,
+                                                 const void *input,
+                                                 const void *grad_output,
+                                                 void *stream);
+
+__C __export infiniStatus_t infiniopDestroyReluBackWardDescriptor(infiniopReluBackWardDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/silu.h b/include/infiniop/ops/silu.h
new file mode 100644
index 000000000..66a82ad0d
--- /dev/null
+++ b/include/infiniop/ops/silu.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_SILU_API_H__
+#define __INFINIOP_SILU_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopSiluDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateSiluDescriptor(infiniopHandle_t handle,
+                                                         infiniopSiluDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t y,
+                                                         infiniopTensorDescriptor_t x);
+
+__C __export infiniStatus_t infiniopGetSiluWorkspaceSize(infiniopSiluDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopSilu(infiniopSiluDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *y,
+                                         const void *x,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroySiluDescriptor(infiniopSiluDescriptor_t desc);
+
+#endif
diff --git a/scripts/python_test.py b/scripts/python_test.py
index eb2d4319e..827a660b9 100644
--- a/scripts/python_test.py
+++ b/scripts/python_test.py
@@ -12,18 +12,27 @@ def run_tests(args):
     failed = []
     for test in [
-        "add.py",
-        "attention.py",
-        "causal_softmax.py",
-        "clip.py",
-        "gemm.py",
-        "mul.py",
-        "random_sample.py",
-        "rearrange.py",
-        "rms_norm.py",
-        "rope.py",
-        "sub.py",
-        "swiglu.py",
+        # "add.py",
+        # "attention.py",
+        # "causal_softmax.py",
"clip.py", + # "gemm.py", + # "mul.py", + # "random_sample.py", + # "rearrange.py", + # "rms_norm.py", + # "rope.py", + # "sub.py", + # "swiglu.py", + "silu.py", + "div.py", + "logical_and.py", + "logical_or.py", + "logical_equal.py", + "relu_backward.py", + "gelu.py", + "gelu_backward.py", + "crossentropyloss_backward.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..11fdbb34d 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,15 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(silu) +DECLARE_INFINIOP_TEST(div) +DECLARE_INFINIOP_TEST(logical_and) +DECLARE_INFINIOP_TEST(logical_or) +DECLARE_INFINIOP_TEST(logical_equal) +DECLARE_INFINIOP_TEST(gelu) +DECLARE_INFINIOP_TEST(gelu_backward) +DECLARE_INFINIOP_TEST(relu_backward) +DECLARE_INFINIOP_TEST(crossentropyloss_backward) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -30,19 +39,28 @@ DECLARE_INFINIOP_TEST(sub) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(silu) \ + REGISTER_INFINIOP_TEST(div) \ + REGISTER_INFINIOP_TEST(logical_and) \ + REGISTER_INFINIOP_TEST(logical_or) \ + REGISTER_INFINIOP_TEST(logical_equal) \ + REGISTER_INFINIOP_TEST(gelu) \ + REGISTER_INFINIOP_TEST(gelu_backward) \ + REGISTER_INFINIOP_TEST(relu_backward) \ + REGISTER_INFINIOP_TEST(crossentropyloss_backward) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/crossentropyloss_backward.cpp b/src/infiniop-test/src/ops/crossentropyloss_backward.cpp new file mode 100644 index 000000000..fff99af2b --- /dev/null +++ b/src/infiniop-test/src/ops/crossentropyloss_backward.cpp @@ -0,0 +1,112 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::crossentropyloss_backward { +struct Test::Attributes { + std::shared_ptr probs; + std::shared_ptr target; + std::shared_ptr grad_logits; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("probs") == tensors.end() + || tensors.find("target") == tensors.end() + || tensors.find("grad_logits") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->probs = tensors["probs"]; + test->_attributes->target = tensors["target"]; + 
test->_attributes->grad_logits = tensors["grad_logits"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopCrossEntropyLossBackWardDescriptor_t op_desc; + auto probs = _attributes->probs->to(device, device_id); + auto target = _attributes->target->to(device, device_id); + auto grad_logits = _attributes->grad_logits->to(device, device_id); + CHECK_OR(infiniopCreateCrossEntropyLossBackWardDescriptor(handle, &op_desc, + grad_logits->desc(), + probs->desc(), + target->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create crossentropyloss_backward descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetCrossEntropyLossBackWardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopCrossEntropyLossBackWard(op_desc, workspace, workspace_size, + grad_logits->data(), + probs->data(), + target->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_logits, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopCrossEntropyLossBackWard( + op_desc, workspace, workspace_size, + grad_logits->data(), + probs->data(), + target->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyCrossEntropyLossBackWardDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"probs", "target", "grad_logits", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_logits"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- probs: " << _attributes->probs->info() << std::endl; + oss << "- target: " << _attributes->target->info() << std::endl; + oss << "- grad_logits: " << _attributes->grad_logits->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::crossentropyloss_backward \ No newline at end of file diff --git a/src/infiniop-test/src/ops/div.cpp b/src/infiniop-test/src/ops/div.cpp new file mode 100644 index 000000000..f026359b3 --- /dev/null +++ b/src/infiniop-test/src/ops/div.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::div { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = 
tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopDivDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateDivDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create div descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetDivWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopDiv(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + 0, + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopDiv( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + 0, + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyDivDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::div \ No newline at end of file diff --git a/src/infiniop-test/src/ops/gelu.cpp b/src/infiniop-test/src/ops/gelu.cpp new file mode 100644 index 000000000..583576e66 --- /dev/null +++ b/src/infiniop-test/src/ops/gelu.cpp @@ -0,0 +1,102 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::gelu { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopGeluDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, 
device_id); + CHECK_OR(infiniopCreateGeluDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create gelu descriptor.")); + + // Note: GeLU doesn't have GetWorkspaceSize function, so we set workspace_size to 0 + size_t workspace_size = 0; + void *workspace = nullptr; + + CHECK_OR(infiniopGelu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopGelu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyGeluDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::gelu \ No newline at end of file diff --git a/src/infiniop-test/src/ops/gelu_backward.cpp b/src/infiniop-test/src/ops/gelu_backward.cpp new file mode 100644 index 000000000..f81435d88 --- /dev/null +++ b/src/infiniop-test/src/ops/gelu_backward.cpp @@ -0,0 +1,110 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::gelu_backward { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr grad_output; + std::shared_ptr grad_input; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopGeluBackWardDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + CHECK_OR(infiniopCreateGeluBackWardDescriptor(handle, &op_desc, + grad_input->desc(), + input->desc(), + grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create gelu_backward descriptor.")); + + // Note: GeLU Backward doesn't have GetWorkspaceSize function, so we set workspace_size to 0 + size_t workspace_size = 0; + void *workspace = nullptr; + + 
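+    // grad_input = dGELU(input) * grad_output, computed elementwise; the result is validated against "ans" below.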
CHECK_OR(infiniopGeluBackWard(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_input, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopGeluBackWard( + op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyGeluBackWardDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_input"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::gelu_backward \ No newline at end of file diff --git a/src/infiniop-test/src/ops/logical_and.cpp b/src/infiniop-test/src/ops/logical_and.cpp new file mode 100644 index 000000000..2854d4c9f --- /dev/null +++ b/src/infiniop-test/src/ops/logical_and.cpp @@ -0,0 +1,112 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_and { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalAndDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalAndDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create logical_and descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalAndWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalAnd(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return 
TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allEqual(c, _attributes->ans); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLogicalAnd( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyLogicalAndDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_and \ No newline at end of file diff --git a/src/infiniop-test/src/ops/logical_equal.cpp b/src/infiniop-test/src/ops/logical_equal.cpp new file mode 100644 index 000000000..712fd7c09 --- /dev/null +++ b/src/infiniop-test/src/ops/logical_equal.cpp @@ -0,0 +1,112 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_equal { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalEqualDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalEqualDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create logical_equal descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalEqualWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalEqual(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allEqual(c, _attributes->ans); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( 
+ [=]() { + infiniopLogicalEqual( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyLogicalEqualDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_equal \ No newline at end of file diff --git a/src/infiniop-test/src/ops/logical_or.cpp b/src/infiniop-test/src/ops/logical_or.cpp new file mode 100644 index 000000000..e3d8a8160 --- /dev/null +++ b/src/infiniop-test/src/ops/logical_or.cpp @@ -0,0 +1,112 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_or { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalOrDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalOrDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create logical_or descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalOrWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalOr(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allEqual(c, _attributes->ans); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLogicalOr( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyLogicalOrDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + 
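+// logical_or declares no scalar attributes; the test is fully specified by tensors a, b, c, and ans.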
+std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_or \ No newline at end of file diff --git a/src/infiniop-test/src/ops/relu_backward.cpp b/src/infiniop-test/src/ops/relu_backward.cpp new file mode 100644 index 000000000..b3513ab3e --- /dev/null +++ b/src/infiniop-test/src/ops/relu_backward.cpp @@ -0,0 +1,112 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::relu_backward { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr grad_output; + std::shared_ptr grad_input; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopReluBackWardDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + CHECK_OR(infiniopCreateReluBackWardDescriptor(handle, &op_desc, + grad_input->desc(), + input->desc(), + grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create relu_backward descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetReluBackWardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopReluBackWard(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_input, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopReluBackWard( + op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyReluBackWardDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector 
Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_input"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::relu_backward \ No newline at end of file diff --git a/src/infiniop-test/src/ops/silu.cpp b/src/infiniop-test/src/ops/silu.cpp new file mode 100644 index 000000000..e641b52f3 --- /dev/null +++ b/src/infiniop-test/src/ops/silu.cpp @@ -0,0 +1,104 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::silu { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSiluDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateSiluDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create silu descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSiluWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSilu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopSilu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroySiluDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << 
"- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::silu \ No newline at end of file diff --git a/src/infiniop/ops/crossentropyloss_backward/cpu/crossentropyloss_backward_cpu.cc b/src/infiniop/ops/crossentropyloss_backward/cpu/crossentropyloss_backward_cpu.cc new file mode 100644 index 000000000..eaf4a1656 --- /dev/null +++ b/src/infiniop/ops/crossentropyloss_backward/cpu/crossentropyloss_backward_cpu.cc @@ -0,0 +1,59 @@ +#include "crossentropyloss_backward_cpu.h" +#include +#include +#include + +namespace op::CrossEntropyLossBackWard::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + auto shape = _info.getAllInputShapes(); + auto dim = _info.getNdim(); + const size_t N = std::accumulate(shape, shape + dim - 1, 1ull, std::multiplies()); + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream, N); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream, N); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream, N); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::CrossEntropyLossBackWard::cpu diff --git a/src/infiniop/ops/crossentropyloss_backward/cpu/crossentropyloss_backward_cpu.h b/src/infiniop/ops/crossentropyloss_backward/cpu/crossentropyloss_backward_cpu.h new file mode 100644 index 000000000..525e754c4 --- /dev/null +++ b/src/infiniop/ops/crossentropyloss_backward/cpu/crossentropyloss_backward_cpu.h @@ -0,0 +1,20 @@ +#ifndef __CrossEntropyLossBackWard_CPU_H__ +#define __CrossEntropyLossBackWard_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(CrossEntropyLossBackWard, cpu) + +namespace op::CrossEntropyLossBackWard::cpu { +typedef struct CrossEntropyLossBackWardOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &probs, const T &target, const size_t N) const { + // Cross Entropy Loss Backward: grad_logits = (probs - target) / batch_size + return (probs - target) / static_cast(N); + } +} CrossEntropyLossBackWardOp; +} // namespace op::CrossEntropyLossBackWard::cpu + +#endif // __CrossEntropyLossBackWard_CPU_H__ diff --git a/src/infiniop/ops/crossentropyloss_backward/cuda/kernel.cuh b/src/infiniop/ops/crossentropyloss_backward/cuda/kernel.cuh new file mode 100644 index 000000000..94b5ccec9 --- /dev/null +++ 
+++ b/src/infiniop/ops/crossentropyloss_backward/cuda/kernel.cuh
@@ -0,0 +1,25 @@
+#ifndef __CROSSENTROPYLOSSBACKWARD_CUDA_H__
+#define __CROSSENTROPYLOSSBACKWARD_CUDA_H__
+
+#include <type_traits>
+namespace op::CrossEntropyLossBackWard::cuda {
+typedef struct CrossEntropyLossBackWardOp {
+public:
+    static constexpr size_t num_inputs = 2;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &probs, const T &target, const float N) const {
+        // Cross Entropy Loss Backward: grad_logits = (probs - target) / batch_size
+        if constexpr (std::is_same_v<T, half>) {
+            return __hdiv(__hsub(probs, target), __float2half(N));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __hdiv(__hsub(probs, target), __float2bfloat16(N));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __fdiv_rn(__fsub_rn(probs, target), N);
+        } else {
+            return (probs - target) / N;
+        }
+    }
+} CrossEntropyLossBackWardOp;
+} // namespace op::CrossEntropyLossBackWard::cuda
+
+#endif // __CrossEntropyLossBackWard_CUDA_H__
diff --git a/src/infiniop/ops/crossentropyloss_backward/metax/crossentropyloss_backward_metax.h b/src/infiniop/ops/crossentropyloss_backward/metax/crossentropyloss_backward_metax.h
new file mode 100644
index 000000000..f3274a445
--- /dev/null
+++ b/src/infiniop/ops/crossentropyloss_backward/metax/crossentropyloss_backward_metax.h
@@ -0,0 +1,8 @@
+#ifndef __CROSSENTROPYLOSSBACKWARD_METAX_API_H__
+#define __CROSSENTROPYLOSSBACKWARD_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(CrossEntropyLossBackWard, metax)
+
+#endif // __CrossEntropyLossBackWard_METAX_API_H__
diff --git a/src/infiniop/ops/crossentropyloss_backward/metax/crossentropyloss_backward_metax.maca b/src/infiniop/ops/crossentropyloss_backward/metax/crossentropyloss_backward_metax.maca
new file mode 100644
index 000000000..0c96a031b
--- /dev/null
+++ b/src/infiniop/ops/crossentropyloss_backward/metax/crossentropyloss_backward_metax.maca
@@ -0,0 +1,64 @@
+#include "crossentropyloss_backward_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::CrossEntropyLossBackWard::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    // create Metax elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+    auto shape = _info.getAllInputShapes();
+    auto dim = _info.getNdim();
+    size_t N = std::accumulate(shape, shape + dim - 1, 1ull, std::multiplies<size_t>());
+    float fN = static_cast<float>(N);
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::CrossEntropyLossBackWardOp, half>(_info, workspace, output, inputs, stream, std::move(fN));
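+    // N is the number of logits rows (all dims except the last); it is forwarded as float fN so the kernel divides in floating point.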
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::CrossEntropyLossBackWardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, std::move(fN));
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::CrossEntropyLossBackWardOp, float>(_info, workspace, output, inputs, stream, std::move(fN));
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::CrossEntropyLossBackWard::metax
diff --git a/src/infiniop/ops/crossentropyloss_backward/nvidia/crossentropyloss_backward_nvidia.cu b/src/infiniop/ops/crossentropyloss_backward/nvidia/crossentropyloss_backward_nvidia.cu
new file mode 100644
index 000000000..40cca13a7
--- /dev/null
+++ b/src/infiniop/ops/crossentropyloss_backward/nvidia/crossentropyloss_backward_nvidia.cu
@@ -0,0 +1,64 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "crossentropyloss_backward_nvidia.cuh"
+
+namespace op::CrossEntropyLossBackWard::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    auto shape = _info.getAllInputShapes();
+    auto dim = _info.getNdim();
+    size_t N = std::accumulate(shape, shape + dim - 1, 1ull, std::multiplies<size_t>());
+    float fN = static_cast<float>(N);
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::CrossEntropyLossBackWardOp, half>(_info, workspace, output, inputs, stream, std::move(fN));
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::CrossEntropyLossBackWardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, std::move(fN));
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::CrossEntropyLossBackWardOp, float>(_info, workspace, output, inputs, stream, std::move(fN));
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::CrossEntropyLossBackWard::nvidia
diff --git a/src/infiniop/ops/crossentropyloss_backward/nvidia/crossentropyloss_backward_nvidia.cuh b/src/infiniop/ops/crossentropyloss_backward/nvidia/crossentropyloss_backward_nvidia.cuh
new file mode 100644
index 000000000..f61edccb1
--- /dev/null
+++ b/src/infiniop/ops/crossentropyloss_backward/nvidia/crossentropyloss_backward_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __CROSSENTROPYLOSSBACKWARD_CUDA_API_H__
+#define __CROSSENTROPYLOSSBACKWARD_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(CrossEntropyLossBackWard, nvidia)
+
+#endif // __CrossEntropyLossBackWard_CUDA_API_H__
diff --git
a/src/infiniop/ops/crossentropyloss_backward/operator.cc b/src/infiniop/ops/crossentropyloss_backward/operator.cc new file mode 100644 index 000000000..3ab974c7e --- /dev/null +++ b/src/infiniop/ops/crossentropyloss_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/crossentropyloss_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/crossentropyloss_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/crossentropyloss_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/crossentropyloss_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCrossEntropyLossBackWardDescriptor( + infiniopHandle_t handle, + infiniopCrossEntropyLossBackWardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_logits_desc, + infiniopTensorDescriptor_t probs_desc, + infiniopTensorDescriptor_t target_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::CrossEntropyLossBackWard::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_logits_desc, \ + {probs_desc, \ + target_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCrossEntropyLossBackWardWorkspaceSize(infiniopCrossEntropyLossBackWardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCrossEntropyLossBackWard( + infiniopCrossEntropyLossBackWardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_logits, + const void *probs, + const void *target, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_logits, {probs, target}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCrossEntropyLossBackWardDescriptor(infiniopCrossEntropyLossBackWardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef 
ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc new file mode 100644 index 000000000..8702fb581 --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -0,0 +1,68 @@ +#include "div_cpu.h" +#include "../div_mode.h" +#include "infinicore.h" +#include + +namespace op::div::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + int mode, + void *stream) const { + +#define CALCULATE(OP) \ + switch (_dtype) { \ + case INFINI_DTYPE_BF16: \ + return _device_info->calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ + return _device_info->calculate(_info, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (mode == MODE_DEFAULT) { + CALCULATE(DivOp) + } else if (mode == MODE_TRUNC) { + CALCULATE(DivOpTrunc) + } else if (mode == MODE_FLOOR) { + CALCULATE(DivOpFloor) + } + +#undef CALCULATE + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::cpu diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h new file mode 100644 index 000000000..5ee1d3d0c --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.h @@ -0,0 +1,113 @@ +#ifndef __DIV_CPU_H__ +#define __DIV_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +// ELEMENTWISE_DESCRIPTOR(div, cpu) + +namespace op::div::cpu { +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::cpu::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + int mode, + 
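+        // mode selects MODE_DEFAULT, MODE_TRUNC, or MODE_FLOOR (see ../div_mode.h)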
void *stream) const; +}; + +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return a / b; + } +} DivOp; +typedef struct DivOpTrunc { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + float af = _f16_to_f32(a); + float bf = _f16_to_f32(b); + float res = std::trunc(af / bf); + + return _f32_to_f16(float(res)); + } else if constexpr (std::is_same_v) { + float af = _bf16_to_f32(a); + float bf = _bf16_to_f32(b); + float res = std::trunc(af / bf); + return _f32_to_bf16(res); + } else if constexpr (std::is_same_v) { + float res = std::trunc(a / b); + return res; + } else if constexpr (std::is_same_v) { + double res = std::trunc(a / b); + return res; + } else { + return std::trunc(a / b); + } + } +} DivOpTrunc; +typedef struct DivOpFloor { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + float af = _f16_to_f32(a); + float bf = _f16_to_f32(b); + float res = std::floor(af / bf); + + return _f32_to_f16(float(res)); + } else if constexpr (std::is_same_v) { + float af = _bf16_to_f32(a); + float bf = _bf16_to_f32(b); + float res = std::floor(af / bf); + + return _f32_to_bf16(res); + } else if constexpr (std::is_same_v) { + float res = std::floor(a / b); + return res; + } else if constexpr (std::is_same_v) { + double res = std::floor(a / b); + return res; + } else { + return std::floor(a / b); + } + } +} DivOpFloor; +} // namespace op::div::cpu + +#endif // __DIV_CPU_H__ diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh new file mode 100644 index 000000000..35d7582e4 --- /dev/null +++ b/src/infiniop/ops/div/cuda/kernel.cuh @@ -0,0 +1,70 @@ +#ifndef __DIV_CUDA_H__ +#define __DIV_CUDA_H__ + +// #include "../../../devices/nvidia/nvidia_kernel_common.cuh" +namespace op::div::cuda { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + return __hdiv(a, b); + } else if constexpr (std::is_same_v) { + return __hdiv(a, b); + } else if constexpr (std::is_same_v) { + return fdividef(a, b); + } else if constexpr (std::is_same_v) { + return fdivide(a, b); + } else { + return a / b; + } + } +} DivOp; +typedef struct DivOpTrunc { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + return htrunc(__hdiv(a, b)); + } else if constexpr (std::is_same_v) { + return htrunc(__hdiv(a, b)); + } else if constexpr (std::is_same_v) { + return truncf(fdividef(a, b)); + } else if constexpr (std::is_same_v) { + return trunc(fdivide(a, b)); + } else { + return a / b; + } + } +} DivOpTrunc; +typedef struct DivOpFloor { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + float fa = __half2float(a); + float fb = __half2float(b); + float res = floorf(fdividef(fa, fb)); + return __float2half(res); + } else if constexpr (std::is_same_v) { + float fa = __bfloat162float(a); + float fb = __bfloat162float(b); + float res = floorf(fdividef(fa, fb)); + return __float2bfloat16(res); + + } else if constexpr (std::is_same_v) { + // return floorf(fdividef(a, b)); + 
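// note: in the default (non fast-math) build, fdividef(a, b) and plain + // a / b are the same IEEE float division, so floorf sees the same quotient; + // the intrinsic form above is kept only as a reference. +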
return floorf(a / b); + } else if constexpr (std::is_same_v) { + return floor(fdivide(a, b)); + } else { + return a / b; + } + } +} DivOpFloor; +} // namespace op::div::cuda + +#endif // __DIV_CUDA_H__ diff --git a/src/infiniop/ops/div/div_mode.h b/src/infiniop/ops/div/div_mode.h new file mode 100644 index 000000000..63a579d5b --- /dev/null +++ b/src/infiniop/ops/div/div_mode.h @@ -0,0 +1,5 @@ +enum ModeType { + MODE_DEFAULT, + MODE_TRUNC, + MODE_FLOOR, +}; diff --git a/src/infiniop/ops/div/metax/div_metax.h b/src/infiniop/ops/div/metax/div_metax.h new file mode 100644 index 000000000..9ce65655c --- /dev/null +++ b/src/infiniop/ops/div/metax/div_metax.h @@ -0,0 +1,45 @@ +#ifndef __DIV_METAX_API_H__ +#define __DIV_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +namespace op::div::metax { +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + int mode, + void *stream) const; +}; +} // namespace op::div::metax +#endif // __DIV_METAX_API_H__ diff --git a/src/infiniop/ops/div/metax/div_metax.maca b/src/infiniop/ops/div/metax/div_metax.maca new file mode 100644 index 000000000..9882f35a5 --- /dev/null +++ b/src/infiniop/ops/div/metax/div_metax.maca @@ -0,0 +1,72 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "../div_mode.h" +#include "div_metax.h" + +namespace op::div::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto &c_shape = output_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create metax elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + int mode, + void *stream) const { +#define CALCULATE(OP) \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::OP, half>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ + return _device_info->calculate<256, cuda::OP, cuda_bfloat16>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return 
_device_info->calculate<256, cuda::OP, float>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ + return _device_info->calculate<256, cuda::OP, double>(_info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + if (mode == MODE_DEFAULT) { + CALCULATE(DivOp) + } else if (mode == MODE_TRUNC) { + CALCULATE(DivOpTrunc) + } else if (mode == MODE_FLOOR) { + CALCULATE(DivOpFloor) + } + return INFINI_STATUS_SUCCESS; + + +} +} // namespace op::div::metax diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu new file mode 100644 index 000000000..0dd91a966 --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu @@ -0,0 +1,70 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "../div_mode.h" +#include "div_nvidia.cuh" + +namespace op::div::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + int mode, + void *stream) const { +#define CALCULATE(OP) \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::OP, half>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ + return _device_info->calculate<256, cuda::OP, cuda_bfloat16>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::OP, float>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ + return _device_info->calculate<256, cuda::OP, double>(_info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + if (mode == MODE_DEFAULT) { + CALCULATE(DivOp) + } else if (mode == MODE_TRUNC) { + CALCULATE(DivOpTrunc) + } else if (mode == MODE_FLOOR) { + CALCULATE(DivOpFloor) + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cuh b/src/infiniop/ops/div/nvidia/div_nvidia.cuh new file mode 100644 index 000000000..9e0f7c925 --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cuh @@ -0,0 +1,45 @@ +#ifndef __DIV_CUDA_API_H__ +#define __DIV_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +namespace op::div::nvidia { +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + + Descriptor( + infiniDtype_t dtype, + 
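// (the dtype captured here selects the kernel instantiation in calculate()) +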
op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + int mode, + void *stream) const; +}; +} // namespace op::div::nvidia +#endif // __DIV_CUDA_API_H__ diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc new file mode 100644 index 000000000..5dd8ff527 --- /dev/null +++ b/src/infiniop/ops/div/operator.cc @@ -0,0 +1,162 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "div_mode.h" +#include "infinicore.h" +#include "infiniop/ops/div.h" +#include + +#ifdef ENABLE_CPU_API +#include "cpu/div_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/div_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/div_metax.h" +#endif + +__C infiniStatus_t infiniopCreateDivDescriptor( + infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::div::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDiv( + infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *mode_, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, mode, stream) + + int mode_val = reinterpret_cast(mode_); + + int mode = MODE_DEFAULT; + if (mode_val == 0) { + mode = MODE_DEFAULT; + } else if (mode_val == 1) { + mode = MODE_TRUNC; + } else if (mode_val == 2) { + mode = MODE_FLOOR; + } else { + return INFINI_STATUS_BAD_PARAM; + } + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, 
cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/gelu/cpu/gelu_cpu.cc b/src/infiniop/ops/gelu/cpu/gelu_cpu.cc new file mode 100644 index 000000000..716cb4e3e --- /dev/null +++ b/src/infiniop/ops/gelu/cpu/gelu_cpu.cc @@ -0,0 +1,50 @@ +#include "gelu_cpu.h" +#include "infinicore.h" + +namespace op::gelu::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::gelu::cpu diff --git a/src/infiniop/ops/gelu/cpu/gelu_cpu.h b/src/infiniop/ops/gelu/cpu/gelu_cpu.h new file mode 100644 index 000000000..376634fb3 --- /dev/null +++ b/src/infiniop/ops/gelu/cpu/gelu_cpu.h @@ -0,0 +1,49 @@ +#ifndef __GELU_CPU_H__ +#define __GELU_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(gelu, cpu) + +namespace op::gelu::cpu { +typedef struct GeluOp { + float pi = 3.1415927f; + + float kappa = 0.044715; + float beta = sqrt(2 / pi); + +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &input) const { + // use Approximate formula Gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.0044715 * x **3))) + // \kappa = 0.044715, \beta = sqrt(2 / pi) + // inner() = \beta * (x + \kappa * x ** 3) + // SO: Gelu(x) = 0.5 * x * (1 + tanh(inner())) + // + if constexpr (std::is_same::value) { + float finput = _f16_to_f32(input); + float inner = beta * (finput + kappa * finput * finput * finput); + float res = 0.5 * finput * (1 + std::tanh(inner)); + + return 
_f32_to_f16(res); + } else if constexpr (std::is_same::value) { + float finput = _bf16_to_f32(input); + float inner = beta * (finput + kappa * finput * finput * finput); + float res = 0.5 * finput * (1 + std::tanh(inner)); + + return _f32_to_bf16(res); + } else if constexpr (std::is_same::value) { + float inner = beta * (input + kappa * input * input * input); + return 0.5 * input * (1 + std::tanh(inner)); + + } else { + float inner = beta * (input + kappa * input * input * input); + return 0.5 * input * (1 + std::tanh(inner)); + } + } +} GeluOp; +} // namespace op::gelu::cpu + +#endif diff --git a/src/infiniop/ops/gelu/cuda/kernel.cuh b/src/infiniop/ops/gelu/cuda/kernel.cuh new file mode 100644 index 000000000..f04ddfe55 --- /dev/null +++ b/src/infiniop/ops/gelu/cuda/kernel.cuh @@ -0,0 +1,67 @@ +#ifndef __GELU_CUDA_H__ +#define __GELU_CUDA_H__ + +namespace op::gelu::cuda { +typedef struct GeluOp { +private: + __device__ __forceinline__ half htanh_approx(const half x) const { + // Pade approximation: tanh(x) ≈ x * (27 + x^2) / (27 + 9*x^2) + // More accurate for small values + float fx = __half2float(x); + float tanh_val = tanhf(fx); + return __float2half(tanh_val); + } + + __device__ __forceinline__ cuda_bfloat16 htanh_approx(const cuda_bfloat16 x) const { + float fx = __bfloat162float(x); + float tanh_val = tanhf(fx); + return __float2bfloat16(tanh_val); + } + float pi = 3.1415927f; + float kappa = 0.044715f; + float beta = sqrtf(2 / pi); + + half h_kappa = __float2half(kappa); + cuda_bfloat16 b_kappa = __float2bfloat16(kappa); + + half h_beta = __float2half(beta); + cuda_bfloat16 b_beta = __float2bfloat16(beta); + + half h_point_fiv = __float2half(0.5f); + cuda_bfloat16 b_point_fiv = __float2bfloat16(0.5f); + + half h_one = __float2half(1.0f); + cuda_bfloat16 b_one = __float2bfloat16(1.0f); + +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &input) const { + // use Approximate formula Gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.0044715 * x **3))) + // \kappa = 0.044715, \beta = sqrt(2 / pi) + // inner() = \beta * (x + \kappa * x ** 3) + // Gelu(x) = 0.5 * x * (1 + tanh(inner())) + + if constexpr (std::is_same_v) { + half inner = __hmul(h_beta, __hadd(input, __hmul(h_kappa, __hmul(input, __hmul(input, input))))); + half res = __hmul(h_point_fiv, __hmul(input, __hadd(h_one, htanh_approx(inner)))); + + return res; + } else if constexpr (std::is_same_v) { + cuda_bfloat16 inner = __hmul(b_beta, __hadd(input, __hmul(b_kappa, __hmul(input, __hmul(input, input))))); + cuda_bfloat16 res = __hmul(b_point_fiv, __hmul(input, __hadd(b_one, htanh_approx(inner)))); + + return res; + } else if constexpr (std::is_same_v) { + float inner = __fmul_rn(beta, __fadd_rn(input, __fmul_rn(kappa, __fmul_rn(input, __fmul_rn(input, input))))); + float res = __fmul_rn(0.5f, __fmul_rn(input, __fadd_rn(1.0f, tanhf(inner)))); + + return res; + } else { + return 0.5 * input * (1 + beta * (input + kappa * powf(input, 3))); + } + }; +} GeluOp; +} // namespace op::gelu::cuda + +#endif // __silu_CUDA_H__ diff --git a/src/infiniop/ops/gelu/metax/gelu_metax.h b/src/infiniop/ops/gelu/metax/gelu_metax.h new file mode 100644 index 000000000..53ff79f10 --- /dev/null +++ b/src/infiniop/ops/gelu/metax/gelu_metax.h @@ -0,0 +1,8 @@ +#ifndef __GELU_METAX_API_H__ +#define __GELU_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(gelu, metax) + +#endif // __SWIGLU_METAX_API_H__ diff --git 
a/src/infiniop/ops/gelu/metax/gelu_metax.maca b/src/infiniop/ops/gelu/metax/gelu_metax.maca new file mode 100644 index 000000000..4e9b4de08 --- /dev/null +++ b/src/infiniop/ops/gelu/metax/gelu_metax.maca @@ -0,0 +1,55 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "gelu_metax.h" + +namespace op::gelu::metax { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector<infiniopTensorDescriptor_t> input_descs) { + auto handle = reinterpret_cast<device::metax::Handle *>(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape); + + // create MetaX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu::metax diff --git a/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cu b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cu new file mode 100644 index 000000000..dae788e45 --- /dev/null +++ b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cu @@ -0,0 +1,55 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "gelu_nvidia.cuh" +#include "infinicore.h" + +namespace op::gelu::nvidia { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector<infiniopTensorDescriptor_t> input_descs) { + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, output_desc, input_descs) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return 
_device_info->calculate<256, cuda::GeluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu::nvidia diff --git a/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cuh b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cuh new file mode 100644 index 000000000..40f75a967 --- /dev/null +++ b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __GELU_CUDA_API_H__ +#define __GELU_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(gelu, nvidia) + +#endif diff --git a/src/infiniop/ops/gelu/operator.cc b/src/infiniop/ops/gelu/operator.cc new file mode 100644 index 000000000..c9d64b422 --- /dev/null +++ b/src/infiniop/ops/gelu/operator.cc @@ -0,0 +1,132 @@ +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/gelu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gelu_cpu.h" +#endif +#ifdef ENABLE_ILUVATAR_API +#include "nvidia/gelu_nvidia.cuh" +#endif +#ifdef ENABLE_NVIDIA_API +#include "nvidia/gelu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/gelu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateGeluDescriptor( + infiniopHandle_t handle, + infiniopGeluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { +#define CTEATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gelu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CTEATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CTEATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CTEATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CTEATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CTEATE +} + +__C infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopGelu( + infiniopGeluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case 
CASE: \ + delete reinterpret_cast<op::gelu::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} diff --git a/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.cc b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.cc new file mode 100644 index 000000000..fc13bf7c6 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.cc @@ -0,0 +1,52 @@ +#include "gelu_backward_cpu.h" +#include "infinicore.h" + +namespace op::gelu_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<GeluBackWardOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<GeluBackWardOp, float>(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<GeluBackWardOp, bf16_t>(_info, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::gelu_backward::cpu diff --git a/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.h b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.h new file mode 100644 index 000000000..18a56cf86 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.h @@ -0,0 +1,52 @@ +#ifndef __GELU_BACKWARD_CPU_H__ +#define __GELU_BACKWARD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include <cmath> + +ELEMENTWISE_DESCRIPTOR(gelu_backward, cpu) + +namespace op::gelu_backward::cpu { +typedef struct GeluBackWardOp { + float pi = 3.1415927f; + float kappa = 0.044715f; + float Beta = sqrt(2 / pi); + +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &input, const T &grad_output) const { + // uses the tanh approximation Gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x ** 3))) + // \kappa = 0.044715, \beta = sqrt(2 / pi) + // inner() = \beta * (x + \kappa * x ** 3) + // so Gelu(x) = 0.5 * x * (1 + tanh(inner())) + // + // d(Gelu(x)) = 0.5 * ((1 + tanh(inner())) + x * (1 - tanh(inner())^2) * dinner()) +#define CREATE_D_DEUL(x) \ + float inner = Beta * (x + kappa * x * x * x); \ + float dinner = Beta * (1 + 3 * kappa * x * x); \ + float tanh_inner = std::tanh(inner); \ + float dGelu = 0.5 * ((1 + tanh_inner) + x * (1 - tanh_inner * tanh_inner) * dinner); + + if constexpr (std::is_same<T, fp16_t>::value) { + float finput = _f16_to_f32(input); + CREATE_D_DEUL(finput) + + return _f32_to_f16(_f16_to_f32(grad_output) * dGelu); + + } else if constexpr (std::is_same<T, bf16_t>::value) { + float finput = _bf16_to_f32(input); + CREATE_D_DEUL(finput) + + return _f32_to_bf16(_bf16_to_f32(grad_output) * dGelu); + + } else if constexpr (std::is_same<T, float>::value) { + CREATE_D_DEUL(input) + + return grad_output * dGelu; + } + } +#undef CREATE_D_DEUL +} GeluBackWardOp; +} // namespace op::gelu_backward::cpu + +#endif diff --git a/src/infiniop/ops/gelu_backward/cuda/kernel.cuh b/src/infiniop/ops/gelu_backward/cuda/kernel.cuh new file mode 100644 index 000000000..7b92a2d55 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/cuda/kernel.cuh @@ -0,0 +1,88 @@ +#ifndef __gelu_backward_CUDA_H__ +#define __gelu_backward_CUDA_H__ + +namespace op::gelu_backward::cuda { +typedef struct GeluBackWardOp { +private: + // device-side tanh helpers for half and bfloat16 types + __device__ __forceinline__ half htanh_approx(const half x) const { + // Pade approximation: tanh(x) ≈ x * (27 + x^2) / (27 + 9*x^2) + // More accurate for small values + half x2 = __hmul(x, x); + half numerator = __hmul(x, __hadd(__float2half(27.0f), x2)); + half denominator = __hadd(__float2half(27.0f), __hmul(__float2half(9.0f), x2)); + return __hdiv(numerator, denominator); + } + + __device__ __forceinline__ cuda_bfloat16 htanh(const cuda_bfloat16 x) const { + // For bfloat16, convert to float, compute tanh, and convert back + float xf = __bfloat162float(x); + float tanh_val = tanhf(xf); + return __float2bfloat16(tanh_val); + } + float pi = 3.1415927f; + float kappa = 0.044715f; + float beta = sqrtf(2 / pi); + + half h_kappa = __float2half(kappa); + cuda_bfloat16 b_kappa = __float2bfloat16(kappa); + + half h_beta = __float2half(beta); + cuda_bfloat16 b_beta = __float2bfloat16(beta); + + half h_point_fiv = __float2half(0.5f); + cuda_bfloat16 b_point_fiv = __float2bfloat16(0.5f); + + half h_one = __float2half(1.0f); + cuda_bfloat16 b_one = __float2bfloat16(1.0f); + + half h_three = __float2half(3.0f); + cuda_bfloat16 b_three = __float2bfloat16(3.0f); + +public: + static constexpr size_t num_inputs = 2; + template <typename T> + __device__ __forceinline__ T operator()(const T &input, const T &grad_output) const { + // uses the tanh approximation Gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x ** 3))) + // \kappa = 0.044715, \beta = sqrt(2 / pi) + // inner() = \beta * (x + \kappa * x ** 3) + // dinner() = \beta * (1 + 3 * \kappa * x ** 2) + // d(Gelu(x)) = 0.5 * ((1 + tanh(inner())) + x * (1 - tanh(inner())^2) * dinner()) + // + // GeluBackWard(x) = grad_output * d(Gelu(x)) + + if constexpr (std::is_same_v<T, half>) { + half inner = __hmul(h_beta, __hadd(input, __hmul(h_kappa, __hmul(input, __hmul(input, input))))); + half dinner = __hmul(h_beta, __hadd(h_one, __hmul(h_three, __hmul(h_kappa, __hmul(input, input))))); + half tanh = htanh_approx(inner); + half dGelu = __hmul(h_point_fiv, __hadd(h_one, __hadd(tanh, __hmul(input, __hmul(__hadd(h_one, __hneg(__hmul(tanh, tanh))), dinner))))); + + return __hmul(grad_output, dGelu); + + } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { + cuda_bfloat16 inner = __hmul(b_beta, __hadd(input, __hmul(b_kappa, __hmul(input, __hmul(input, input))))); + cuda_bfloat16 dinner = __hmul(b_beta, __hadd(b_one, __hmul(b_three, __hmul(b_kappa, __hmul(input, input))))); + cuda_bfloat16 tanh = 
htanh(inner); + cuda_bfloat16 dGelu = __hmul(b_point_fiv, __hadd(b_one, __hadd(tanh, __hmul(input, __hmul(__hadd(b_one, __hneg(__hmul(tanh, tanh))), dinner))))); + + return __hmul(grad_output, dGelu); + + } else if constexpr (std::is_same_v) { + + float inner = __fmul_rn(beta, __fadd_rn(input, __fmul_rn(kappa, __fmul_rn(input, __fmul_rn(input, input))))); + float dinner = __fmul_rn(beta, __fadd_rn(1.0f, __fmul_rn(3.0f, __fmul_rn(kappa, __fmul_rn(input, input))))); + float tanh = tanhf(inner); + float dGelu = __fmul_rn(0.5f, __fadd_rn(1.0f, __fadd_rn(tanh, __fmul_rn(input, __fmul_rn(__fsub_rn(1.0f, __fmul_rn(tanh, tanh)), dinner))))); + + return __fmul_rn(grad_output, dGelu); + + } else { + return T(0.0); + } + }; +} GeluBackWardOp; +} // namespace op::gelu_backward::cuda + +#endif // __silu_CUDA_H__ diff --git a/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.h b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.h new file mode 100644 index 000000000..201ab3d55 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __GELU_BACKWARD_METAX_API_H__ +#define __GELU_BACKWARD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(gelu_backward, metax) + +#endif // __SWIGLU_METAX_API_H__ diff --git a/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.maca b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.maca new file mode 100644 index 000000000..34dc173e3 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.maca @@ -0,0 +1,55 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "gelu_backward_metax.h" + +namespace op::gelu_backward::metax{ +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create metax elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluBackWardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluBackWardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluBackWardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu_backward::metax diff --git a/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cu b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cu new file mode 100644 index 
000000000..570dcfcd7 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cu @@ -0,0 +1,55 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "gelu_backward_nvidia.cuh" +#include "infinicore.h" + +namespace op::gelu_backward::nvidia { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = output_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = output_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, output_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluBackWardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluBackWardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluBackWardOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu_backward::nvidia diff --git a/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cuh b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cuh new file mode 100644 index 000000000..7b6cd9363 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __GELU_BACKWARD_CUDA_API_H__ +#define __GELU_BACKWARD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(gelu_backward, nvidia) + +#endif diff --git a/src/infiniop/ops/gelu_backward/operator.cc b/src/infiniop/ops/gelu_backward/operator.cc new file mode 100644 index 000000000..97da6465f --- /dev/null +++ b/src/infiniop/ops/gelu_backward/operator.cc @@ -0,0 +1,131 @@ +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/gelu_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gelu_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/gelu_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/gelu_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateGeluBackWardDescriptor( + infiniopHandle_t handle, + infiniopGeluBackWardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gelu_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::gelu_backward::NAMESPACE::Descriptor **>(desc_ptr), \ + grad_input_desc, \ + {input_desc, grad_output_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetGeluBackWardWorkspaceSize(infiniopGeluBackWardDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::gelu_backward::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopGeluBackWard( + infiniopGeluBackWardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::gelu_backward::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyGeluBackWardDescriptor(infiniopGeluBackWardDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::gelu_backward::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE
} diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc new file mode 100644 index 000000000..70f1afe7d --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc @@ -0,0 +1,51 @@ +#include "logical_and_cpu.h" +#include "infinicore.h" + +namespace op::logical_and::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return 
INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::logical_and::cpu diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h new file mode 100644 index 000000000..52dc0477e --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_AND_CPU_H__ +#define __LOGICAL_AND_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_and, cpu) + +namespace op::logical_and::cpu { +typedef struct LogicalAndOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return a && b; + } +} LogicalAndOp; +} // namespace op::logical_and::cpu + +#endif diff --git a/src/infiniop/ops/logical_and/cuda/kernel.cuh b/src/infiniop/ops/logical_and/cuda/kernel.cuh new file mode 100644 index 000000000..1e280165b --- /dev/null +++ b/src/infiniop/ops/logical_and/cuda/kernel.cuh @@ -0,0 +1,15 @@ +#ifndef __LOGICAL_AND_CUDA_H__ +#define __LOGICAL_AND_CUDA_H__ + +namespace op::logical_and::cuda { +typedef struct LogicalAndOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return a && b; + } +} LogicalAndOp; +} // namespace op::logical_and::cuda + +#endif // __logical_and_CUDA_H__ diff --git a/src/infiniop/ops/logical_and/metax/logical_and_metax.h b/src/infiniop/ops/logical_and/metax/logical_and_metax.h new file mode 100644 index 000000000..e94c5fd15 --- /dev/null +++ b/src/infiniop/ops/logical_and/metax/logical_and_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_METAX_API_H__ +#define __LOGICAL_AND_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_and, metax) + +#endif diff --git a/src/infiniop/ops/logical_and/metax/logical_and_metax.maca b/src/infiniop/ops/logical_and/metax/logical_and_metax.maca new file mode 100644 index 000000000..8be401a2e --- /dev/null +++ b/src/infiniop/ops/logical_and/metax/logical_and_metax.maca @@ -0,0 +1,53 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_and_metax.h" + +namespace op::logical_and::metax { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t 
workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalAndOp, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalAndOp, int8_t>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::metax diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu new file mode 100644 index 000000000..a4567a844 --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_and_nvidia.cuh" +#include + +namespace op::logical_and::nvidia { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, output_desc, input_descs); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalAndOp, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalAndOp, int8_t>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::nvidia diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh new file mode 100644 index 000000000..8884f9d0b --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_CUDA_API_H__ +#define __LOGICAL_AND_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_and, nvidia) + +#endif diff --git a/src/infiniop/ops/logical_and/operator.cc b/src/infiniop/ops/logical_and/operator.cc new file mode 100644 index 000000000..e72798bee --- /dev/null +++ b/src/infiniop/ops/logical_and/operator.cc @@ -0,0 +1,134 @@ +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/logical_and.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_and_cpu.h" +#endif +#ifdef ENABLE_NVIDIA_API +#include "nvidia/logical_and_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_and_metax.h" +#endif +#ifdef ENABLE_ILUVATAR_API +#include "nvidia/logical_and_nvidia.cuh" 
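+// illustrative call sequence for the C API below (handle, descriptors, buffers, +// and stream are assumed to exist already; a sketch, not part of this patch): +//   infiniopLogicalAndDescriptor_t desc; +//   infiniopCreateLogicalAndDescriptor(handle, &desc, c_desc, a_desc, b_desc); +//   size_t ws_size = 0; +//   infiniopGetLogicalAndWorkspaceSize(desc, &ws_size); +//   infiniopLogicalAnd(desc, workspace, ws_size, c, a, b, stream); +//   infiniopDestroyLogicalAndDescriptor(desc);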
+#endif + +__C infiniStatus_t infiniopCreateLogicalAndDescriptor( + infiniopHandle_t handle, + infiniopLogicalAndDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_and::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalAndWorkspaceSize(infiniopLogicalAndDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopLogicalAnd( + infiniopLogicalAndDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyLogicalAndDescriptor(infiniopLogicalAndDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} diff --git a/src/infiniop/ops/logical_equal/cpu/logical_equal_cpu.cc b/src/infiniop/ops/logical_equal/cpu/logical_equal_cpu.cc new file mode 100644 index 000000000..cda411728 --- /dev/null +++ b/src/infiniop/ops/logical_equal/cpu/logical_equal_cpu.cc @@ -0,0 +1,64 @@ +#include "logical_equal_cpu.h" +#include "infinicore.h" +#include +#include + +namespace op::logical_equal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = 
input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { +#define CASE(CASE, TYPE) \ + case CASE: \ + return _device_info->calculate(_info, output, inputs, stream); + + switch (_dtype) { + CASE(INFINI_DTYPE_BOOL, bool) + CASE(INFINI_DTYPE_I8, int8_t) + CASE(INFINI_DTYPE_I16, int16_t) + CASE(INFINI_DTYPE_I32, int32_t) + CASE(INFINI_DTYPE_I64, int64_t) + CASE(INFINI_DTYPE_F16, fp16_t) + CASE(INFINI_DTYPE_BF16, bf16_t) + CASE(INFINI_DTYPE_F32, float) + CASE(INFINI_DTYPE_F64, double_t) + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::logical_equal::cpu diff --git a/src/infiniop/ops/logical_equal/cpu/logical_equal_cpu.h b/src/infiniop/ops/logical_equal/cpu/logical_equal_cpu.h new file mode 100644 index 000000000..a8c5088a0 --- /dev/null +++ b/src/infiniop/ops/logical_equal/cpu/logical_equal_cpu.h @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_EQUAL_CPU_H__ +#define __LOGICAL_EQUAL_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_equal, cpu) + +namespace op::logical_equal::cpu { +typedef struct LogicalEqualOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return a == b; + } +} LogicalEqualOp; +} // namespace op::logical_equal::cpu + +#endif diff --git a/src/infiniop/ops/logical_equal/cuda/kernel.cuh b/src/infiniop/ops/logical_equal/cuda/kernel.cuh new file mode 100644 index 000000000..ee92a46bd --- /dev/null +++ b/src/infiniop/ops/logical_equal/cuda/kernel.cuh @@ -0,0 +1,15 @@ +#ifndef __LOGICAL_EQUAL_CUDA_H__ +#define __LOGICAL_EQUAL_CUDA_H__ + +namespace op::logical_equal::cuda { +typedef struct LogicalEqualOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return a == b; + } +} LogicalEqualOp; +} // namespace op::logical_equal::cuda + +#endif // __logical_equal_CUDA_H__ diff --git a/src/infiniop/ops/logical_equal/metax/logical_equal_metax.h b/src/infiniop/ops/logical_equal/metax/logical_equal_metax.h new file mode 100644 index 000000000..8db9a7083 --- /dev/null +++ b/src/infiniop/ops/logical_equal/metax/logical_equal_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_EQUAL_METAX_API_H__ +#define __LOGICAL_EQUAL_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_equal, metax) + +#endif diff --git a/src/infiniop/ops/logical_equal/metax/logical_equal_metax.maca b/src/infiniop/ops/logical_equal/metax/logical_equal_metax.maca new file mode 100644 index 000000000..b5763b004 --- /dev/null +++ b/src/infiniop/ops/logical_equal/metax/logical_equal_metax.maca @@ -0,0 +1,65 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_equal_metax.h" 
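+// note: this .maca unit compiles the shared cuda::LogicalEqualOp functor from +// ../cuda/kernel.cuh, as the other MetaX translation units in this patch do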
+#include + +namespace op::logical_equal::metax { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } +#define CASE(CASE, TYPE) \ + case CASE: \ + return _device_info->calculate<256, cuda::LogicalEqualOp, TYPE>(_info, workspace, output, inputs, stream); + + switch (_dtype) { + CASE(INFINI_DTYPE_BOOL, bool) + CASE(INFINI_DTYPE_I8, int8_t) + CASE(INFINI_DTYPE_I16, int16_t) + CASE(INFINI_DTYPE_I32, int32_t) + CASE(INFINI_DTYPE_I64, int64_t) + CASE(INFINI_DTYPE_F16, half) + CASE(INFINI_DTYPE_BF16, cuda_bfloat16) + CASE(INFINI_DTYPE_F32, float) + CASE(INFINI_DTYPE_F64, double_t) + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_equal::metax diff --git a/src/infiniop/ops/logical_equal/nvidia/logical_equal_nvidia.cu b/src/infiniop/ops/logical_equal/nvidia/logical_equal_nvidia.cu new file mode 100644 index 000000000..27076902e --- /dev/null +++ b/src/infiniop/ops/logical_equal/nvidia/logical_equal_nvidia.cu @@ -0,0 +1,65 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_equal_nvidia.cuh" +#include + +namespace op::logical_equal::nvidia { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, output_desc, input_descs); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } +#define CASE(CASE, TYPE) \ + case CASE: \ + return _device_info->calculate<256, cuda::LogicalEqualOp, TYPE>(_info, workspace, output, inputs, 
stream); + + switch (_dtype) { + CASE(INFINI_DTYPE_BOOL, bool) + CASE(INFINI_DTYPE_I8, int8_t) + CASE(INFINI_DTYPE_I16, int16_t) + CASE(INFINI_DTYPE_I32, int32_t) + CASE(INFINI_DTYPE_I64, int64_t) + CASE(INFINI_DTYPE_F16, half) + CASE(INFINI_DTYPE_BF16, cuda_bfloat16) + CASE(INFINI_DTYPE_F32, float) + CASE(INFINI_DTYPE_F64, double_t) + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_equal::nvidia diff --git a/src/infiniop/ops/logical_equal/nvidia/logical_equal_nvidia.cuh b/src/infiniop/ops/logical_equal/nvidia/logical_equal_nvidia.cuh new file mode 100644 index 000000000..350c37445 --- /dev/null +++ b/src/infiniop/ops/logical_equal/nvidia/logical_equal_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_EQUAL_CUDA_API_H__ +#define __LOGICAL_EQUAL_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_equal, nvidia) + +#endif diff --git a/src/infiniop/ops/logical_equal/operator.cc b/src/infiniop/ops/logical_equal/operator.cc new file mode 100644 index 000000000..6f39a9829 --- /dev/null +++ b/src/infiniop/ops/logical_equal/operator.cc @@ -0,0 +1,134 @@ +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/logical_equal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_equal_cpu.h" +#endif +#ifdef ENABLE_NVIDIA_API +#include "nvidia/logical_equal_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_equal_metax.h" +#endif +#ifdef ENABLE_ILUVATAR_API +#include "nvidia/logical_equal_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateLogicalEqualDescriptor( + infiniopHandle_t handle, + infiniopLogicalEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_equal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalEqualWorkspaceSize(infiniopLogicalEqualDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopLogicalEqual( + infiniopLogicalEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + 
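+        // Each CALCULATE expansion below casts the opaque descriptor back to
+        // the backend-specific Descriptor and forwards to its calculate();
+        // the Iluvatar build reuses the nvidia implementation.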
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyLogicalEqualDescriptor(infiniopLogicalEqualDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc new file mode 100644 index 000000000..4c92f5efe --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc @@ -0,0 +1,51 @@ +#include "logical_or_cpu.h" +#include "infinicore.h" + +namespace op::logical_or::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::logical_or::cpu diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h new file mode 100644 index 000000000..0329a0190 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_OR_CPU_H__ +#define __LOGICAL_OR_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_or, cpu) + +namespace op::logical_or::cpu { +typedef struct LogicalOrOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return a || b; + } +} LogicalOrOp; +} // namespace op::logical_or::cpu + +#endif diff --git a/src/infiniop/ops/logical_or/cuda/kernel.cuh b/src/infiniop/ops/logical_or/cuda/kernel.cuh new file mode 100644 index 000000000..8b2deeb1c --- /dev/null +++ b/src/infiniop/ops/logical_or/cuda/kernel.cuh @@ -0,0 +1,15 @@ +#ifndef __LOGICAL_OR_CUDA_H__ +#define __LOGICAL_OR_CUDA_H__ + +namespace op::logical_or::cuda { +typedef struct LogicalOrOp { +public: + static constexpr size_t 
num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return a || b; + } +} LogicalOrOp; +} // namespace op::logical_or::cuda + +#endif // __LOGICAL_OR_CUDA_H__ diff --git a/src/infiniop/ops/logical_or/metax/logical_or_metax.h b/src/infiniop/ops/logical_or/metax/logical_or_metax.h new file mode 100644 index 000000000..ec7d9c73b --- /dev/null +++ b/src/infiniop/ops/logical_or/metax/logical_or_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_METAX_API_H__ +#define __LOGICAL_OR_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_or, metax) + +#endif diff --git a/src/infiniop/ops/logical_or/metax/logical_or_metax.maca b/src/infiniop/ops/logical_or/metax/logical_or_metax.maca new file mode 100644 index 000000000..eda4b077b --- /dev/null +++ b/src/infiniop/ops/logical_or/metax/logical_or_metax.maca @@ -0,0 +1,53 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_or_metax.h" + +namespace op::logical_or::metax { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalOrOp, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalOrOp, int8_t>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::metax diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu new file mode 100644 index 000000000..1725fbca0 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_or_nvidia.cuh" +#include + +namespace op::logical_or::nvidia { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8); + + 
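+    // Unlike logical_equal, which accepts every numeric dtype, logical_or is
+    // restricted to BOOL and I8 carriers; anything else fails at create time.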
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, output_desc, input_descs) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalOrOp, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalOrOp, int8_t>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::nvidia diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh new file mode 100644 index 000000000..07731049c --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_CUDA_API_H__ +#define __LOGICAL_OR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_or, nvidia) + +#endif diff --git a/src/infiniop/ops/logical_or/operator.cc b/src/infiniop/ops/logical_or/operator.cc new file mode 100644 index 000000000..5a6ecd35d --- /dev/null +++ b/src/infiniop/ops/logical_or/operator.cc @@ -0,0 +1,134 @@ +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/logical_or.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_or_cpu.h" +#endif +#ifdef ENABLE_NVIDIA_API +#include "nvidia/logical_or_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_or_metax.h" +#endif +#ifdef ENABLE_ILUVATAR_API +#include "nvidia/logical_or_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateLogicalOrDescriptor( + infiniopHandle_t handle, + infiniopLogicalOrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_or::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopLogicalOr( + infiniopLogicalOrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void 
*stream) {
+#define CALCULATE(CASE, NAMESPACE)                                            \
+    case CASE:                                                                \
+        return reinterpret_cast<const op::logical_or::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, c, {a, b}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CALCULATE
+}
+
+__C infiniStatus_t infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc) {
+#define DELETE(CASE, NAMESPACE)                                                \
+    case CASE:                                                                 \
+        delete reinterpret_cast<op::logical_or::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS;
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef DELETE
+}
diff --git a/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.cc b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.cc
new file mode 100644
index 000000000..88d998f34
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.cc
@@ -0,0 +1,54 @@
+#include "relu_backward_cpu.h"
+
+namespace op::relu_backward::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &grad_output_desc = input_desc_vec.at(1);
+    const auto &grad_input_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+    const auto &grad_output_shape = grad_output_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<ReluOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<ReluOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<ReluOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<ReluOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::relu_backward::cpu
diff --git a/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.h b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.h
new file mode 100644
index 000000000..ab70b3eed
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.h
@@ -0,0 +1,21 @@
+#ifndef __RELU_BACKWARD_CPU_H__
+#define __RELU_BACKWARD_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(relu_backward, cpu)
+
+namespace op::relu_backward::cpu {
+typedef struct ReluOp {
+public:
+    static constexpr size_t num_inputs = 2;
+
+    template <typename T>
+    T operator()(const T &input, const T &grad_output) const {
+        auto dy = input > T(0) ? T(1) : T(0);
+        return grad_output * dy;
+    }
+} ReluOp;
+} // namespace op::relu_backward::cpu
+
+#endif // __RELU_BACKWARD_CPU_H__
diff --git a/src/infiniop/ops/relu_backward/cuda/kernel.cuh b/src/infiniop/ops/relu_backward/cuda/kernel.cuh
new file mode 100644
index 000000000..30aae9ef8
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/cuda/kernel.cuh
@@ -0,0 +1,24 @@
+#ifndef __RELU_BACKWARD_CUDA_H__
+#define __RELU_BACKWARD_CUDA_H__
+
+namespace op::relu_backward::cuda {
+typedef struct ReluBackWardOp {
+    static constexpr size_t num_inputs = 2;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input, const T &grad_output) const {
+        auto dy = input > T(0) ? T(1) : T(0);
+        if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, cuda_bfloat162>) {
+            return __hmul2(grad_output, dy);
+        } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+            return __hmul(grad_output, dy);
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __fmul_rn(grad_output, dy);
+        } else {
+            return grad_output * dy;
+        }
+    }
+} ReluBackWardOp;
+
+} // namespace op::relu_backward::cuda
+
+#endif // __RELU_BACKWARD_CUDA_H__
diff --git a/src/infiniop/ops/relu_backward/metax/relu_backward_metax.h b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.h
new file mode 100644
index 000000000..13c41b83a
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.h
@@ -0,0 +1,8 @@
+#ifndef __RELU_BACKWARD_METAX_API_H__
+#define __RELU_BACKWARD_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(relu_backward, metax)
+
+#endif // __RELU_BACKWARD_METAX_API_H__
diff --git a/src/infiniop/ops/relu_backward/metax/relu_backward_metax.maca b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.maca
new file mode 100644
index 000000000..79e38743e
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.maca
@@ -0,0 +1,60 @@
+#include "../../../elementwise/metax/elementwise_metax.h"
+#include "../cuda/kernel.cuh"
+#include "relu_backward_metax.h"
+
+namespace op::relu_backward::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &grad_output_desc = input_desc_vec.at(1);
+    const auto &grad_input_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+    const auto &grad_output_shape = grad_output_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape);
+
+    // create METAX elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, double>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::relu_backward::metax
diff --git a/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cu b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cu
new file mode 100644
index 000000000..9d67cb1e5
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cu
@@ -0,0 +1,56 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "relu_backward_nvidia.cuh"
+
+namespace op::relu_backward::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &grad_output_desc = input_desc_vec.at(1);
+    const auto &grad_input_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+    const auto &grad_output_shape = grad_output_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, double>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::relu_backward::nvidia
diff --git a/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cuh b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cuh
new file mode 100644
index 000000000..b29db1540
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __RELU_BACKWARD_NVIDIA_API_H__
+#define __RELU_BACKWARD_NVIDIA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(relu_backward, nvidia)
+
+#endif // __RELU_BACKWARD_NVIDIA_API_H__
diff --git a/src/infiniop/ops/relu_backward/operator.cc b/src/infiniop/ops/relu_backward/operator.cc
new file mode 100644
index 000000000..3b1c92730
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/operator.cc
@@ -0,0 +1,144 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/relu_backward.h"
+
+#ifdef ENABLE_CPU_API
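All four backend listings above implement the same rule: ReLU backward passes the upstream gradient through wherever the forward input was positive and zeroes it elsewhere, i.e. `grad_input = input > 0 ? grad_output : 0`. A contiguous-float reference, useful for sanity-checking the kernels (the strided and broadcast handling lives in the elementwise framework, so this sketch deliberately ignores it):

```cpp
#include <cstddef>

// Reference ReLU backward over contiguous buffers:
// grad_input[i] = grad_output[i] if input[i] > 0, else 0.
void relu_backward_ref(const float *input, const float *grad_output,
                       float *grad_input, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        grad_input[i] = input[i] > 0.0f ? grad_output[i] : 0.0f;
    }
}
```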
+#include "cpu/relu_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/relu_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/relu_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateReluBackWardDescriptor( + infiniopHandle_t handle, + infiniopReluBackWardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::relu_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetReluBackWardWorkspaceSize(infiniopReluBackWardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReluBackWard( + infiniopReluBackWardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyReluBackWardDescriptor(infiniopReluBackWardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/silu/cpu/silu_cpu.cc b/src/infiniop/ops/silu/cpu/silu_cpu.cc new file mode 100644 index 000000000..c005d7a80 --- /dev/null +++ b/src/infiniop/ops/silu/cpu/silu_cpu.cc @@ -0,0 +1,50 @@ +#include "silu_cpu.h" +#include "infinicore.h" + +namespace op::silu::cpu { + 
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_, Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<SiluOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<SiluOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<SiluOp, bf16_t>(_info, output, inputs, stream);
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::silu::cpu
diff --git a/src/infiniop/ops/silu/cpu/silu_cpu.h b/src/infiniop/ops/silu/cpu/silu_cpu.h
new file mode 100644
index 000000000..8ca15afee
--- /dev/null
+++ b/src/infiniop/ops/silu/cpu/silu_cpu.h
@@ -0,0 +1,25 @@
+#ifndef __SILU_CPU_H__
+#define __SILU_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(silu, cpu)
+
+namespace op::silu::cpu {
+typedef struct SiluOp {
+private:
+    template <typename T>
+    T sigmoid(const T &x) const {
+        return T(1) / (T(1) + std::exp(-x));
+    }
+
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    T operator()(const T &x) const {
+        return x * sigmoid(x);
+    }
+} SiluOp;
+} // namespace op::silu::cpu
+
+#endif
diff --git a/src/infiniop/ops/silu/cuda/kernel.cuh b/src/infiniop/ops/silu/cuda/kernel.cuh
new file mode 100644
index 000000000..a239619b8
--- /dev/null
+++ b/src/infiniop/ops/silu/cuda/kernel.cuh
@@ -0,0 +1,65 @@
+#ifndef __SILU_CUDA_H__
+#define __SILU_CUDA_H__
+
+namespace op::silu::cuda {
+typedef struct SiluOp {
+private:
+    template <typename T>
+    __device__ __forceinline__ T sigmoid(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            float sig0 = __frcp_rn(__fadd_rn(1.0f, __expf(-x0)));
+            float sig1 = __frcp_rn(__fadd_rn(1.0f, __expf(-x1)));
+            return __floats2bfloat162_rn(sig0, sig1);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float xf = __bfloat162float(x);
+            return __float2bfloat16_rn(__frcp_rn(__fadd_rn(1.0f, __expf(-xf))));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __frcp_rn(__fadd_rn(1, __expf(-x)));
+        } else {
+            return 1 / (1 + std::exp(-x));
+        }
+    }
+
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return __hmul2(x, sigmoid(x));
+        } else if constexpr (std::is_same_v<T, half>) {
+            return __hmul(x, sigmoid(x));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            cuda_bfloat162 sig = sigmoid(x);
+
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 =
__bfloat162float(__high2bfloat16(x)); + float sig0 = __bfloat162float(__low2bfloat16(sig)); + float sig1 = __bfloat162float(__high2bfloat16(sig)); + + float res0 = __fmul_rn(x0, sig0); + float res1 = __fmul_rn(x1, sig1); + return __floats2bfloat162_rn(res0, res1); + } else if constexpr (std::is_same_v) { + cuda_bfloat16 sig = sigmoid(x); + + float xf = __bfloat162float(x); + float sigf = __bfloat162float(sig); + + float res = __fmul_rn(xf, sigf); + return __float2bfloat16_rn(res); + } else if constexpr (std::is_same_v) { + return __fmul_rn(x, sigmoid(x)); + } else { + return x * sigmoid(x); + } + }; +} SiluOp; +} // namespace op::silu::cuda + +#endif // __silu_CUDA_H__ diff --git a/src/infiniop/ops/silu/metax/silu_metax.h b/src/infiniop/ops/silu/metax/silu_metax.h new file mode 100644 index 000000000..4d66b2ead --- /dev/null +++ b/src/infiniop/ops/silu/metax/silu_metax.h @@ -0,0 +1,8 @@ +#ifndef __SILU_METAX_API_H__ +#define __SILU_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(silu, metax) + +#endif // __SWIGLU_METAX_API_H__ diff --git a/src/infiniop/ops/silu/metax/silu_metax.maca b/src/infiniop/ops/silu/metax/silu_metax.maca new file mode 100644 index 000000000..6f93d67c7 --- /dev/null +++ b/src/infiniop/ops/silu/metax/silu_metax.maca @@ -0,0 +1,53 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "silu_metax.h" + +namespace op::silu::metax{ +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape); + + // create metax elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SiluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SiluOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SiluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::silu::metax diff --git a/src/infiniop/ops/silu/nvidia/silu_nvidia.cu b/src/infiniop/ops/silu/nvidia/silu_nvidia.cu new file mode 100644 index 000000000..a3751a58c --- /dev/null +++ b/src/infiniop/ops/silu/nvidia/silu_nvidia.cu @@ -0,0 +1,53 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "silu_nvidia.cuh" + +namespace op::silu::nvidia { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + 
std::vector<infiniopTensorDescriptor_t> input_descs) {
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = output_desc->dtype();
+
+    const auto &a_desc = input_descs.at(0);
+    const auto c_shape = output_desc->shape();
+    const auto a_shape = a_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, output_desc, input_descs)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::SiluOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::SiluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::SiluOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::silu::nvidia
diff --git a/src/infiniop/ops/silu/nvidia/silu_nvidia.cuh b/src/infiniop/ops/silu/nvidia/silu_nvidia.cuh
new file mode 100644
index 000000000..ba01648ae
--- /dev/null
+++ b/src/infiniop/ops/silu/nvidia/silu_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __SILU_CUDA_API_H__
+#define __SILU_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(silu, nvidia)
+
+#endif
diff --git a/src/infiniop/ops/silu/operator.cc b/src/infiniop/ops/silu/operator.cc
new file mode 100644
index 000000000..2ce72c8ab
--- /dev/null
+++ b/src/infiniop/ops/silu/operator.cc
@@ -0,0 +1,132 @@
+#include "../../handle.h"
+#include "infinicore.h"
+#include "infiniop/ops/silu.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/silu_cpu.h"
+#endif
+#ifdef ENABLE_NVIDIA_API
+#include "nvidia/silu_nvidia.cuh"
+#endif
+#ifdef ENABLE_ILUVATAR_API
+#include "nvidia/silu_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/silu_metax.h"
+#endif
+
+__C infiniStatus_t infiniopCreateSiluDescriptor(
+    infiniopHandle_t handle,
+    infiniopSiluDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+#define CREATE(CASE, NAMESPACE)                                                \
+    case CASE:                                                                 \
+        return op::silu::NAMESPACE::Descriptor::create(                        \
+            handle,                                                            \
+            reinterpret_cast<op::silu::NAMESPACE::Descriptor **>(desc_ptr),    \
+            y_desc,                                                            \
+            {x_desc})
+
+    switch (handle->device) {
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetSiluWorkspaceSize(infiniopSiluDescriptor_t desc, size_t *size) {
+#define GET(CASE, NAMESPACE)                                                   \
+    case CASE:                                                                 \
+        *size = reinterpret_cast<op::silu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS;
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+
+
default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopSilu( + infiniopSiluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroySiluDescriptor(infiniopSiluDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} diff --git a/test/infiniop/crossentropyloss_backward.py b/test/infiniop/crossentropyloss_backward.py new file mode 100644 index 000000000..755d19a9c --- /dev/null +++ b/test/infiniop/crossentropyloss_backward.py @@ -0,0 +1,191 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto +import math + +import torch +from torch.testing import assert_close +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. 
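+    # The reference gradient checked by this file follows from softmax plus
+    # cross-entropy with mean reduction: for L = -(1/N) * sum_i log p_i[t_i],
+    # the gradient w.r.t. the logits is dL/dz = (p - y) / N, where p are the
+    # softmax probabilities, y the one-hot targets, and N the number of rows
+    # (numel / last dim). See crossentropyloss_backward() below.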
+ # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + INPLACE_X2 = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, + Inplace.INPLACE_X2, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +# _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16] +# _TENSOR_DTYPES = [InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def crossentropyloss_backward(probs, target): + shape = probs.shape + N = shape.numel() / shape[-1] + return (probs - target) / N + + +def test( + handle, + device, + shape, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor( + shape, + None, + dtype, + device, + ) + grad_output = TestTensor( + shape, + None, + dtype, + device, + ) + + if inplace == Inplace.INPLACE_X: + y = input + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing crossentropyloss_backward on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = crossentropyloss_backward(input.torch_tensor(), grad_output.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCrossEntropyLossBackWardDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + input.descriptor, + grad_output.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [y, input, grad_output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCrossEntropyLossBackWardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_crossentropyloss_backward(): + LIBINFINIOP.infiniopCrossEntropyLossBackWard( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + input.data(), + grad_output.data(), + None, + ) + + lib_crossentropyloss_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + # print(ans) + # print(y.actual_tensor()) + assert_close(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: crossentropyloss_backward(input.torch_tensor(), grad_output.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_crossentropyloss_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error( + LIBINFINIOP.infiniopDestroyCrossEntropyLossBackWardDescriptor(descriptor) + ) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = 
args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/div.py b/test/infiniop/div.py new file mode 100644 index 000000000..2c3e17c30 --- /dev/null +++ b/test/infiniop/div.py @@ -0,0 +1,394 @@ +import torch +from torch.testing import assert_close +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + InfiniDivMode, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + (mode,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE + # for mode in InfiniDivMode + for mode in InfiniDivMode +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def div(c, a, b, mode): + torch_mode = [None, "trunc", "floor"] + + torch.div(a, b, rounding_mode=torch_mode[mode.value], out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + mode=None, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Div on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}, Mode: {mode}" + ) + + div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor(), mode) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateDivDescriptor( + handle, + 
ctypes.byref(descriptor),
+            c.descriptor,
+            a.descriptor,
+            b.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [a, b, c]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetDivWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, c.device)
+
+    def lib_div():
+        check_error(
+            LIBINFINIOP.infiniopDiv(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                c.data(),
+                a.data(),
+                b.data(),
+                mode,
+                None,
+            )
+        )
+
+    lib_div()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if (dtype == InfiniDtype.F32) and (mode == InfiniDivMode.FLOOR):
+        atol = 1
+
+    if DEBUG:
+        debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True)
+
+    assert_close(
+        c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True
+    )
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor(), mode), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_div(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(LIBINFINIOP.infiniopDestroyDivDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/gelu.py b/test/infiniop/gelu.py
new file mode 100644
index 000000000..9c6b36fee
--- /dev/null
+++ b/test/infiniop/gelu.py
@@ -0,0 +1,161 @@
+import ctypes
+from ctypes import c_uint64
+from enum import Enum, auto
+
+import torch
+from libinfiniop import (
+    LIBINFINIOP,
+    InfiniDeviceNames,
+    InfiniDtype,
+    InfiniDtypeNames,
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
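gelu.py validates against PyTorch's tanh approximation (`approximate="tanh"`), which replaces the exact Gaussian CDF with

    gelu(x) ~ 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))

The gelu_backward test further down differentiates exactly this expression, which is why its reference uses the same sqrt(2/pi) and 0.044715 constants.

+# 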
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    # TODO: Uncomment the following line.
+    # ((),),
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def gelu(x):
+    return torch.nn.functional.gelu(x, approximate="tanh").to(x.dtype)
+
+
+def test(
+    handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None
+):
+    x_torch_tensor = torch.rand(shape) * 2 - 1
+
+    x = TestTensor(
+        shape,
+        x_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=x_torch_tensor,
+    )
+
+    if inplace == Inplace.INPLACE_X:
+        y = x
+    else:
+        y = TestTensor(shape, None, dtype, device)
+
+    if y.is_broadcast():
+        return
+
+    print(
+        f"Testing gelu on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
+    )
+
+    ans = gelu(x.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateGeluDescriptor(
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [x, y]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetGeluWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_gelu():
+        check_error(
+            LIBINFINIOP.infiniopGelu(
+                descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None
+            )
+        )
+
+    lib_gelu()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: gelu(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_gelu(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyGeluDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
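Reviewer note: the approximate="tanh" path exercised by this test corresponds to the closed form 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))). A minimal self-check of that identity against PyTorch, independent of InfiniOp and not part of the diff:

import math
import torch

def gelu_tanh_reference(x):
    # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    c = math.sqrt(2.0 / math.pi)
    return 0.5 * x * (1.0 + torch.tanh(c * (x + 0.044715 * x.pow(3))))

x = torch.randn(1024)
torch.testing.assert_close(
    gelu_tanh_reference(x), torch.nn.functional.gelu(x, approximate="tanh")
)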
diff --git a/test/infiniop/gelu_backward.py b/test/infiniop/gelu_backward.py
new file mode 100644
index 000000000..03b032541
--- /dev/null
+++ b/test/infiniop/gelu_backward.py
@@ -0,0 +1,200 @@
+import ctypes
+from ctypes import c_uint64
+from enum import Enum, auto
+import math
+
+import torch
+from libinfiniop import (
+    LIBINFINIOP,
+    InfiniDeviceNames,
+    InfiniDtype,
+    InfiniDtypeNames,
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    # TODO: Uncomment the following line.
+    # ((),),
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def gelu_backward(input, grad_output):
+    sqrt_2_over_pi = math.sqrt(2.0 / math.pi)
+    alpha = 0.044715
+
+    x = input
+    x_cubed = x * x * x
+    inner = sqrt_2_over_pi * (x + alpha * x_cubed)
+    tanh_inner = torch.tanh(inner)
+
+    # derivative of the tanh part
+    sech_squared = 1 - tanh_inner * tanh_inner
+    left = 0.5 * (1 + tanh_inner)
+    right = 0.5 * x * sech_squared * sqrt_2_over_pi * (1 + 3 * alpha * x * x)
+
+    grad_input = grad_output * (left + right)
+    return grad_input
+
+
+def test(
+    handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None
+):
+    x_torch_tensor = torch.rand(shape) * 2 - 1
+
+    input = TestTensor(
+        shape,
+        x_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=x_torch_tensor,
+    )
+    grad_output_torch_tensor = torch.rand(shape) * 4 - 2
+    grad_output = TestTensor(
+        shape,
+        grad_output_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=grad_output_torch_tensor,
+    )
+
+    if inplace == Inplace.INPLACE_X:
+        y = input
+    else:
+        y = TestTensor(shape, None, dtype, device)
+
+    if y.is_broadcast():
+        return
+
+    print(
+        f"Testing gelu_backward on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
+    )
+
+    ans = gelu_backward(input.torch_tensor(), grad_output.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateGeluBackWardDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            input.descriptor,
+            grad_output.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [y, input, grad_output]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetGeluBackWardWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_gelu_backward():
+        check_error(
+            LIBINFINIOP.infiniopGeluBackWard(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                y.data(),
+                input.data(),
+                grad_output.data(),
+                None,
+            )
+        )
+
+    lib_gelu_backward()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: gelu_backward(input.torch_tensor(), grad_output.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_gelu_backward(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyGeluBackWardDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/libinfiniop/datatypes.py b/test/infiniop/libinfiniop/datatypes.py
index 633aaafa7..232895d9d 100644
--- a/test/infiniop/libinfiniop/datatypes.py
+++ b/test/infiniop/libinfiniop/datatypes.py
@@ -1,3 +1,6 @@
+from enum import IntEnum
+
+
 class InfiniDtype:
     INVALID = 0
     BYTE = 1
@@ -43,3 +46,9 @@ class InfiniDtype:
     InfiniDtype.C64: "C64",
     InfiniDtype.BF16: "BF16",
 }
+
+
+class InfiniDivMode(IntEnum):
+    DEFAULT = 0
+    TRUNC = 1
+    FLOOR = 2
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index e92e77105..79df0e335 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -316,6 +316,105 @@ def relu_(lib):
     lib.infiniopDestroyReluDescriptor.argtypes = [infiniopOperatorDescriptor_t]
 
 
+@OpRegister.operator
+def relu_backward_(lib):
+    lib.infiniopCreateReluBackWardDescriptor.restype = c_int32
+    lib.infiniopCreateReluBackWardDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetReluBackWardWorkspaceSize.restype = c_int32
+    lib.infiniopGetReluBackWardWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopReluBackWard.restype = c_int32
+    lib.infiniopReluBackWard.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyReluBackWardDescriptor.restype = c_int32
+    lib.infiniopDestroyReluBackWardDescriptor.argtypes = [infiniopOperatorDescriptor_t]
+
+
+@OpRegister.operator
+def gelu_(lib):
+    lib.infiniopCreateGeluDescriptor.restype = c_int32
+    lib.infiniopCreateGeluDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetGeluWorkspaceSize.restype = c_int32
+    lib.infiniopGetGeluWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopGelu.restype = c_int32
+    lib.infiniopGelu.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyGeluDescriptor.restype = c_int32
+    lib.infiniopDestroyGeluDescriptor.argtypes = [infiniopOperatorDescriptor_t]
+
+
+@OpRegister.operator
+def gelu_backward_(lib):
+    lib.infiniopCreateGeluBackWardDescriptor.restype = c_int32
+    lib.infiniopCreateGeluBackWardDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetGeluBackWardWorkspaceSize.restype = c_int32
+    lib.infiniopGetGeluBackWardWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopGeluBackWard.restype = c_int32
+    lib.infiniopGeluBackWard.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyGeluBackWardDescriptor.restype = c_int32
+    lib.infiniopDestroyGeluBackWardDescriptor.argtypes = [infiniopOperatorDescriptor_t]
+
+
+@OpRegister.operator
+def crossentropyloss_backward_(lib):
+    lib.infiniopCreateCrossEntropyLossBackWardDescriptor.restype = c_int32
+    lib.infiniopCreateCrossEntropyLossBackWardDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetCrossEntropyLossBackWardWorkspaceSize.restype = c_int32
+    lib.infiniopGetCrossEntropyLossBackWardWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopCrossEntropyLossBackWard.restype = c_int32
+    lib.infiniopCrossEntropyLossBackWard.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyCrossEntropyLossBackWardDescriptor.restype = c_int32
+    lib.infiniopDestroyCrossEntropyLossBackWardDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t
+    ]
+
+
 @OpRegister.operator
 def rms_norm_(lib):
     lib.infiniopCreateRMSNormDescriptor.restype = c_int32
@@ -454,6 +553,7 @@ def swiglu_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
+
 @OpRegister.operator
 def conv_(lib):
     lib.infiniopCreateConvDescriptor.restype = c_int32
@@ -489,3 +589,154 @@ def conv_(lib):
     lib.infiniopDestroyConvDescriptor.argtypes = [
         infiniopOperatorDescriptor_t,
     ]
+
+
+@OpRegister.operator
+def silu_(lib):
+    lib.infiniopCreateSiluDescriptor.restype = c_int32
+    lib.infiniopCreateSiluDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetSiluWorkspaceSize.restype = c_int32
+    lib.infiniopGetSiluWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopSilu.restype = c_int32
+    lib.infiniopSilu.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroySiluDescriptor.restype = c_int32
+    lib.infiniopDestroySiluDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
+@OpRegister.operator
+def logical_equal_(lib):
+    lib.infiniopCreateLogicalEqualDescriptor.restype = c_int32
+    lib.infiniopCreateLogicalEqualDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetLogicalEqualWorkspaceSize.restype = c_int32
+    lib.infiniopGetLogicalEqualWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopLogicalEqual.restype = c_int32
+    lib.infiniopLogicalEqual.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyLogicalEqualDescriptor.restype = c_int32
+    lib.infiniopDestroyLogicalEqualDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
+@OpRegister.operator
+def logical_or_(lib):
+    lib.infiniopCreateLogicalOrDescriptor.restype = c_int32
+    lib.infiniopCreateLogicalOrDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetLogicalOrWorkspaceSize.restype = c_int32
+    lib.infiniopGetLogicalOrWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopLogicalOr.restype = c_int32
+    lib.infiniopLogicalOr.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyLogicalOrDescriptor.restype = c_int32
+    lib.infiniopDestroyLogicalOrDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
+@OpRegister.operator
+def logical_and_(lib):
+    lib.infiniopCreateLogicalAndDescriptor.restype = c_int32
+    lib.infiniopCreateLogicalAndDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetLogicalAndWorkspaceSize.restype = c_int32
+    lib.infiniopGetLogicalAndWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopLogicalAnd.restype = c_int32
+    lib.infiniopLogicalAnd.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyLogicalAndDescriptor.restype = c_int32
+    lib.infiniopDestroyLogicalAndDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
+@OpRegister.operator
+def div_(lib):
+    lib.infiniopCreateDivDescriptor.restype = c_int32
+    lib.infiniopCreateDivDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    lib.infiniopGetDivWorkspaceSize.restype = c_int32
+    lib.infiniopGetDivWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopDiv.restype = c_int32
+    lib.infiniopDiv.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_int32,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyDivDescriptor.restype = c_int32
+    lib.infiniopDestroyDivDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
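Reviewer note on the div_ registration above: infiniopDiv declares its mode parameter as c_int32, and the div test passes an InfiniDivMode member directly. This works because IntEnum members are real ints, so ctypes converts them wherever c_int32 appears in argtypes. A standalone sketch of the mechanism (not part of the diff; the enum mirrors the one added to datatypes.py):

import ctypes
from enum import IntEnum

class InfiniDivMode(IntEnum):  # mirrors the definition added to datatypes.py
    DEFAULT = 0
    TRUNC = 1
    FLOOR = 2

# IntEnum members pass ctypes' int conversion unchanged.
assert ctypes.c_int32(InfiniDivMode.FLOOR).value == 2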
diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py
index 5c8e7f80a..83d0eefaf 100644
--- a/test/infiniop/libinfiniop/utils.py
+++ b/test/infiniop/libinfiniop/utils.py
@@ -70,6 +70,22 @@ def __init__(
             self._torch_tensor = torch.rand(
                 torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
             )
+        elif mode == "bool":
+            self._torch_tensor = torch.randint(
+                0,
+                2,
+                torch_shape,
+                dtype=to_torch_dtype(dt),
+                device=torch_device_map[device],
+            )
+        elif mode == "int":
+            self._torch_tensor = torch.randint(
+                -50,
+                50,
+                torch_shape,
+                dtype=to_torch_dtype(dt),
+                device=torch_device_map[device],
+            )
         elif mode == "zeros":
             self._torch_tensor = torch.zeros(
                 torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
@@ -122,7 +138,9 @@ def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum):
 
 
 def to_torch_dtype(dt: InfiniDtype, compatability_mode=False):
-    if dt == InfiniDtype.I8:
+    if dt == InfiniDtype.BOOL:
+        return torch.bool
+    elif dt == InfiniDtype.I8:
         return torch.int8
     elif dt == InfiniDtype.I16:
         return torch.int16
@@ -330,6 +348,11 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
         actual = actual.to(torch.float32)
         desired = desired.to(torch.float32)
 
+    # If either tensor is bool, compare as 0/1 integers
+    if actual.dtype == torch.bool or desired.dtype == torch.bool:
+        actual = actual.to(torch.int8)
+        desired = desired.to(torch.int8)
+
     print_discrepancy(actual, desired, atol, rtol, equal_nan, verbose)
     np.testing.assert_allclose(
         actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True
diff --git a/test/infiniop/logical_and.py b/test/infiniop/logical_and.py
new file mode 100644
index 000000000..1b0437941
--- /dev/null
+++ b/test/infiniop/logical_and.py
@@ -0,0 +1,172 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # shape, a_stride, b_stride, c_stride
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), (10, 1)),
+    ((13, 4), (0, 1), None, None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
+    ((16, 5632), None, None, None),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
+    ((4, 4, 5632), None, None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_A = auto()
+    INPLACE_B = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_A,
+    Inplace.INPLACE_B,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.I8, InfiniDtype.BOOL]
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def logical_and(c, a, b):
+    return torch.logical_and(a, b, out=c)
+
+
+def test(
+    handle,
+    device,
+    shape,
+    a_stride=None,
+    b_stride=None,
+    c_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float16,
+    sync=None,
+):
+    a = TestTensor(shape, a_stride, dtype, device, mode="bool")
+    b = TestTensor(shape, b_stride, dtype, device, mode="bool")
+
+    if inplace == Inplace.INPLACE_A:
+        if a_stride != c_stride:
+            return
+        c = a
+    elif inplace == Inplace.INPLACE_B:
+        if c_stride != b_stride:
+            return
+        c = b
+    else:
+        c = TestTensor(shape, c_stride, dtype, device, mode="bool")
+    if c.is_broadcast():
+        return
+
+    print(
+        f"Testing LogicalAnd on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    logical_and(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateLogicalAndDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            c.descriptor,
+            a.descriptor,
+            b.descriptor,
+        )
+    )
+    for tensor in [a, b, c]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetLogicalAndWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, c.device)
+
+    def lib_logical_and():
+        check_error(
+            LIBINFINIOP.infiniopLogicalAnd(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                c.data(),
+                a.data(),
+                b.data(),
+                None,
+            )
+        )
+
+    lib_logical_and()
+
+    atol, rtol = 0, 0
+    if DEBUG:
+        debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+    assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: logical_and(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_logical_and(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(LIBINFINIOP.infiniopDestroyLogicalAndDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/logical_equal.py b/test/infiniop/logical_equal.py
new file mode 100644
index 000000000..a185ea7bc
--- /dev/null
+++ b/test/infiniop/logical_equal.py
@@ -0,0 +1,183 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # shape, a_stride, b_stride, c_stride
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), (10, 1)),
+    ((13, 4), (0, 1), None, None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
+    ((16, 5632), None, None, None),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
+    ((4, 4, 5632), None, None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_A = auto()
+    INPLACE_B = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_A,
+    Inplace.INPLACE_B,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+# Three dtype categories: bool, float, and int
+bool_types = [InfiniDtype.BOOL]
+float_types = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32, InfiniDtype.F64]
+int_types = [InfiniDtype.I8, InfiniDtype.I16, InfiniDtype.I32, InfiniDtype.I64]
+
+_TENSOR_DTYPES = bool_types + float_types + int_types
+mode_map = {
+    **{t: "bool" for t in bool_types},
+    **{t: "random" for t in float_types},
+    **{t: "int" for t in int_types},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def logical_equal(c, a, b):
+    return torch.eq(a, b, out=c)
+
+
+def test(
+    handle,
+    device,
+    shape,
+    a_stride=None,
+    b_stride=None,
+    c_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float16,
+    sync=None,
+):
+    mode = mode_map.get(dtype)
+    a = TestTensor(shape, a_stride, dtype, device, mode=mode)
+    b = TestTensor(shape, b_stride, dtype, device, mode=mode)
+
+    if inplace == Inplace.INPLACE_A:
+        if a_stride != c_stride:
+            return
+        c = a
+    elif inplace == Inplace.INPLACE_B:
+        if c_stride != b_stride:
+            return
+        c = b
+    else:
+        c = TestTensor(shape, c_stride, dtype, device, mode=mode)
+    if c.is_broadcast():
+        return
+
+    print(
+        f"Testing LogicalEqual on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    logical_equal(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateLogicalEqualDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            c.descriptor,
+            a.descriptor,
+            b.descriptor,
+        )
+    )
+    for tensor in [a, b, c]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetLogicalEqualWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, c.device)
+
+    def lib_logical_equal():
+        check_error(
+            LIBINFINIOP.infiniopLogicalEqual(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                c.data(),
+                a.data(),
+                b.data(),
+                None,
+            )
+        )
+
+    lib_logical_equal()
+
+    atol, rtol = 0, 0
+    if DEBUG:
+        debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+    assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: logical_equal(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_logical_equal(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(LIBINFINIOP.infiniopDestroyLogicalEqualDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/logical_or.py b/test/infiniop/logical_or.py
new file mode 100644
index 000000000..0a177aefc
--- /dev/null
+++ b/test/infiniop/logical_or.py
@@ -0,0 +1,172 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # shape, a_stride, b_stride, c_stride
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), (10, 1)),
+    ((13, 4), (0, 1), None, None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
+    ((16, 5632), None, None, None),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
+    ((4, 4, 5632), None, None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_A = auto()
+    INPLACE_B = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_A,
+    Inplace.INPLACE_B,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.I8, InfiniDtype.BOOL]
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def logical_or(c, a, b):
+    return torch.logical_or(a, b, out=c)
+
+
+def test(
+    handle,
+    device,
+    shape,
+    a_stride=None,
+    b_stride=None,
+    c_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float16,
+    sync=None,
+):
+    a = TestTensor(shape, a_stride, dtype, device, mode="bool")
+    b = TestTensor(shape, b_stride, dtype, device, mode="bool")
+
+    if inplace == Inplace.INPLACE_A:
+        if a_stride != c_stride:
+            return
+        c = a
+    elif inplace == Inplace.INPLACE_B:
+        if c_stride != b_stride:
+            return
+        c = b
+    else:
+        c = TestTensor(shape, c_stride, dtype, device, mode="bool")
+    if c.is_broadcast():
+        return
+
+    print(
+        f"Testing LogicalOr on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    logical_or(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateLogicalOrDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            c.descriptor,
+            a.descriptor,
+            b.descriptor,
+        )
+    )
+    for tensor in [a, b, c]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetLogicalOrWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, c.device)
+
+    def lib_logical_or():
+        check_error(
+            LIBINFINIOP.infiniopLogicalOr(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                c.data(),
+                a.data(),
+                b.data(),
+                None,
+            )
+        )
+
+    lib_logical_or()
+
+    atol, rtol = 0, 0
+    if DEBUG:
+        debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+    assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: logical_or(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_logical_or(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(LIBINFINIOP.infiniopDestroyLogicalOrDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/relu_backward.py b/test/infiniop/relu_backward.py
new file mode 100644
index 000000000..58f6b8d13
--- /dev/null
+++ b/test/infiniop/relu_backward.py
@@ -0,0 +1,177 @@
+import ctypes
+from ctypes import c_uint64
+from enum import Enum, auto
+
+import torch
+from libinfiniop import (
+    LIBINFINIOP,
+    InfiniDeviceNames,
+    InfiniDtype,
+    InfiniDtypeNames,
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    # TODO: Uncomment the following line.
+    # ((),),
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_INPUT = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_INPUT,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
+    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def relu_backward(input, grad_output):
+    dy = (input > 0).float().to(input.dtype)
+    return grad_output * dy
+
+
+def test(
+    handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None
+):
+    input_torch_tensor = torch.rand(shape) * 2 - 1
+    input = TestTensor(
+        shape,
+        input_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=input_torch_tensor,
+    )
+    grad_output_torch_tensor = torch.rand(shape) * 4 - 2
+    grad_output = TestTensor(
+        shape,
+        grad_output_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=grad_output_torch_tensor,
+    )
+
+    if inplace == Inplace.INPLACE_INPUT:
+        grad_input = input
+    else:
+        grad_input = TestTensor(shape, None, dtype, device)
+    print(
+        f"Testing ReluBackWard on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
+    )
+
+    ans = relu_backward(input.torch_tensor(), grad_output.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateReluBackWardDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            grad_input.descriptor,
+            input.descriptor,
+            grad_output.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [grad_input, input, grad_output]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetReluBackWardWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, input.device)
+
+    def lib_relu_backward():
+        check_error(
+            LIBINFINIOP.infiniopReluBackWard(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                grad_input.data(),
+                input.data(),
+                grad_output.data(),
+                None,
+            )
+        )
+
+    lib_relu_backward()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(grad_input.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(grad_input.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
profile_operation("PyTorch", lambda: relu_backward(input.torch_tensor(), grad_output.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_relu_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyReluBackWardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/silu.py b/test/infiniop/silu.py new file mode 100644 index 000000000..7f971dec7 --- /dev/null +++ b/test/infiniop/silu.py @@ -0,0 +1,173 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + profile_operation, + TestWorkspace, + get_tolerance, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. + # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-3}, + InfiniDtype.F32: {"atol": 2e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def silu(x): + return x * torch.sigmoid(x) + + +def test( + handle, + device, + shape, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + x_torch_tensor = torch.rand(shape) * 2 - 1 + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Silu on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = silu(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSiluDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ) + ) + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSiluWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_silu(): + 
diff --git a/test/infiniop/silu.py b/test/infiniop/silu.py
new file mode 100644
index 000000000..7f971dec7
--- /dev/null
+++ b/test/infiniop/silu.py
@@ -0,0 +1,173 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    profile_operation,
+    TestWorkspace,
+    get_tolerance,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    # TODO: Uncomment the following line.
+    # ((),),
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-3},
+    InfiniDtype.F32: {"atol": 2e-7, "rtol": 1e-7},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def silu(x):
+    return x * torch.sigmoid(x)
+
+
+def test(
+    handle,
+    device,
+    shape,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float16,
+    sync=None,
+):
+    x_torch_tensor = torch.rand(shape) * 2 - 1
+    x = TestTensor(
+        shape,
+        x_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=x_torch_tensor,
+    )
+    if inplace == Inplace.INPLACE_X:
+        y = x
+    else:
+        y = TestTensor(shape, None, dtype, device)
+
+    if y.is_broadcast():
+        return
+
+    print(
+        f"Testing Silu on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
+    )
+
+    ans = silu(x.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateSiluDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+        )
+    )
+    for tensor in [x, y]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetSiluWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_silu():
+        check_error(
+            LIBINFINIOP.infiniopSilu(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                y.data(),
+                x.data(),
+                None,
+            )
+        )
+
+    lib_silu()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: silu(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_silu(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroySiluDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
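Reviewer note: the hand-written gelu_backward reference used earlier in this diff can be cross-checked against autograd on the same tanh-approximate GELU. A minimal sketch in float64, independent of InfiniOp, with the same constants as the test:

import math
import torch

def gelu_backward_reference(x, grad_output):
    # Closed-form derivative of 0.5 * x * (1 + tanh(c * (x + a * x^3))),
    # mirroring the formula in test/infiniop/gelu_backward.py.
    c = math.sqrt(2.0 / math.pi)
    a = 0.044715
    t = torch.tanh(c * (x + a * x**3))
    sech2 = 1 - t * t
    return grad_output * (0.5 * (1 + t) + 0.5 * x * sech2 * c * (1 + 3 * a * x * x))

x = torch.randn(512, dtype=torch.float64, requires_grad=True)
g = torch.randn(512, dtype=torch.float64)
y = torch.nn.functional.gelu(x, approximate="tanh")
(autograd_grad,) = torch.autograd.grad(y, x, grad_outputs=g)
torch.testing.assert_close(gelu_backward_reference(x.detach(), g), autograd_grad)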