diff --git a/include/infiniop.h b/include/infiniop.h
index 58833f5c7..c3120f62a 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -4,11 +4,18 @@
 #include "infiniop/handle.h"
 #include "infiniop/ops/add.h"
 #include "infiniop/ops/attention.h"
+#include "infiniop/ops/averagepool.h"
+#include "infiniop/ops/averagepool_backward.h"
 #include "infiniop/ops/causal_softmax.h"
 #include "infiniop/ops/clip.h"
 #include "infiniop/ops/conv.h"
+#include "infiniop/ops/conv_backward.h"
+#include "infiniop/ops/cross_entropy_loss.h"
 #include "infiniop/ops/dequantize.h"
 #include "infiniop/ops/gemm.h"
+#include "infiniop/ops/interpolate_nearest.h"
+#include "infiniop/ops/maxpool.h"
+#include "infiniop/ops/maxpool_backward.h"
 #include "infiniop/ops/mul.h"
 #include "infiniop/ops/random_sample.h"
 #include "infiniop/ops/rearrange.h"
diff --git a/include/infiniop/ops/averagepool.h b/include/infiniop/ops/averagepool.h
new file mode 100644
index 000000000..87e857175
--- /dev/null
+++ b/include/infiniop/ops/averagepool.h
@@ -0,0 +1,29 @@
+#ifndef __INFINIOP_AVERAGEPOOL_H__
+#define __INFINIOP_AVERAGEPOOL_H__
+
+#include "../operator_descriptor.h"
+
+__C typedef struct InfiniopDescriptor *infiniopAvgPoolDescriptor_t;
+
+__C infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle,
+                                                   infiniopAvgPoolDescriptor_t *desc_ptr,
+                                                   infiniopTensorDescriptor_t output_desc,
+                                                   infiniopTensorDescriptor_t input_desc,
+                                                   void *kernel_size,
+                                                   void *strides,
+                                                   void *pads,
+                                                   bool ceil_mode);
+
+__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc,
+                                                   size_t *size);
+
+__C infiniStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc,
+                                   void *workspace,
+                                   size_t workspace_size,
+                                   void *output,
+                                   const void *input,
+                                   void *stream);
+
+__C infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc);
+
+#endif // __INFINIOP_AVERAGEPOOL_H__
diff --git a/include/infiniop/ops/averagepool_backward.h b/include/infiniop/ops/averagepool_backward.h
new file mode 100644
index 000000000..9229e9e1b
--- /dev/null
+++ b/include/infiniop/ops/averagepool_backward.h
@@ -0,0 +1,31 @@
+#ifndef __INFINIOP_AVERAGEPOOL_BACKWARD_H__
+#define __INFINIOP_AVERAGEPOOL_BACKWARD_H__
+
+#include "../operator_descriptor.h"
+
+__C typedef struct InfiniopDescriptor *infiniopAvgPoolBackwardDescriptor_t;
+
+__C infiniStatus_t infiniopCreateAvgPoolBackwardDescriptor(infiniopHandle_t handle,
+                                                           infiniopAvgPoolBackwardDescriptor_t *desc_ptr,
+                                                           infiniopTensorDescriptor_t grad_input_desc,
+                                                           infiniopTensorDescriptor_t grad_output_desc,
+                                                           infiniopTensorDescriptor_t input_desc,
+                                                           void *kernel_size,
+                                                           void *strides,
+                                                           void *pads,
+                                                           bool ceil_mode);
+
+__C infiniStatus_t infiniopGetAvgPoolBackwardWorkspaceSize(infiniopAvgPoolBackwardDescriptor_t desc,
+                                                           size_t *size);
+
+__C infiniStatus_t infiniopAvgPoolBackward(infiniopAvgPoolBackwardDescriptor_t desc,
+                                           void *workspace,
+                                           size_t workspace_size,
+                                           void *grad_input,
+                                           const void *grad_output,
+                                           const void *input,
+                                           void *stream);
+
+__C infiniStatus_t infiniopDestroyAvgPoolBackwardDescriptor(infiniopAvgPoolBackwardDescriptor_t desc);
+
+#endif // __INFINIOP_AVERAGEPOOL_BACKWARD_H__
diff --git a/include/infiniop/ops/conv_backward.h b/include/infiniop/ops/conv_backward.h
new file mode 100644
index 000000000..a692ed0eb
--- /dev/null
+++ b/include/infiniop/ops/conv_backward.h
@@ -0,0 +1,34 @@
+#ifndef __INFINIOP_CONV_BACKWARD_API_H__
+#define __INFINIOP_CONV_BACKWARD_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopConvBackwardDescriptor_t;
+
+__C infiniStatus_t infiniopCreateConvBackwardDescriptor(infiniopHandle_t handle,
+                                                        infiniopConvBackwardDescriptor_t *desc_ptr,
+                                                        infiniopTensorDescriptor_t grad_output_desc,
+                                                        infiniopTensorDescriptor_t input_desc,
+                                                        infiniopTensorDescriptor_t weight_desc,
+                                                        infiniopTensorDescriptor_t bias_desc,
+                                                        void *pads,
+                                                        void *strides,
+                                                        void *dilations,
+                                                        size_t n);
+
+__C infiniStatus_t infiniopGetConvBackwardWorkspaceSize(infiniopConvBackwardDescriptor_t desc, size_t *size);
+
+__C infiniStatus_t infiniopConvBackward(infiniopConvBackwardDescriptor_t desc,
+                                        void *workspace,
+                                        size_t workspace_size,
+                                        void *grad_input,
+                                        void *grad_weight,
+                                        void *grad_bias,
+                                        const void *grad_output,
+                                        const void *input,
+                                        const void *weight,
+                                        void *stream);
+
+__C infiniStatus_t infiniopDestroyConvBackwardDescriptor(infiniopConvBackwardDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/cross_entropy_loss.h b/include/infiniop/ops/cross_entropy_loss.h
new file mode 100644
index 000000000..8b59843c9
--- /dev/null
+++ b/include/infiniop/ops/cross_entropy_loss.h
@@ -0,0 +1,27 @@
+#ifndef __INFINIOP_CROSS_ENTROPY_LOSS_API_H__
+#define __INFINIOP_CROSS_ENTROPY_LOSS_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopCrossEntropyLossDescriptor_t;
+
+__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor(infiniopHandle_t handle,
+                                                            infiniopCrossEntropyLossDescriptor_t *desc_ptr,
+                                                            infiniopTensorDescriptor_t loss_desc,
+                                                            infiniopTensorDescriptor_t logits_desc,
+                                                            infiniopTensorDescriptor_t target_desc);
+
+__C infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize(infiniopCrossEntropyLossDescriptor_t desc,
+                                                            size_t *size);
+
+__C infiniStatus_t infiniopCrossEntropyLoss(infiniopCrossEntropyLossDescriptor_t desc,
+                                            void *workspace,
+                                            size_t workspace_size,
+                                            void *loss,
+                                            const void *logits,
+                                            const void *target,
+                                            void *stream);
+
+__C infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor(infiniopCrossEntropyLossDescriptor_t desc);
+
+#endif // __INFINIOP_CROSS_ENTROPY_LOSS_API_H__
diff --git a/include/infiniop/ops/interpolate_nearest.h b/include/infiniop/ops/interpolate_nearest.h
new file mode 100644
index 000000000..7f970dc38
--- /dev/null
+++ b/include/infiniop/ops/interpolate_nearest.h
@@ -0,0 +1,25 @@
+#ifndef __INFINIOP_INTERPOLATE_NEAREST_H__
+#define __INFINIOP_INTERPOLATE_NEAREST_H__
+
+#include "../operator_descriptor.h"
+
+__C typedef struct InfiniopDescriptor *infiniopInterpolateNearestDescriptor_t;
+
+__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor(infiniopHandle_t handle,
+                                                              infiniopInterpolateNearestDescriptor_t *desc_ptr,
+                                                              infiniopTensorDescriptor_t output_desc,
+                                                              infiniopTensorDescriptor_t input_desc);
+
+__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize(infiniopInterpolateNearestDescriptor_t desc,
+                                                              size_t *size);
+
+__C infiniStatus_t infiniopInterpolateNearest(infiniopInterpolateNearestDescriptor_t desc,
+                                              void *workspace,
+                                              size_t workspace_size,
+                                              void *output,
+                                              const void *input,
+                                              void *stream);
+
+__C infiniStatus_t infiniopDestroyInterpolateNearestDescriptor(infiniopInterpolateNearestDescriptor_t desc);
+
+#endif // __INFINIOP_INTERPOLATE_NEAREST_H__
diff --git a/include/infiniop/ops/maxpool.h b/include/infiniop/ops/maxpool.h
new file mode 100644
index 000000000..e47a43aed
--- /dev/null
+++ b/include/infiniop/ops/maxpool.h
@@ -0,0 +1,29 @@
+#ifndef __INFINIOP_MAX_POOL_H__
+#define __INFINIOP_MAX_POOL_H__
+
+#include "../operator_descriptor.h"
+
+__C typedef struct InfiniopDescriptor *infiniopMaxPoolDescriptor_t;
+
+__C infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle,
+                                                   infiniopMaxPoolDescriptor_t *desc_ptr,
+                                                   infiniopTensorDescriptor_t output_desc,
+                                                   infiniopTensorDescriptor_t input_desc,
+                                                   void *kernel_size,
+                                                   void *strides,
+                                                   void *pads,
+                                                   bool ceil_mode);
+
+__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc,
+                                                   size_t *size);
+
+__C infiniStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc,
+                                   void *workspace,
+                                   size_t workspace_size,
+                                   void *output,
+                                   const void *input,
+                                   void *stream);
+
+__C infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc);
+
+#endif // __INFINIOP_MAX_POOL_H__
diff --git a/include/infiniop/ops/maxpool_backward.h b/include/infiniop/ops/maxpool_backward.h
new file mode 100644
index 000000000..361c04895
--- /dev/null
+++ b/include/infiniop/ops/maxpool_backward.h
@@ -0,0 +1,31 @@
+#ifndef __INFINIOP_MAXPOOL_BACKWARD_H__
+#define __INFINIOP_MAXPOOL_BACKWARD_H__
+
+#include "../operator_descriptor.h"
+
+__C typedef struct InfiniopDescriptor *infiniopMaxPoolBackwardDescriptor_t;
+
+__C infiniStatus_t infiniopCreateMaxPoolBackwardDescriptor(infiniopHandle_t handle,
+                                                           infiniopMaxPoolBackwardDescriptor_t *desc_ptr,
+                                                           infiniopTensorDescriptor_t grad_input_desc,
+                                                           infiniopTensorDescriptor_t grad_output_desc,
+                                                           infiniopTensorDescriptor_t input_desc,
+                                                           void *kernel_size,
+                                                           void *strides,
+                                                           void *pads,
+                                                           bool ceil_mode);
+
+__C infiniStatus_t infiniopGetMaxPoolBackwardWorkspaceSize(infiniopMaxPoolBackwardDescriptor_t desc,
+                                                           size_t *size);
+
+__C infiniStatus_t infiniopMaxPoolBackward(infiniopMaxPoolBackwardDescriptor_t desc,
+                                           void *workspace,
+                                           size_t workspace_size,
+                                           void *grad_input,
+                                           const void *grad_output,
+                                           const void *input,
+                                           void *stream);
+
+__C infiniStatus_t infiniopDestroyMaxPoolBackwardDescriptor(infiniopMaxPoolBackwardDescriptor_t desc);
+
+#endif // __INFINIOP_MAXPOOL_BACKWARD_H__
diff --git a/scripts/python_test.py b/scripts/python_test.py
index 5348c8c69..710156838 100644
--- a/scripts/python_test.py
+++ b/scripts/python_test.py
@@ -25,6 +25,13 @@ def run_tests(args):
         "sub.py",
         "swiglu.py",
         "softplus.py",
+        "averagepool_backward.py",
+        "averagepool.py",
+        "maxpool_backward.py",
+        "maxpool.py",
+        "interpolate_nearest.py",
+        "conv_backward.py",
+        "cross_entropy_loss.py",
     ]:
         result = subprocess.run(
             f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp
index 3820f7cfd..cb417761f 100644
--- a/src/infiniop-test/include/ops.hpp
+++ b/src/infiniop-test/include/ops.hpp
@@ -16,6 +16,13 @@ DECLARE_INFINIOP_TEST(add)
 DECLARE_INFINIOP_TEST(causal_softmax)
 DECLARE_INFINIOP_TEST(rearrange)
 DECLARE_INFINIOP_TEST(sub)
+DECLARE_INFINIOP_TEST(cross_entropy_loss)
+DECLARE_INFINIOP_TEST(averagepool)
+DECLARE_INFINIOP_TEST(averagepool_backward)
+DECLARE_INFINIOP_TEST(interpolate_nearest)
+DECLARE_INFINIOP_TEST(conv_backward)
+DECLARE_INFINIOP_TEST(maxpool)
+DECLARE_INFINIOP_TEST(maxpool_backward)

 #define REGISTER_INFINIOP_TEST(name) \
     {                                \
@@ -30,19 +37,26 @@ DECLARE_INFINIOP_TEST(sub)
 /*
  * Register all the tests here
  */
-#define TEST_BUILDER_MAPPINGS                     \
-    {                                             \
-        REGISTER_INFINIOP_TEST(gemm)              \
-        REGISTER_INFINIOP_TEST(random_sample)     \
-        REGISTER_INFINIOP_TEST(add)               \
-        REGISTER_INFINIOP_TEST(mul)               \
-        REGISTER_INFINIOP_TEST(clip)              \
-        REGISTER_INFINIOP_TEST(swiglu)            \
-        REGISTER_INFINIOP_TEST(rope)              \
-        REGISTER_INFINIOP_TEST(rms_norm)          \
-
REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(cross_entropy_loss) \ + REGISTER_INFINIOP_TEST(averagepool) \ + REGISTER_INFINIOP_TEST(averagepool_backward) \ + REGISTER_INFINIOP_TEST(interpolate_nearest) \ + REGISTER_INFINIOP_TEST(conv_backward) \ + REGISTER_INFINIOP_TEST(maxpool) \ + REGISTER_INFINIOP_TEST(maxpool_backward) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/averagepool.cpp b/src/infiniop-test/src/ops/averagepool.cpp new file mode 100644 index 000000000..4f6a80201 --- /dev/null +++ b/src/infiniop-test/src/ops/averagepool.cpp @@ -0,0 +1,265 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::averagepool { + +struct Test::Attributes { + // 输入与期望输出 + std::shared_ptr input; + std::shared_ptr expected_output; + + // 平均池化参数 + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->expected_output = tensors["output"]; + + // (N, C, spatial...) 
→ 池化维度数 = rank - 2 + size_t pool_ndim = test->_attributes->input->shape().size() - 2; + if (pool_ndim == 0) { + throw std::runtime_error( + "Input tensor must have at least 3 dimensions (N, C, ...)"); + } + + // ---- 解析并广播 kernel_size ---- + auto kernel_size_data = attributes["kernel_size"]; + if (kernel_size_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid kernel_size data size"); + } + size_t kernel_size_count = kernel_size_data.size() / sizeof(int); + const int *kernel_size_ptr = reinterpret_cast(kernel_size_data.data()); + + if (kernel_size_count == pool_ndim) { + test->_attributes->kernel_size.clear(); + for (size_t i = 0; i < kernel_size_count; ++i) { + test->_attributes->kernel_size.push_back( + static_cast(kernel_size_ptr[i])); + } + } else { + test->_attributes->kernel_size.assign( + pool_ndim, static_cast(kernel_size_ptr[0])); + } + + // ---- 解析并广播 stride ---- + auto stride_data = attributes["stride"]; + if (stride_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + size_t stride_count = stride_data.size() / sizeof(int); + const int *stride_ptr = reinterpret_cast(stride_data.data()); + + if (stride_count == pool_ndim) { + test->_attributes->stride.clear(); + for (size_t i = 0; i < stride_count; ++i) { + test->_attributes->stride.push_back( + static_cast(stride_ptr[i])); + } + } else { + test->_attributes->stride.assign( + pool_ndim, static_cast(stride_ptr[0])); + } + + // ---- 解析并广播 padding ---- + auto padding_data = attributes["padding"]; + if (padding_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + size_t padding_count = padding_data.size() / sizeof(int); + const int *padding_ptr = reinterpret_cast(padding_data.data()); + + if (padding_count == pool_ndim) { + test->_attributes->padding.clear(); + for (size_t i = 0; i < padding_count; ++i) { + test->_attributes->padding.push_back( + static_cast(padding_ptr[i])); + } + } else { + test->_attributes->padding.assign( + pool_ndim, static_cast(padding_ptr[0])); + } + + // ---- 解析 ceil_mode ---- + auto ceil_mode_data = attributes["ceil_mode"]; + if (ceil_mode_data.size() == sizeof(bool)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()); + } else if (ceil_mode_data.size() == sizeof(uint8_t)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()) != 0; + } else { + throw std::runtime_error("Invalid ceil_mode data size"); + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopAvgPoolDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto expected_output = _attributes->expected_output; + + auto input_dtype = input->ggml_type(); + auto output_shape = expected_output->shape(); + + size_t output_size_bytes = 1; + for (auto d : output_shape) { + output_size_bytes *= d; + } + output_size_bytes *= ggmlTypeSize(input_dtype); + + auto output_memory = std::make_shared(output_size_bytes, device, device_id); + + std::vector output_strides(output_shape.size()); + if (!output_shape.empty()) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; --i) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, input_dtype); + + // 参数指针(按底层接口需要传 void*) + void *kernel_size_ptr = 
_attributes->kernel_size.data(); + void *stride_ptr = _attributes->stride.data(); + void *padding_ptr = _attributes->padding.data(); + + // ---- 创建算子描述符 ---- + CHECK_OR(infiniopCreateAvgPoolDescriptor( + handle, &op_desc, + actual_output->desc(), + input->desc(), + kernel_size_ptr, + stride_ptr, + padding_ptr, + _attributes->ceil_mode), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to create avgpool descriptor.")); + + // ---- 获取工作空间大小 ---- + size_t workspace_size = 0; + CHECK_OR(infiniopGetAvgPoolWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to get workspace size.")); + + // ---- 分配工作空间(如需要)---- + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to allocate workspace.")); + } + + // ---- 执行平均池化 ---- + CHECK_OR(infiniopAvgPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, + "Failed during avgpool execution.")); + + // ---- 精度校验 ---- + try { + allClose(actual_output, expected_output, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyAvgPoolDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // ---- 性能测试 ---- + double elapsed_time = benchmark( + [=]() { + infiniopAvgPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // ---- 清理资源 ---- + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyAvgPoolDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"kernel_size", "stride", "padding", "ceil_mode"}; +} + +std::vector Test::tensor_names() { + return {"input", "output"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- expected_output: " << _attributes->expected_output->info() << std::endl; + + oss << "- kernel_size: ["; + for (size_t i = 0; i < _attributes->kernel_size.size(); ++i) { + if (i) { + oss << ", "; + } + oss << _attributes->kernel_size[i]; + } + oss << "]\n- stride: ["; + for (size_t i = 0; i < _attributes->stride.size(); ++i) { + if (i) { + oss << ", "; + } + oss << _attributes->stride[i]; + } + oss << "]\n- padding: ["; + for (size_t i = 0; i < _attributes->padding.size(); ++i) { + if (i) { + oss << ", "; + } + oss << _attributes->padding[i]; + } + oss << "]\n- ceil_mode: " + << (_attributes->ceil_mode ? 
"true" : "false") << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::averagepool diff --git a/src/infiniop-test/src/ops/averagepool_backward.cpp b/src/infiniop-test/src/ops/averagepool_backward.cpp new file mode 100644 index 000000000..52949fdc1 --- /dev/null +++ b/src/infiniop-test/src/ops/averagepool_backward.cpp @@ -0,0 +1,254 @@ +// averagepool_backward.cpp +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::averagepool_backward { + +struct Test::Attributes { + // 张量 + std::shared_ptr input; // 前向输入 X + std::shared_ptr grad_output; // 上游梯度 dY + std::shared_ptr expected_grad_input; // 期望梯度 dX + + // 平均池化参数 + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; +}; + +static void broadcast_or_fill(std::vector &dst, + const int *src, size_t src_cnt, + size_t ndim) { + dst.clear(); + if (src_cnt == ndim) { + for (size_t i = 0; i < ndim; ++i) { + dst.push_back(static_cast(src[i])); + } + } else { + // 将单个值广播到所有池化维度 + dst.assign(ndim, static_cast(src[0])); + } +} + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test: missing attributes or tensors"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->expected_grad_input = tensors["grad_input"]; + + // 维度:去掉 N、C 后的空间维度数 + const auto &in_shape = test->_attributes->input->shape(); + if (in_shape.size() < 3) { + throw std::runtime_error("Input tensor rank must be >= 3 (N, C, ...)"); + } + size_t pool_ndim = in_shape.size() - 2; + + // --- kernel_size --- + { + const auto &buf = attributes["kernel_size"]; + if (buf.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid kernel_size data size"); + } + size_t cnt = buf.size() / sizeof(int); + const int *p = reinterpret_cast(buf.data()); + broadcast_or_fill(test->_attributes->kernel_size, p, cnt, pool_ndim); + } + + // --- stride --- + { + const auto &buf = attributes["stride"]; + if (buf.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + size_t cnt = buf.size() / sizeof(int); + const int *p = reinterpret_cast(buf.data()); + broadcast_or_fill(test->_attributes->stride, p, cnt, pool_ndim); + } + + // --- padding --- + { + const auto &buf = attributes["padding"]; + if (buf.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + size_t cnt = buf.size() / sizeof(int); + const int *p = reinterpret_cast(buf.data()); + broadcast_or_fill(test->_attributes->padding, p, cnt, pool_ndim); + } + + // --- ceil_mode --- + { + const auto &buf = attributes["ceil_mode"]; + if (buf.size() == sizeof(bool)) { + test->_attributes->ceil_mode = *reinterpret_cast(buf.data()); + } else if (buf.size() == sizeof(uint8_t)) { + test->_attributes->ceil_mode = (*reinterpret_cast(buf.data()) != 0); + } else { + throw std::runtime_error("Invalid ceil_mode data size"); + } + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t 
warm_ups, size_t iterations) { + + // 把张量放到目标设备 + auto input = _attributes->input->to(device, device_id); // X + auto grad_output = _attributes->grad_output->to(device, device_id); // dY + auto expected_grad_input = _attributes->expected_grad_input; // 参考 dX + + // 构造实际输出 dX 的张量(形状等于 input,dtype 等于 input) + const auto &in_shape = input->shape(); + std::vector in_strides(in_shape.size()); + if (!in_shape.empty()) { + in_strides.back() = 1; + for (int i = static_cast(in_shape.size()) - 2; i >= 0; --i) { + in_strides[i] = in_strides[i + 1] * in_shape[i + 1]; + } + } + size_t dx_bytes = ggmlTypeSize(input->ggml_type()); + for (auto d : in_shape) { + dx_bytes *= d; + } + + auto dx_mem = std::make_shared(dx_bytes, device, device_id); + auto actual_grad_input = std::make_shared( + dx_mem, 0, in_shape, in_strides, input->ggml_type()); + + // 参数指针 + void *kernel_size_ptr = _attributes->kernel_size.data(); + void *stride_ptr = _attributes->stride.data(); + void *padding_ptr = _attributes->padding.data(); + + // --- 创建反向算子描述符 --- + infiniopAvgPoolBackwardDescriptor_t bwd_desc; + CHECK_OR(infiniopCreateAvgPoolBackwardDescriptor( + handle, &bwd_desc, + actual_grad_input->desc(), // grad_input_desc (dX) + grad_output->desc(), // grad_output_desc (dY) + input->desc(), // input_desc (X) + kernel_size_ptr, + stride_ptr, + padding_ptr, + _attributes->ceil_mode), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to create averagepool backward descriptor.")); + + // --- 获取工作空间大小 --- + size_t workspace_size = 0; + CHECK_OR(infiniopGetAvgPoolBackwardWorkspaceSize(bwd_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to get backward workspace size.")); + + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to allocate backward workspace.")); + } + + // --- 执行反向:dX = AvgPoolBackward(dY, X, ...) 
--- + CHECK_OR(infiniopAvgPoolBackward( + bwd_desc, workspace, workspace_size, + actual_grad_input->data(), // dX + grad_output->data(), // dY + input->data(), // X + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, + "Failed during averagepool backward execution.")); + + // --- 校验数值 --- + try { + allClose(actual_grad_input, expected_grad_input, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyAvgPoolBackwardDescriptor(bwd_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // --- 基准测试 --- + double elapsed_time = benchmark( + [=]() { + infiniopAvgPoolBackward( + bwd_desc, workspace, workspace_size, + actual_grad_input->data(), + grad_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // --- 清理 --- + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyAvgPoolBackwardDescriptor(bwd_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"kernel_size", "stride", "padding", "ceil_mode"}; +} + +std::vector Test::tensor_names() { + // 需要的输入张量 + return {"input", "grad_output", "grad_input"}; +} + +std::vector Test::output_names() { + // 无额外导出 + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << "\n"; + oss << "- input: " << _attributes->input->info() << "\n"; + oss << "- grad_output (dY): " << _attributes->grad_output->info() << "\n"; + oss << "- expected_grad_input: " << _attributes->expected_grad_input->info() << "\n"; + + auto dump = [&](const char *name, const std::vector &v) { + oss << "- " << name << ": ["; + for (size_t i = 0; i < v.size(); ++i) { + if (i) { + oss << ", "; + } + oss << v[i]; + } + oss << "]\n"; + }; + dump("kernel_size", _attributes->kernel_size); + dump("stride", _attributes->stride); + dump("padding", _attributes->padding); + + oss << "- ceil_mode: " << (_attributes->ceil_mode ? 
"true" : "false") << "\n"; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << "\n"; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::averagepool_backward diff --git a/src/infiniop-test/src/ops/conv_backward.cpp b/src/infiniop-test/src/ops/conv_backward.cpp new file mode 100644 index 000000000..ed7814bff --- /dev/null +++ b/src/infiniop-test/src/ops/conv_backward.cpp @@ -0,0 +1,335 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include +#include + +namespace infiniop_test::conv_backward { + +struct Test::Attributes { + std::shared_ptr grad_output; + std::shared_ptr input; + std::shared_ptr weight; + std::shared_ptr bias; + + std::shared_ptr expected_grad_input; + std::shared_ptr expected_grad_weight; + std::shared_ptr expected_grad_bias; + + std::vector stride; + std::vector padding; + std::vector dilation; + int groups; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->weight = tensors["weight"]; + + auto bias_it = tensors.find("bias"); + if (bias_it != tensors.end()) { + test->_attributes->bias = bias_it->second; + } + + test->_attributes->expected_grad_input = tensors["grad_input"]; + test->_attributes->expected_grad_weight = tensors["grad_weight"]; + + auto grad_bias_it = tensors.find("grad_bias"); + if (grad_bias_it != tensors.end()) { + test->_attributes->expected_grad_bias = grad_bias_it->second; + } + + size_t pool_ndim = test->_attributes->input->shape().size() - 2; + if (pool_ndim == 0) { + throw std::runtime_error("Input tensor must have at least 3 dimensions (N, C, ...)"); + } + + auto stride_data = attributes["stride"]; + auto padding_data = attributes["padding"]; + auto dilation_data = attributes["dilation"]; + + size_t stride_count = stride_data.size() / sizeof(int); + size_t padding_count = padding_data.size() / sizeof(int); + size_t dilation_count = dilation_data.size() / sizeof(int); + + if (stride_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + const int *stride_ptr = reinterpret_cast(stride_data.data()); + if (stride_count == pool_ndim) { + test->_attributes->stride.clear(); + for (size_t i = 0; i < stride_count; i++) { + test->_attributes->stride.push_back(static_cast(stride_ptr[i])); + } + } else { + test->_attributes->stride.assign(pool_ndim, static_cast(stride_ptr[0])); + } + + if (padding_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + const int *padding_ptr = reinterpret_cast(padding_data.data()); + if (padding_count == pool_ndim) { + test->_attributes->padding.clear(); + for (size_t i = 0; i < padding_count; i++) { + test->_attributes->padding.push_back(static_cast(padding_ptr[i])); + } + } else { + test->_attributes->padding.assign(pool_ndim, static_cast(padding_ptr[0])); + } + + if (dilation_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid dilation data size"); + } + const int *dilation_ptr = reinterpret_cast(dilation_data.data()); + if (dilation_count == pool_ndim) { + 
test->_attributes->dilation.clear(); + for (size_t i = 0; i < dilation_count; i++) { + test->_attributes->dilation.push_back(static_cast(dilation_ptr[i])); + } + } else { + test->_attributes->dilation.assign(pool_ndim, static_cast(dilation_ptr[0])); + } + + test->_attributes->groups = *reinterpret_cast(attributes["groups"].data()); + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + infiniopConvBackwardDescriptor_t op_desc; + + auto grad_output = _attributes->grad_output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto weight = _attributes->weight->to(device, device_id); + auto bias = _attributes->bias ? _attributes->bias->to(device, device_id) : nullptr; + + auto expected_grad_input = _attributes->expected_grad_input; + auto expected_grad_weight = _attributes->expected_grad_weight; + auto expected_grad_bias = _attributes->expected_grad_bias; + + auto input_dtype = input->ggml_type(); + + auto grad_input_shape = expected_grad_input->shape(); + size_t grad_input_size = 1; + for (auto dim : grad_input_shape) { + grad_input_size *= dim; + } + grad_input_size *= ggmlTypeSize(input_dtype); + + auto grad_input_memory = std::make_shared(grad_input_size, device, device_id); + std::vector grad_input_strides(grad_input_shape.size()); + + if (grad_input_shape.size() > 0) { + grad_input_strides[grad_input_shape.size() - 1] = 1; + for (int i = static_cast(grad_input_shape.size()) - 2; i >= 0; i--) { + grad_input_strides[i] = grad_input_strides[i + 1] * grad_input_shape[i + 1]; + } + } + + auto actual_grad_input = std::make_shared( + grad_input_memory, 0, grad_input_shape, grad_input_strides, input_dtype); + + auto grad_weight_shape = expected_grad_weight->shape(); + size_t grad_weight_size = 1; + for (auto dim : grad_weight_shape) { + grad_weight_size *= dim; + } + grad_weight_size *= ggmlTypeSize(input_dtype); + + auto grad_weight_memory = std::make_shared(grad_weight_size, device, device_id); + std::vector grad_weight_strides(grad_weight_shape.size()); + + if (grad_weight_shape.size() > 0) { + grad_weight_strides[grad_weight_shape.size() - 1] = 1; + for (int i = static_cast(grad_weight_shape.size()) - 2; i >= 0; i--) { + grad_weight_strides[i] = grad_weight_strides[i + 1] * grad_weight_shape[i + 1]; + } + } + + auto actual_grad_weight = std::make_shared( + grad_weight_memory, 0, grad_weight_shape, grad_weight_strides, input_dtype); + + std::shared_ptr actual_grad_bias = nullptr; + if (bias && expected_grad_bias) { + auto grad_bias_shape = expected_grad_bias->shape(); + size_t grad_bias_size = 1; + for (auto dim : grad_bias_shape) { + grad_bias_size *= dim; + } + grad_bias_size *= ggmlTypeSize(input_dtype); + + auto grad_bias_memory = std::make_shared(grad_bias_size, device, device_id); + std::vector grad_bias_strides(grad_bias_shape.size()); + + if (grad_bias_shape.size() > 0) { + grad_bias_strides[grad_bias_shape.size() - 1] = 1; + for (int i = static_cast(grad_bias_shape.size()) - 2; i >= 0; i--) { + grad_bias_strides[i] = grad_bias_strides[i + 1] * grad_bias_shape[i + 1]; + } + } + + actual_grad_bias = std::make_shared( + grad_bias_memory, 0, grad_bias_shape, grad_bias_strides, input_dtype); + } + + void *pads_ptr = _attributes->padding.data(); + void *strides_ptr = _attributes->stride.data(); + void *dilations_ptr = _attributes->dilation.data(); + + CHECK_OR(infiniopCreateConvBackwardDescriptor( + handle, &op_desc, + grad_output->desc(), + input->desc(), 
+ weight->desc(), + bias ? bias->desc() : nullptr, + pads_ptr, + strides_ptr, + dilations_ptr, + _attributes->groups), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create conv_backward descriptor.")); + + // 获取工作空间大小 + size_t workspace_size; + CHECK_OR(infiniopGetConvBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + // 分配工作空间 + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + CHECK_OR(infiniopConvBackward( + op_desc, workspace, workspace_size, + actual_grad_input->data(), // void *grad_input + actual_grad_weight->data(), // void *grad_weight + actual_grad_bias ? actual_grad_bias->data() : nullptr, // void *grad_bias + grad_output->data(), // const void *grad_output + input->data(), // const void *input + weight->data(), // const void *weight + nullptr), // void *stream + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during conv_backward execution.")); + + // 验证结果 + try { + allClose(actual_grad_input, expected_grad_input, _rtol, _atol); + allClose(actual_grad_weight, expected_grad_weight, _rtol, _atol); + + if (actual_grad_bias && expected_grad_bias) { + allClose(actual_grad_bias, expected_grad_bias, _rtol, _atol); + } + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyConvBackwardDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 性能测试 + double elapsed_time = benchmark( + [=]() { + infiniopConvBackward( + op_desc, workspace, workspace_size, + actual_grad_input->data(), + actual_grad_weight->data(), + actual_grad_bias ? actual_grad_bias->data() : nullptr, + grad_output->data(), + input->data(), + weight->data(), + nullptr); + }, + warm_ups, iterations); + + // 清理资源 + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyConvBackwardDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"stride", "padding", "dilation", "groups"}; +} + +std::vector Test::tensor_names() { + return {"grad_output", "input", "weight", "bias", "grad_input", "grad_weight", "grad_bias"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- weight: " << _attributes->weight->info() << std::endl; + if (_attributes->bias) { + oss << "- bias: " << _attributes->bias->info() << std::endl; + } + oss << "- expected_grad_input: " << _attributes->expected_grad_input->info() << std::endl; + oss << "- expected_grad_weight: " << _attributes->expected_grad_weight->info() << std::endl; + if (_attributes->expected_grad_bias) { + oss << "- expected_grad_bias: " << _attributes->expected_grad_bias->info() << std::endl; + } + + oss << "- stride: ["; + for (size_t i = 0; i < _attributes->stride.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->stride[i]; + } + oss << "]" << std::endl; + + oss << "- padding: ["; + for (size_t i = 0; i < _attributes->padding.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->padding[i]; + } + oss << "]" << std::endl; + + oss << "- dilation: ["; + for (size_t i = 0; i < _attributes->dilation.size(); ++i) { + if (i > 0) { + oss << ", "; 
+ } + oss << _attributes->dilation[i]; + } + oss << "]" << std::endl; + + oss << "- groups: " << _attributes->groups << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::conv_backward diff --git a/src/infiniop-test/src/ops/cross_entropy_loss.cpp b/src/infiniop-test/src/ops/cross_entropy_loss.cpp new file mode 100644 index 000000000..7fac231e0 --- /dev/null +++ b/src/infiniop-test/src/ops/cross_entropy_loss.cpp @@ -0,0 +1,156 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cross_entropy_loss { + +struct Test::Attributes { + // 输入张量 + std::shared_ptr logits; + std::shared_ptr target; + std::shared_ptr loss; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + // 检查必需的张量是否存在 + if (!check_names(tensors, Test::tensor_names()) || !check_names(attributes, Test::attribute_names())) { + throw std::runtime_error("Invalid Test: Missing required tensors."); + } + + test->_attributes->logits = tensors["logits"]; + test->_attributes->target = tensors["target"]; + test->_attributes->loss = tensors["loss"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopCrossEntropyLossDescriptor_t op_desc; + + // 将输入张量移动到目标设备 + auto logits = _attributes->logits->to(device, device_id); + auto target = _attributes->target->to(device, device_id); + auto loss = _attributes->loss; + + // 根据期望输出的形状创建实际输出张量 + auto output_shape = loss->shape(); + size_t output_size = 1; + for (auto dim : output_shape) { + output_size *= dim; + } + output_size *= ggmlTypeSize(logits->ggml_type()); + + auto output_memory = std::make_shared(output_size, device, device_id); + std::vector output_strides(static_cast(output_shape.size())); + if (output_shape.size() > 0) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; i--) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, logits->ggml_type()); + + // 1. 创建算子描述符 + CHECK_OR(infiniopCreateCrossEntropyLossDescriptor( + handle, &op_desc, + actual_output->desc(), + logits->desc(), + target->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create cross entropy loss descriptor.")); + + // 2. 获取并分配工作空间 + size_t workspace_size; + CHECK_OR(infiniopGetCrossEntropyLossWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // 3. 执行计算 + CHECK_OR(infiniopCrossEntropyLoss( + op_desc, workspace, workspace_size, + actual_output->data(), + logits->data(), + target->data(), + nullptr), // stream + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during cross entropy loss execution.")); + + // 4. 
验证结果 + try { + allClose(actual_output, loss, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyCrossEntropyLossDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 5. 性能测试 + double elapsed_time = benchmark( + [=]() { + infiniopCrossEntropyLoss( + op_desc, workspace, workspace_size, + actual_output->data(), + logits->data(), + target->data(), + nullptr); // stream + }, + warm_ups, iterations); + + // 6. 清理资源 + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyCrossEntropyLossDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +// 定义算子需要的属性名列表 +std::vector Test::attribute_names() { + return {}; // CrossEntropyLoss 没有额外的属性 +} + +// 定义算子需要的张量名列表 +std::vector Test::tensor_names() { + return {"logits", "target", "loss"}; +} + +std::vector Test::output_names() { + return {}; +} + +// 打印测试信息的辅助函数 +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- logits: " << _attributes->logits->info() << std::endl; + oss << "- target: " << _attributes->target->info() << std::endl; + oss << "- loss: " << _attributes->loss->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::cross_entropy_loss diff --git a/src/infiniop-test/src/ops/interpolate_nearest.cpp b/src/infiniop-test/src/ops/interpolate_nearest.cpp new file mode 100644 index 000000000..071527249 --- /dev/null +++ b/src/infiniop-test/src/ops/interpolate_nearest.cpp @@ -0,0 +1,151 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::interpolate_nearest { + +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr expected_output; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + std::cout << "DEBUG: Name check failed" << std::endl; + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; // F32 输入数据 + test->_attributes->expected_output = tensors["output"]; // F64 期望结果 + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopInterpolateNearestDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto expected_output = _attributes->expected_output; // F64 期望结果 + + // 动态创建实际的输出张量,使用期望结果的形状,但使用输入的数据类型 + auto output_shape = expected_output->shape(); + auto input_dtype = input->ggml_type(); + + // 创建输出张量的内存 + size_t output_size = 1; + for (auto dim : output_shape) { + output_size *= dim; + } + output_size *= ggmlTypeSize(input_dtype); + + auto output_memory = std::make_shared(output_size, device, device_id); + std::vector output_strides(output_shape.size()); + + // 计算连续的步长 + if (output_shape.size() > 0) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; i--) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, 
input_dtype); + + // Create operator descriptor + CHECK_OR(infiniopCreateInterpolateNearestDescriptor( + handle, &op_desc, + actual_output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + + // Get workspace size + size_t workspace_size; + CHECK_OR(infiniopGetInterpolateNearestWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + // Allocate workspace if needed + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // Execute interpolate nearest + CHECK_OR(infiniopInterpolateNearest( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + // Verify result - 比较实际输出和期望结果 + try { + allClose(actual_output, expected_output, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyInterpolateNearestDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // Benchmark + double elapsed_time = benchmark( + [=]() { + infiniopInterpolateNearest( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // Cleanup + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyInterpolateNearestDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- expected_output: " << _attributes->expected_output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::interpolate_nearest diff --git a/src/infiniop-test/src/ops/maxpool.cpp b/src/infiniop-test/src/ops/maxpool.cpp new file mode 100644 index 000000000..698c5ad89 --- /dev/null +++ b/src/infiniop-test/src/ops/maxpool.cpp @@ -0,0 +1,263 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::maxpool { + +struct Test::Attributes { + // 输入张量 + std::shared_ptr input; + std::shared_ptr expected_output; + + // 最大池化参数 + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test"); + } + + auto input_tensor = tensors["input"]; + test->_attributes->input = tensors["input"]; + test->_attributes->expected_output = tensors["output"]; + + // 获取池化维度(输入张量维度 - 2,去掉batch和channel维度) + size_t pool_ndim = test->_attributes->input->shape().size() - 2; + if (pool_ndim == 0) { + throw std::runtime_error("Input tensor must have at least 3 dimensions (N, C, ...)"); + } + + // 解析并广播 
kernel_size - 修复类型转换 + auto kernel_size_data = attributes["kernel_size"]; + if (kernel_size_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid kernel_size data size"); + } + size_t kernel_size_count = kernel_size_data.size() / sizeof(int); + const int *kernel_size_ptr = reinterpret_cast(kernel_size_data.data()); + + if (kernel_size_count == pool_ndim) { + test->_attributes->kernel_size.clear(); + for (size_t i = 0; i < kernel_size_count; i++) { + test->_attributes->kernel_size.push_back(static_cast(kernel_size_ptr[i])); + } + } else { + test->_attributes->kernel_size.assign(pool_ndim, static_cast(kernel_size_ptr[0])); + } + + // 解析并广播 stride + auto stride_data = attributes["stride"]; + if (stride_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + size_t stride_count = stride_data.size() / sizeof(int); + const int *stride_ptr = reinterpret_cast(stride_data.data()); + + if (stride_count == pool_ndim) { + // 直接使用提供的值 + test->_attributes->stride.clear(); + for (size_t i = 0; i < stride_count; i++) { + test->_attributes->stride.push_back(static_cast(stride_ptr[i])); + } + } else { + // 广播单个值到所有维度 + test->_attributes->stride.assign(pool_ndim, static_cast(stride_ptr[0])); + } + + // 解析并广播 padding + auto padding_data = attributes["padding"]; + if (padding_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + size_t padding_count = padding_data.size() / sizeof(int); + const int *padding_ptr = reinterpret_cast(padding_data.data()); + + if (padding_count == pool_ndim) { + test->_attributes->padding.clear(); + for (size_t i = 0; i < padding_count; i++) { + test->_attributes->padding.push_back(static_cast(padding_ptr[i])); + } + } else { + test->_attributes->padding.assign(pool_ndim, static_cast(padding_ptr[0])); + } + + // 解析 ceil_mode + auto ceil_mode_data = attributes["ceil_mode"]; + if (ceil_mode_data.size() == sizeof(bool)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()); + } else if (ceil_mode_data.size() == sizeof(uint8_t)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()) != 0; + } else { + throw std::runtime_error("Invalid ceil_mode data size"); + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopMaxPoolDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto expected_output = _attributes->expected_output; + + auto input_dtype = input->ggml_type(); + + auto output_shape = expected_output->shape(); + + size_t output_size = 1; + for (auto dim : output_shape) { + output_size *= dim; + } + output_size *= ggmlTypeSize(input_dtype); + + auto output_memory = std::make_shared(output_size, device, device_id); + std::vector output_strides(output_shape.size()); + + if (output_shape.size() > 0) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; i--) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, input_dtype); + + // 准备参数指针 + void *kernel_size_ptr = _attributes->kernel_size.data(); + void *stride_ptr = _attributes->stride.data(); + void *padding_ptr = _attributes->padding.data(); + + // 创建算子描述符 + CHECK_OR(infiniopCreateMaxPoolDescriptor( + handle, &op_desc, + actual_output->desc(), + input->desc(), + kernel_size_ptr, + 
stride_ptr, + padding_ptr, + _attributes->ceil_mode), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create maxpool descriptor.")); + + // 获取工作空间大小 + size_t workspace_size; + CHECK_OR(infiniopGetMaxPoolWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + // 分配工作空间 + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // 执行最大池化 + CHECK_OR(infiniopMaxPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during maxpool execution.")); + + // 验证结果 + try { + allClose(actual_output, expected_output, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyMaxPoolDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 性能测试 + double elapsed_time = benchmark( + [=]() { + infiniopMaxPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // 清理资源 + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyMaxPoolDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"kernel_size", "stride", "padding", "ceil_mode"}; +} + +std::vector Test::tensor_names() { + return {"input", "output"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- expected_output: " << _attributes->expected_output->info() << std::endl; + + oss << "- kernel_size: ["; + for (size_t i = 0; i < _attributes->kernel_size.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->kernel_size[i]; + } + oss << "]" << std::endl; + + oss << "- stride: ["; + for (size_t i = 0; i < _attributes->stride.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->stride[i]; + } + oss << "]" << std::endl; + + oss << "- padding: ["; + for (size_t i = 0; i < _attributes->padding.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->padding[i]; + } + oss << "]" << std::endl; + + oss << "- ceil_mode: " << (_attributes->ceil_mode ? 
"true" : "false") << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::maxpool diff --git a/src/infiniop-test/src/ops/maxpool_backward.cpp b/src/infiniop-test/src/ops/maxpool_backward.cpp new file mode 100644 index 000000000..2687fcf37 --- /dev/null +++ b/src/infiniop-test/src/ops/maxpool_backward.cpp @@ -0,0 +1,266 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::maxpool_backward { + +struct Test::Attributes { + std::shared_ptr grad_output; + std::shared_ptr input; + std::shared_ptr expected_grad_input; + + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->expected_grad_input = tensors["grad_input"]; + + // 获取池化维度 + size_t pool_ndim = test->_attributes->input->shape().size() - 2; + if (pool_ndim == 0) { + throw std::runtime_error("Input tensor must have at least 3 dimensions (N, C, ...)"); + } + + // 解析并广播 kernel_size + auto kernel_size_data = attributes["kernel_size"]; + if (kernel_size_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid kernel_size data size"); + } + size_t kernel_size_count = kernel_size_data.size() / sizeof(int); + const int *kernel_size_ptr = reinterpret_cast(kernel_size_data.data()); + if (kernel_size_count == pool_ndim) { + test->_attributes->kernel_size.clear(); + for (size_t i = 0; i < kernel_size_count; i++) { + test->_attributes->kernel_size.push_back(static_cast(kernel_size_ptr[i])); + } + } else { + test->_attributes->kernel_size.assign(pool_ndim, static_cast(kernel_size_ptr[0])); + } + + // 解析并广播 stride + auto stride_data = attributes["stride"]; + if (stride_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + size_t stride_count = stride_data.size() / sizeof(int); + const int *stride_ptr = reinterpret_cast(stride_data.data()); + if (stride_count == pool_ndim) { + test->_attributes->stride.clear(); + for (size_t i = 0; i < stride_count; i++) { + test->_attributes->stride.push_back(static_cast(stride_ptr[i])); + } + } else { + test->_attributes->stride.assign(pool_ndim, static_cast(stride_ptr[0])); + } + + // 解析并广播 padding + auto padding_data = attributes["padding"]; + if (padding_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + size_t padding_count = padding_data.size() / sizeof(int); + const int *padding_ptr = reinterpret_cast(padding_data.data()); + if (padding_count == pool_ndim) { + test->_attributes->padding.clear(); + for (size_t i = 0; i < padding_count; i++) { + test->_attributes->padding.push_back(static_cast(padding_ptr[i])); + } + } else { + test->_attributes->padding.assign(pool_ndim, static_cast(padding_ptr[0])); + } + + // 解析 ceil_mode + auto ceil_mode_data = attributes["ceil_mode"]; + if (ceil_mode_data.size() == sizeof(bool)) { + test->_attributes->ceil_mode = 
*reinterpret_cast(ceil_mode_data.data()); + } else if (ceil_mode_data.size() == sizeof(uint8_t)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()) != 0; + } else { + throw std::runtime_error("Invalid ceil_mode data size"); + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopMaxPoolBackwardDescriptor_t op_desc; + + // 将输入张量移动到指定设备 + auto grad_output = _attributes->grad_output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto expected_grad_input = _attributes->expected_grad_input; + + // 获取输入数据类型 + auto input_dtype = input->ggml_type(); + + // 手动创建 grad_input 张量(使用期望结果的形状,但使用输入的数据类型) + auto grad_input_shape = expected_grad_input->shape(); + + size_t grad_input_size = 1; + for (auto dim : grad_input_shape) { + grad_input_size *= dim; + } + grad_input_size *= ggmlTypeSize(input_dtype); + + auto grad_input_memory = std::make_shared(grad_input_size, device, device_id); + std::vector grad_input_strides(grad_input_shape.size()); + + if (grad_input_shape.size() > 0) { + grad_input_strides[grad_input_shape.size() - 1] = 1; + for (int i = static_cast(grad_input_shape.size()) - 2; i >= 0; i--) { + grad_input_strides[i] = grad_input_strides[i + 1] * grad_input_shape[i + 1]; + } + } + + auto actual_grad_input = std::make_shared( + grad_input_memory, 0, grad_input_shape, grad_input_strides, input_dtype); + + // 准备参数指针 + void *kernel_size_ptr = _attributes->kernel_size.data(); + void *stride_ptr = _attributes->stride.data(); + void *padding_ptr = _attributes->padding.data(); + + auto grad_output_shape = grad_output->shape(); + CHECK_OR(infiniopCreateMaxPoolBackwardDescriptor( + handle, &op_desc, + actual_grad_input->desc(), + grad_output->desc(), + input->desc(), + kernel_size_ptr, + stride_ptr, + padding_ptr, + _attributes->ceil_mode), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create maxpool_backward descriptor.")); + + // 获取工作空间大小 + size_t workspace_size; + CHECK_OR(infiniopGetMaxPoolBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + // 分配工作空间 + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // 执行最大池化反向传播 + CHECK_OR(infiniopMaxPoolBackward( + op_desc, workspace, workspace_size, + actual_grad_input->data(), // void *grad_input + grad_output->data(), // const void *grad_output + input->data(), // const void *input + nullptr), // void *stream + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during maxpool_backward execution.")); + + // 验证结果 + try { + allClose(actual_grad_input, expected_grad_input, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyMaxPoolBackwardDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 性能测试 + double elapsed_time = benchmark( + [=]() { + infiniopMaxPoolBackward( + op_desc, workspace, workspace_size, + actual_grad_input->data(), + grad_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // 清理资源 + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyMaxPoolBackwardDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"kernel_size", "stride", "padding", "ceil_mode"}; +} + 
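(Reviewer note, not part of the patch: for readers of this test, the descriptor lifecycle exercised by Test::run() above maps onto the public C API roughly as sketched below. The sketch assumes a valid infiniopHandle_t named handle, existing tensor descriptors grad_input_desc/grad_output_desc/input_desc, device buffers grad_input/grad_output/input, and size_t attribute arrays; the integer width of those arrays and the omitted error handling are assumptions, not taken from the patch.)

    infiniopMaxPoolBackwardDescriptor_t desc = nullptr;
    size_t kernel_size[2] = {2, 2}, stride[2] = {2, 2}, padding[2] = {0, 0}; // hypothetical 2D attributes
    infiniopCreateMaxPoolBackwardDescriptor(handle, &desc,
                                            grad_input_desc, grad_output_desc, input_desc,
                                            kernel_size, stride, padding, /*ceil_mode=*/false);
    size_t workspace_size = 0;
    infiniopGetMaxPoolBackwardWorkspaceSize(desc, &workspace_size);
    void *workspace = nullptr;
    if (workspace_size > 0) {
        infinirtMalloc(&workspace, workspace_size); // workspace lives on the target device
    }
    infiniopMaxPoolBackward(desc, workspace, workspace_size,
                            grad_input, grad_output, input, /*stream=*/nullptr);
    if (workspace) {
        infinirtFree(workspace);
    }
    infiniopDestroyMaxPoolBackwardDescriptor(desc);
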
+std::vector Test::tensor_names() { + return {"grad_output", "input", "grad_input"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- expected_grad_input: " << _attributes->expected_grad_input->info() << std::endl; + + oss << "- kernel_size: ["; + for (size_t i = 0; i < _attributes->kernel_size.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->kernel_size[i]; + } + oss << "]" << std::endl; + + oss << "- stride: ["; + for (size_t i = 0; i < _attributes->stride.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->stride[i]; + } + oss << "]" << std::endl; + + oss << "- padding: ["; + for (size_t i = 0; i < _attributes->padding.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->padding[i]; + } + oss << "]" << std::endl; + + oss << "- ceil_mode: " << (_attributes->ceil_mode ? "true" : "false") << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::maxpool_backward diff --git a/src/infiniop/ops/averagepool/averagepool.h b/src/infiniop/ops/averagepool/averagepool.h new file mode 100644 index 000000000..7762826ab --- /dev/null +++ b/src/infiniop/ops/averagepool/averagepool.h @@ -0,0 +1,52 @@ +#ifndef __AVERAGEPOOL_H__ +#define __AVERAGEPOOL_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::averagepool::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + AvgPoolInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + AvgPoolInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __AVERAGEPOOL_H__ diff --git a/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc new file mode 100644 index 000000000..2e8fa6851 --- /dev/null +++ b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc @@ -0,0 +1,362 @@ +#include "averagepool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include +#include + +namespace op::averagepool::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + AvgPoolInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const AvgPoolInfo &avgpool_info) + : handle(handle_ptr), info(avgpool_info) { + 
workspace_size = 0; + } + + template + void _avgpool_1d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + const size_t input_nc_stride = input_width; + const size_t output_nc_stride = output_width; + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t input_offset = (b * channels + c) * input_nc_stride; + const size_t output_offset = (b * channels + c) * output_nc_stride; + + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int window_start = static_cast(ow * stride_width) - static_cast(pad_width); + const int window_end = window_start + static_cast(kernel_width); + + for (int iw = window_start; iw < window_end; ++iw) { + if (iw >= 0 && iw < static_cast(input_width)) { + sum += utils::cast(input[input_offset + iw]); + valid_count++; + } else if (iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + output[output_offset + ow] = utils::cast(result); + } + } + } + } + + template + void _avgpool_2d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + const size_t input_nc_stride = input_height * input_width; + const size_t output_nc_stride = output_height * output_width; + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t input_offset = (b * channels + c) * input_nc_stride; + const size_t output_offset = (b * channels + c) * output_nc_stride; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int ih = start_h + kh; + const int iw = start_w + kw; + + if (ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + sum += utils::cast(input[input_offset + ih * input_width + iw]); + valid_count++; + } else if (ih >= -static_cast(pad_height) && + ih < static_cast(input_height + pad_height) && + iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + output[output_offset + oh * output_width + ow] = utils::cast(result); + } + } + } + } + } + + template + void _avgpool_3d(Ydata *output, const T 
*input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + const size_t input_nc_stride = input_depth * input_height * input_width; + const size_t output_nc_stride = output_depth * output_height * output_width; + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t input_offset = (b * channels + c) * input_nc_stride; + const size_t output_offset = (b * channels + c) * output_nc_stride; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + const int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + + for (int kd = 0; kd < static_cast(kernel_depth); ++kd) { + const int id = start_d + kd; + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + const int ih = start_h + kh; + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int iw = start_w + kw; + + if (id >= 0 && id < static_cast(input_depth) && + ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + const size_t idx = id * (input_height * input_width) + + ih * input_width + iw; + sum += utils::cast(input[input_offset + idx]); + valid_count++; + } else if (id >= -static_cast(pad_depth) && + id < static_cast(input_depth + pad_depth) && + ih >= -static_cast(pad_height) && + ih < static_cast(input_height + pad_height) && + iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + + const size_t out_idx = od * (output_height * output_width) + + oh * output_width + ow; + output[output_offset + out_idx] = utils::cast(result); + } + } + } + } + } + } + + template + void _avgpool_cpu(Ydata *output, const T *input) const { + switch (info.ndim) { + case 1: + _avgpool_1d(output, input); + break; + case 2: + _avgpool_2d(output, input); + break; + case 3: + _avgpool_3d(output, input); + break; + default: + break; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + AvgPoolInfo &info) { + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + if (!output || !input) 
{ + return INFINI_STATUS_BAD_PARAM; + } + + size_t output_size = info.batch * info.channels; + for (size_t i = 0; i < info.ndim; ++i) { + output_size *= info.output_dims[i]; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + _avgpool_cpu(typed_output, typed_input); + break; + } + case INFINI_DTYPE_F16: { + float *typed_output_f32 = static_cast(workspace); + const fp16_t *typed_input = static_cast(input); + + _avgpool_cpu(typed_output_f32, typed_input); + + fp16_t *typed_output = static_cast(output); + #pragma omp parallel for + for(size_t i = 0; i < output_size; ++i) { + typed_output[i] = utils::cast(typed_output_f32[i]); + } + break; + } + case INFINI_DTYPE_BF16: { + float *typed_output_f32 = static_cast(workspace); + const bf16_t *typed_input = static_cast(input); + + _avgpool_cpu(typed_output_f32, typed_input); + + bf16_t *typed_output = static_cast(output); + #pragma omp parallel for + for(size_t i = 0; i < output_size; ++i) { + typed_output[i] = utils::cast(typed_output_f32[i]); + } + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +inline size_t calculateOutputSize(const AvgPoolInfo &info) { + size_t size = info.batch * info.channels; + for(size_t i = 0; i < info.ndim; ++i) { + size *= info.output_dims[i]; + } + return size; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + size_t workspace_size = 0; + if (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16) { + workspace_size = calculateOutputSize(info) * sizeof(float); + } + + *desc_ptr = new Descriptor(dtype, std::move(info), workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::averagepool::cpu diff --git a/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h new file mode 100644 index 000000000..8388f80ff --- /dev/null +++ b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_CPU_H__ +#define __AVERAGEPOOL_CPU_H__ + +#include "../averagepool.h" + +DESCRIPTOR(cpu) + +#endif // __AVERAGEPOOL_CPU_H__ diff --git a/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh b/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh new file mode 100644 index 000000000..7c9d0f438 --- /dev/null +++ b/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh @@ -0,0 +1,185 @@ +#ifndef 
__AVERAGEPOOL_KERNEL_H__ +#define __AVERAGEPOOL_KERNEL_H__ + +#include + +// 1D average pooling kernel, compatible with PyTorch's implicit-padding semantics +template <typename T> +__global__ void avgpool1d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_length, + int output_length, int kernel_size, int stride, int padding) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= output_length) { + return; + } + + // Compute input and output offsets + const T *input_ptr = input + batch_idx * channels * input_length + channel_idx * input_length; + T *output_ptr = output + batch_idx * channels * output_length + channel_idx * output_length; + + // Start position of the pooling window + int window_start = output_idx * stride - padding; + + // Accumulate in single precision + float sum = 0.0f; + int valid_count = 0; + + // Walk the pooling window + for (int k = 0; k < kernel_size; ++k) { + int input_pos = window_start + k; + + if (input_pos >= 0 && input_pos < input_length) { + // Valid input position: convert to float and accumulate + sum += static_cast<float>(input_ptr[input_pos]); + valid_count++; + } else if (input_pos >= -padding && input_pos < input_length + padding) { + // Explicit padding region: value is 0, only count it toward the divisor + valid_count++; + } + // Positions beyond this are implicit padding and are excluded from the divisor + } + + // Compute the mean and convert back to the original data type + if (valid_count > 0) { + float result = sum / static_cast<float>(valid_count); + output_ptr[output_idx] = static_cast<T>(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +// 2D average pooling kernel, compatible with PyTorch's implicit-padding semantics +template <typename T> +__global__ void avgpool2d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_height, + int input_width, int output_height, int output_width, int kernel_h, + int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + int total_output_elements = output_height * output_width; + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) { + return; + } + + // Convert the linear index to 2D coordinates + int out_h = output_idx / output_width; + int out_w = output_idx % output_width; + + // Compute input and output offsets + const T *input_ptr = input + batch_idx * channels * input_height * input_width + channel_idx * input_height * input_width; + T *output_ptr = output + batch_idx * channels * output_height * output_width + channel_idx * output_height * output_width; + + // Start position of the pooling window + int window_start_h = out_h * stride_h - pad_h; + int window_start_w = out_w * stride_w - pad_w; + + // Accumulate in single precision + float sum = 0.0f; + int valid_count = 0; + + // Walk the pooling window + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int input_h = window_start_h + kh; + int input_w = window_start_w + kw; + + if (input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) { + // Valid input position: convert to float and accumulate + int input_idx = input_h * input_width + input_w; + sum += static_cast<float>(input_ptr[input_idx]); + valid_count++; + } else if (input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w) { + // Explicit padding region: value is 0, only count it toward the divisor + valid_count++; + } + // Positions beyond this are implicit padding and are excluded from the divisor + } + } + + // Compute the mean and convert back to the original data type + if (valid_count > 0) { + float result = sum / static_cast<float>(valid_count); + output_ptr[output_idx] = static_cast<T>(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +// 3D average pooling kernel, compatible with PyTorch's implicit-padding semantics +template <typename T> +__global__ void avgpool3d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int
channels, int input_depth, + int input_height, int input_width, int output_depth, int output_height, + int output_width, int kernel_d, int kernel_h, int kernel_w, int stride_d, + int stride_h, int stride_w, int pad_d, int pad_h, int pad_w) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + int total_output_elements = output_depth * output_height * output_width; + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) { + return; + } + + // 将线性索引转换为3D坐标 + int out_d = output_idx / (output_height * output_width); + int remaining = output_idx % (output_height * output_width); + int out_h = remaining / output_width; + int out_w = remaining % output_width; + + // 计算输入和输出的偏移 + int input_spatial_size = input_depth * input_height * input_width; + int output_spatial_size = output_depth * output_height * output_width; + + const T *input_ptr = input + batch_idx * channels * input_spatial_size + channel_idx * input_spatial_size; + T *output_ptr = output + batch_idx * channels * output_spatial_size + channel_idx * output_spatial_size; + + // 计算池化窗口的起始位置 + int window_start_d = out_d * stride_d - pad_d; + int window_start_h = out_h * stride_h - pad_h; + int window_start_w = out_w * stride_w - pad_w; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int kd = 0; kd < kernel_d; ++kd) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int input_d = window_start_d + kd; + int input_h = window_start_h + kh; + int input_w = window_start_w + kw; + + if (input_d >= 0 && input_d < input_depth && input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) { + // 有效的输入位置,转换为单精度进行累加 + int input_idx = (input_d * input_height + input_h) * input_width + input_w; + sum += static_cast(input_ptr[input_idx]); + valid_count++; + } else if (input_d >= -pad_d && input_d < input_depth + pad_d && input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + } + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +#endif // __AVERAGEPOOL_KERNEL_H__ diff --git a/src/infiniop/ops/averagepool/info.h b/src/infiniop/ops/averagepool/info.h new file mode 100644 index 000000000..871e827a7 --- /dev/null +++ b/src/infiniop/ops/averagepool/info.h @@ -0,0 +1,136 @@ +#ifndef __AVERAGEPOOL_INFO_H__ +#define __AVERAGEPOOL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include +#include + +namespace op::averagepool { + +inline utils::Result calculatePoolOutputSize( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding = 0, + bool ceil_mode = false) { + + if (stride == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + if (kernel_size == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + + size_t padded_input_size = input_size + 2 * padding; + + if (padded_input_size < kernel_size) { + return utils::Result(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + + size_t output_size; + if (ceil_mode) { + // 等效于整数的上取整 + output_size = (padded_input_size - kernel_size + stride - 1) / stride + 1; + } else { + // 等效于整数的下取整 + output_size = (padded_input_size - kernel_size) / stride + 1; + } + + return 
utils::Result(output_size); +} + +// 检查是否存在隐式填充 +inline bool hasImplicitPadding( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding, + bool ceil_mode) { + + if (!ceil_mode) { + return false; + } + return ((input_size + 2 * padding) - kernel_size) % stride != 0; +} + +class AvgPoolInfo { + AvgPoolInfo() = default; + +public: + std::vector input_dims; + std::vector output_dims; + std::vector kernel_sizes; + std::vector strides; + std::vector pads; + bool ceil_mode; + size_t ndim; + size_t batch; + size_t channels; + bool has_implicit_padding = false; + + static utils::Result create( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + AvgPoolInfo info; + + if (input_desc->ndim() < 3 || input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->dim(0) != output_desc->dim(0) || input_desc->dim(1) != output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.ndim = input_desc->ndim() - 2; // 空间维度 + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // 初始化隐式填充标志 + info.has_implicit_padding = false; + + // 获取并校验空间维度 + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + + auto output_size_result = calculatePoolOutputSize( + info.input_dims[i], info.kernel_sizes[i], info.strides[i], info.pads[i], info.ceil_mode); + CHECK_RESULT(output_size_result); + + size_t expected_size = output_size_result.take(); + if (expected_size != output_desc->dim(i + 2)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.output_dims.push_back(output_desc->dim(i + 2)); + + // 检查当前维度是否存在隐式填充 + if (hasImplicitPadding(info.input_dims[i], info.kernel_sizes[i], + info.strides[i], info.pads[i], info.ceil_mode)) { + info.has_implicit_padding = true; + } + } + return utils::Result(std::move(info)); + } +}; +} // namespace op::averagepool + +#endif // __AVERAGEPOOL_INFO_H__ diff --git a/src/infiniop/ops/averagepool/metax/averagepool_metax.h b/src/infiniop/ops/averagepool/metax/averagepool_metax.h new file mode 100644 index 000000000..eef332b5f --- /dev/null +++ b/src/infiniop/ops/averagepool/metax/averagepool_metax.h @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_METAX_H__ +#define __AVERAGEPOOL_METAX_H__ + +#include "../averagepool.h" + +DESCRIPTOR(metax) + +#endif // __AVERAGEPOOL_METAX_CUH__ diff --git a/src/infiniop/ops/averagepool/metax/averagepool_metax.maca b/src/infiniop/ops/averagepool/metax/averagepool_metax.maca new file mode 100644 index 000000000..ee3c4bd9c --- /dev/null +++ b/src/infiniop/ops/averagepool/metax/averagepool_metax.maca @@ -0,0 +1,332 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "averagepool_metax.h" +#include "../cuda/averagepool_kernel.cuh" +#include + +infiniStatus_t launch_avgpool_pytorch_kernel( + const op::averagepool::AvgPoolInfo& info, + const void* input, void* output, + infiniDtype_t data_type, hcStream_t stream) { + + int batch_size = static_cast(info.batch); + int channels = 
static_cast(info.channels); + + if (info.ndim == 1) { + // 1D平均池化 + int input_length = static_cast(info.input_dims[0]); + int output_length = static_cast(info.output_dims[0]); + int kernel_size = static_cast(info.kernel_sizes[0]); + int stride = static_cast(info.strides[0]); + int padding = static_cast(info.pads[0]); + + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (output_length + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool1d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + case INFINI_DTYPE_F16: + avgpool1d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + case INFINI_DTYPE_BF16: + avgpool1d_pytorch_compatible_kernel<__hpcc_bfloat16><<>>( + static_cast(input), static_cast<__hpcc_bfloat16*>(output), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else if (info.ndim == 2) { + // 2D平均池化 + int input_height = static_cast(info.input_dims[0]); + int input_width = static_cast(info.input_dims[1]); + int output_height = static_cast(info.output_dims[0]); + int output_width = static_cast(info.output_dims[1]); + int kernel_h = static_cast(info.kernel_sizes[0]); + int kernel_w = static_cast(info.kernel_sizes[1]); + int stride_h = static_cast(info.strides[0]); + int stride_w = static_cast(info.strides[1]); + int pad_h = static_cast(info.pads[0]); + int pad_w = static_cast(info.pads[1]); + + int total_output_elements = output_height * output_width; + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (total_output_elements + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool2d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + case INFINI_DTYPE_F16: + avgpool2d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + case INFINI_DTYPE_BF16: + avgpool2d_pytorch_compatible_kernel<__hpcc_bfloat16><<>>( + static_cast(input), static_cast<__hpcc_bfloat16*>(output), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else if (info.ndim == 3) { + // 3D平均池化 + int input_depth = static_cast(info.input_dims[0]); + int input_height = static_cast(info.input_dims[1]); + int input_width = static_cast(info.input_dims[2]); + int output_depth = static_cast(info.output_dims[0]); + int output_height = static_cast(info.output_dims[1]); + int output_width = static_cast(info.output_dims[2]); + int kernel_d = static_cast(info.kernel_sizes[0]); + int kernel_h = static_cast(info.kernel_sizes[1]); + int kernel_w = static_cast(info.kernel_sizes[2]); + int stride_d = static_cast(info.strides[0]); + int stride_h = static_cast(info.strides[1]); + int stride_w = static_cast(info.strides[2]); + int pad_d = static_cast(info.pads[0]); + int pad_h = static_cast(info.pads[1]); + int pad_w = 
static_cast(info.pads[2]); + + int total_output_elements = output_depth * output_height * output_width; + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (total_output_elements + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool3d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + case INFINI_DTYPE_F16: + avgpool3d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + case INFINI_DTYPE_BF16: + avgpool3d_pytorch_compatible_kernel<__hpcc_bfloat16><<>>( + static_cast(input), static_cast<__hpcc_bfloat16*>(output), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else { + return INFINI_STATUS_BAD_PARAM; + } + + return INFINI_STATUS_SUCCESS; +} + +#define DESTROY_hcdnn_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_hcdnn_DESCRIPTORS() \ + do { \ + DESTROY_hcdnn_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(pooling_desc, hcdnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_HCDNN_API + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t output_desc = nullptr; + hcdnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + infiniStatus_t createPoolingDescriptors(const AvgPoolInfo &info, + hcdnnDataType_t hcdnn_data_type) { + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&output_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&pooling_desc)); + + std::vector input_dims = {static_cast(info.batch), static_cast(info.channels)}; + std::vector output_dims = {static_cast(info.batch), static_cast(info.channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims.push_back(static_cast(info.input_dims[i])); + output_dims.push_back(static_cast(info.output_dims[i])); + } + while (input_dims.size() < 5) input_dims.push_back(1); + while (output_dims.size() < 5) output_dims.push_back(1); + std::vector input_strides(input_dims.size(), 1); + std::vector output_strides(output_dims.size(), 1); + for (int i = input_dims.size() - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + output_strides[i] = output_strides[i + 1] * output_dims[i + 1]; + } + + CHECK_MCDNN(hcdnnSetTensorNdDescriptor(input_desc, hcdnn_data_type, + input_dims.size(), input_dims.data(), input_strides.data())); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor(output_desc, hcdnn_data_type, + output_dims.size(), output_dims.data(), output_strides.data())); + + return 
INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolInfo &info) { + std::vector kernel_size, strides, pads; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_size.push_back(static_cast(info.kernel_sizes[i])); + strides.push_back(static_cast(info.strides[i])); + pads.push_back(static_cast(info.pads[i])); + } + while (kernel_size.size() < 3) kernel_size.push_back(1); + while (strides.size() < 3) strides.push_back(1); + while (pads.size() < 3) pads.push_back(0); + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor(pooling_desc, HCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + HCDNN_NOT_PROPAGATE_NAN, kernel_size.size(), + kernel_size.data(), pads.data(), strides.data())); + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializehcdnnContext(AvgPoolInfo &info, + infiniDtype_t data_type) { + hcdnnDataType_t hcdnn_data_type = device::metax::getHcdnnDtype(data_type); + CHECK_STATUS(createPoolingDescriptors(info, hcdnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + workspace_size = 0; + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_hcdnn_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializehcdnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { +#ifdef ENABLE_HCDNN_API + if (_info.has_implicit_padding) { + // 使用自定义kernel实现PyTorch兼容的逻辑 + return launch_avgpool_pytorch_kernel(_info, input, output, _dtype, (hcStream_t)stream); + } else { + const float alpha = 1.0f, beta = 0.0f; + CHECK_STATUS(_opaque->internal->useMcdnn( + (hcStream_t)stream, [&](hcdnnHandle_t handle) { + CHECK_MCDNN(hcdnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + 
_opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; + } +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool::metax diff --git a/src/infiniop/ops/averagepool/nvidia/averagepool.cu b/src/infiniop/ops/averagepool/nvidia/averagepool.cu new file mode 100644 index 000000000..6f276aac8 --- /dev/null +++ b/src/infiniop/ops/averagepool/nvidia/averagepool.cu @@ -0,0 +1,220 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "averagepool_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_desc, cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createPoolingDescriptors(const AvgPoolInfo &info, + cudnnDataType_t cudnn_data_type) { + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_desc)); + + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + input_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + output_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolInfo &info) { + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( 
+ pooling_desc, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + CUDNN_NOT_PROPAGATE_NAN, kernel_vec.size(), kernel_vec.data(), + pad_vec.data(), stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(AvgPoolInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Average pooling typically doesn't need a workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + CHECK_CUDNN(cudnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool::nvidia diff --git a/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh b/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh new file mode 100644 index 000000000..ef19aa1dc --- /dev/null +++ b/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_CUDA_CUH__ 
+#define __AVERAGEPOOL_CUDA_CUH__ + +#include "../averagepool.h" + +DESCRIPTOR(nvidia) + +#endif // __AVERAGEPOOL_CUDA_CUH__ diff --git a/src/infiniop/ops/averagepool/operator.cc b/src/infiniop/ops/averagepool/operator.cc new file mode 100644 index 000000000..5d72af8f8 --- /dev/null +++ b/src/infiniop/ops/averagepool/operator.cc @@ -0,0 +1,155 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/averagepool.h" + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/averagepool_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/averagepool_metax.h" +#endif +#ifdef ENABLE_CPU_API +#include "cpu/averagepool_cpu.h" +#endif + +__C infiniStatus_t infiniopCreateAvgPoolDescriptor( + infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::averagepool::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize( + infiniopAvgPoolDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopAvgPool( + infiniopAvgPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, \ + output, \ + input, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); 
+#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/averagepool_backward/averagepool_backward.h b/src/infiniop/ops/averagepool_backward/averagepool_backward.h new file mode 100644 index 000000000..6322c3d92 --- /dev/null +++ b/src/infiniop/ops/averagepool_backward/averagepool_backward.h @@ -0,0 +1,55 @@ +#ifndef __AVERAGEPOOL_BACKWARD_H__ +#define __AVERAGEPOOL_BACKWARD_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::averagepool_backward::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + AvgPoolBackwardInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + AvgPoolBackwardInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t grad_input_desc, \ + infiniopTensorDescriptor_t grad_output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *grad_input, \ + const void *grad_output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __AVERAGEPOOL_BACKWARD_H__ diff --git a/src/infiniop/ops/averagepool_backward/cpu/averagepool_backward_cpu.cc b/src/infiniop/ops/averagepool_backward/cpu/averagepool_backward_cpu.cc new file mode 100644 index 000000000..399d005ee --- /dev/null +++ b/src/infiniop/ops/averagepool_backward/cpu/averagepool_backward_cpu.cc @@ -0,0 +1,390 @@ +#include "averagepool_backward_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include +#include + +namespace op::averagepool_backward::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + AvgPoolBackwardInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const AvgPoolBackwardInfo &avgpool_info) + : handle(handle_ptr), info(avgpool_info) { + workspace_size = 0; + } + + template + void _avgpool_backward_1d(T_out *grad_input, const T_in *grad_output) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + const size_t input_nc_stride = input_width; + const size_t output_nc_stride = output_width; + + size_t grad_input_nelem = info.batch * info.channels * input_width; + memset(grad_input, 0, grad_input_nelem * sizeof(T_out)); + +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t grad_output_offset = (b * channels + c) * output_nc_stride; + const size_t grad_input_offset = (b * channels + c) * input_nc_stride; + + for (size_t ow = 0; ow < output_width; ++ow) { + float grad_value = 
utils::cast(grad_output[grad_output_offset + ow]); + + int valid_count = 0; + const int window_start = static_cast(ow * stride_width) - static_cast(pad_width); + const int window_end = window_start + static_cast(kernel_width); + + for (int iw = window_start; iw < window_end; ++iw) { + if (iw >= 0 && iw < static_cast(input_width)) { + valid_count++; + } else if (iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + + if (valid_count > 0) { + float grad_distribute = grad_value / static_cast(valid_count); + for (int iw = window_start; iw < window_end; ++iw) { + if (iw >= 0 && iw < static_cast(input_width)) { + grad_input[grad_input_offset + iw] += utils::cast(grad_distribute); + } + } + } + } + } + } + } + + template + void _avgpool_backward_2d(T_out *grad_input, const T_in *grad_output) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_h = info.pads[0]; + size_t pad_w = info.pads[1]; + + const size_t input_nc_stride = input_height * input_width; + const size_t output_nc_stride = output_height * output_width; + + size_t grad_input_nelem = info.batch * info.channels * input_height * input_width; + memset(grad_input, 0, grad_input_nelem * sizeof(T_out)); + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t grad_output_offset = (b * channels + c) * output_nc_stride; + const size_t grad_input_offset = (b * channels + c) * input_nc_stride; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float grad_value = utils::cast(grad_output[grad_output_offset + oh * output_width + ow]); + + int valid_count = 0; + const int start_h = static_cast(oh * stride_height) - static_cast(pad_h); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_w); + + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int ih = start_h + kh; + const int iw = start_w + kw; + + if (ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + valid_count++; + } else if (ih >= -static_cast(pad_h) && + ih < static_cast(input_height + pad_h) && + iw >= -static_cast(pad_w) && + iw < static_cast(input_width + pad_w)) { + valid_count++; + } + } + } + + if (valid_count > 0) { + float grad_distribute = grad_value / static_cast(valid_count); + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int ih = start_h + kh; + const int iw = start_w + kw; + if (ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + grad_input[grad_input_offset + ih * input_width + iw] += utils::cast(grad_distribute); + } + } + } + } + } + } + } + } + } + + template + void _avgpool_backward_3d(T_out *grad_input, const T_in *grad_output) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + 
size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_d = info.kernel_sizes[0]; + size_t kernel_h = info.kernel_sizes[1]; + size_t kernel_w = info.kernel_sizes[2]; + size_t stride_d = info.strides[0]; + size_t stride_h = info.strides[1]; + size_t stride_w = info.strides[2]; + size_t pad_d = info.pads[0]; + size_t pad_h = info.pads[1]; + size_t pad_w = info.pads[2]; + + const size_t input_nc_stride = input_depth * input_height * input_width; + const size_t output_nc_stride = output_depth * output_height * output_width; + + size_t grad_input_nelem = info.batch * info.channels * input_depth * input_height * input_width; + memset(grad_input, 0, grad_input_nelem * sizeof(T_out)); + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t grad_output_offset = (b * channels + c) * output_nc_stride; + const size_t grad_input_offset = (b * channels + c) * input_nc_stride; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float grad_value = utils::cast(grad_output[grad_output_offset + od * output_height * output_width + oh * output_width + ow]); + + int valid_count = 0; + const int start_d = static_cast(od * stride_d) - static_cast(pad_d); + const int start_h = static_cast(oh * stride_h) - static_cast(pad_h); + const int start_w = static_cast(ow * stride_w) - static_cast(pad_w); + + for (int kd = 0; kd < static_cast(kernel_d); ++kd) { + for (int kh = 0; kh < static_cast(kernel_h); ++kh) { + for (int kw = 0; kw < static_cast(kernel_w); ++kw) { + const int id = start_d + kd; + const int ih = start_h + kh; + const int iw = start_w + kw; + + if (id >= 0 && id < static_cast(input_depth) && + ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + valid_count++; + } else if (id >= -static_cast(pad_d) && + id < static_cast(input_depth + pad_d) && + ih >= -static_cast(pad_h) && + ih < static_cast(input_height + pad_h) && + iw >= -static_cast(pad_w) && + iw < static_cast(input_width + pad_w)) { + valid_count++; + } + } + } + } + + if (valid_count > 0) { + float grad_distribute = grad_value / static_cast(valid_count); + for (int kd = 0; kd < static_cast(kernel_d); ++kd) { + for (int kh = 0; kh < static_cast(kernel_h); ++kh) { + for (int kw = 0; kw < static_cast(kernel_w); ++kw) { + const int id = start_d + kd; + const int ih = start_h + kh; + const int iw = start_w + kw; + if (id >= 0 && id < static_cast(input_depth) && + ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + grad_input[grad_input_offset + id * input_height * input_width + ih * input_width + iw] += utils::cast(grad_distribute); + } + } + } + } + } + } + } + } + } + } + } + + template + void _avgpool_backward_cpu(T_out *grad_input, const T_in *grad_output) const { + switch (info.ndim) { + case 1: + _avgpool_backward_1d(grad_input, grad_output); + break; + case 2: + _avgpool_backward_2d(grad_input, grad_output); + break; + case 3: + _avgpool_backward_3d(grad_input, grad_output); + break; + default: + break; + } + } +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + 
+    create(device::cpu::Handle *handle_ptr,
+           AvgPoolBackwardInfo &info) {
+        Opaque opaque(handle_ptr, info);
+        return utils::Result<Opaque>(std::move(opaque));
+    }
+
+    infiniStatus_t calculate(void *workspace, size_t workspace_size,
+                             void *grad_input, const void *grad_output,
+                             const void *input, infiniDtype_t dtype) const {
+        if (!grad_input || !grad_output) {
+            return INFINI_STATUS_BAD_PARAM;
+        }
+
+        size_t grad_input_nelem = info.batch * info.channels * info.input_dims[0];
+        if (info.ndim > 1) {
+            grad_input_nelem *= info.input_dims[1];
+        }
+        if (info.ndim > 2) {
+            grad_input_nelem *= info.input_dims[2];
+        }
+
+        switch (dtype) {
+        case INFINI_DTYPE_F32: {
+            float *typed_grad_input = static_cast<float *>(grad_input);
+            const float *typed_grad_output = static_cast<const float *>(grad_output);
+            _avgpool_backward_cpu(typed_grad_input, typed_grad_output);
+            break;
+        }
+        case INFINI_DTYPE_F16: {
+            float *typed_grad_input_f32 = static_cast<float *>(workspace);
+            const fp16_t *typed_grad_output = static_cast<const fp16_t *>(grad_output);
+
+            _avgpool_backward_cpu(typed_grad_input_f32, typed_grad_output);
+
+            fp16_t *typed_grad_input = static_cast<fp16_t *>(grad_input);
+#pragma omp parallel for
+            for (size_t i = 0; i < grad_input_nelem; ++i) {
+                typed_grad_input[i] = utils::cast<fp16_t>(typed_grad_input_f32[i]);
+            }
+            break;
+        }
+        case INFINI_DTYPE_BF16: {
+            float *typed_grad_input_f32 = static_cast<float *>(workspace);
+            const bf16_t *typed_grad_output = static_cast<const bf16_t *>(grad_output);
+
+            _avgpool_backward_cpu(typed_grad_input_f32, typed_grad_output);
+
+            bf16_t *typed_grad_input = static_cast<bf16_t *>(grad_input);
+#pragma omp parallel for
+            for (size_t i = 0; i < grad_input_nelem; ++i) {
+                typed_grad_input[i] = utils::cast<bf16_t>(typed_grad_input_f32[i]);
+            }
+            break;
+        }
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        return INFINI_STATUS_SUCCESS;
+    }
+};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+    }
+}
+
+inline size_t calculateOutputSize(const AvgPoolBackwardInfo &info) {
+    size_t size = info.batch * info.channels;
+    for (size_t i = 0; i < info.ndim; ++i) {
+        size *= info.input_dims[i];
+    }
+    return size;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t grad_input_desc,
+    infiniopTensorDescriptor_t grad_output_desc,
+    infiniopTensorDescriptor_t input_desc,
+    void *kernel_size,
+    void *strides,
+    void *pads,
+    bool ceil_mode) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = grad_input_desc->dtype();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16);
+
+    auto result = AvgPoolBackwardInfo::create(
+        grad_input_desc, grad_output_desc, input_desc, kernel_size, strides, pads, ceil_mode);
+    CHECK_RESULT(result);
+    auto info = result.take();
+
+    auto opaque_result = Opaque::create(handle, info);
+    CHECK_RESULT(opaque_result);
+    auto opaque = new Opaque(opaque_result.take());
+
+    size_t workspace_size = 0;
+    if (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16) {
+        workspace_size = calculateOutputSize(info) * sizeof(float);
+    }
+
+    *desc_ptr = new Descriptor(dtype, std::move(info), workspace_size,
+                               opaque, handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *grad_input,
+    const void *grad_output,
+    const void *input,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    return _opaque->calculate(workspace, workspace_size, grad_input, grad_output, input, _dtype);
+}
+
+} // namespace op::averagepool_backward::cpu
diff --git a/src/infiniop/ops/averagepool_backward/cpu/averagepool_backward_cpu.h b/src/infiniop/ops/averagepool_backward/cpu/averagepool_backward_cpu.h
new file mode 100644
index 000000000..f83f70cbc
--- /dev/null
+++ b/src/infiniop/ops/averagepool_backward/cpu/averagepool_backward_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __AVERAGEPOOL_BACKWARD_CPU_H__
+#define __AVERAGEPOOL_BACKWARD_CPU_H__
+
+#include "../averagepool_backward.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __AVERAGEPOOL_BACKWARD_CPU_H__
diff --git a/src/infiniop/ops/averagepool_backward/cuda/averagepool_backward_kernel.cuh b/src/infiniop/ops/averagepool_backward/cuda/averagepool_backward_kernel.cuh
new file mode 100644
index 000000000..0394d62dd
--- /dev/null
+++ b/src/infiniop/ops/averagepool_backward/cuda/averagepool_backward_kernel.cuh
@@ -0,0 +1,177 @@
+#ifndef __AVERAGEPOOL_BACKWARD_KERNEL_H__
+#define __AVERAGEPOOL_BACKWARD_KERNEL_H__
+
+#include <cuda_runtime.h>
+
+template <typename T>
+__global__ void
+avgpool1d_pytorch_backward_kernel(const T *grad_output, T *grad_input,
+                                  int batch_size, int channels,
+                                  int input_length, int output_length,
+                                  int kernel_size, int stride, int padding) {
+
+    int batch_idx = blockIdx.x;
+    int channel_idx = blockIdx.y;
+    int output_idx = blockIdx.z * blockDim.x + threadIdx.x;
+
+    if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= output_length) {
+        return;
+    }
+
+    const T *grad_output_ptr = grad_output + batch_idx * channels * output_length + channel_idx * output_length;
+    T *grad_input_ptr = grad_input + batch_idx * channels * input_length + channel_idx * input_length;
+
+    // Read the gradient value for this output element
+    float grad = static_cast<float>(grad_output_ptr[output_idx]);
+    int window_start = output_idx * stride - padding;
+
+    int pool_size = 0;
+    for (int k = 0; k < kernel_size; ++k) {
+        int input_pos = window_start + k;
+        if ((input_pos >= 0 && input_pos < input_length) || (input_pos >= -padding && input_pos < input_length + padding)) {
+            pool_size++;
+        }
+    }
+
+    // Guard against the degenerate case of an empty window (avoid division by zero)
+    if (pool_size == 0) {
+        return;
+    }
+
+    float grad_per_input = grad / static_cast<float>(pool_size);
+    for (int k = 0; k < kernel_size; ++k) {
+        int input_pos = window_start + k;
+        if (input_pos >= 0 && input_pos < input_length) {
+            // Atomically add the distributed gradient to the input gradient tensor
+            atomicAdd(&grad_input_ptr[input_pos], static_cast<T>(grad_per_input));
+        }
+    }
+}
+
+template <typename T>
+__global__ void avgpool2d_pytorch_backward_kernel(
+    const T *grad_output, T *grad_input, int batch_size, int channels,
+    int input_height, int input_width, int output_height, int output_width,
+    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
+    int pad_w) {
+
+    int batch_idx = blockIdx.x;
+    int channel_idx = blockIdx.y;
+    int output_idx = blockIdx.z * blockDim.x + threadIdx.x;
+
+    int total_output_elements = output_height * output_width;
+    if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) {
+        return;
+    }
+
+    // Convert the linear output index to 2D coordinates
+    int out_h = output_idx / output_width;
+    int out_w = output_idx % output_width;
+
+    const T *grad_output_ptr = grad_output + batch_idx * channels * total_output_elements + channel_idx * total_output_elements;
+    T *grad_input_ptr = grad_input + batch_idx * channels * input_height * input_width + channel_idx * input_height * input_width;
+
+    float grad = static_cast<float>(grad_output_ptr[output_idx]);
+    int window_start_h = out_h * stride_h - pad_h;
+    int window_start_w = out_w * stride_w - pad_w;
+
+    int pool_size = 0;
+    for (int kh = 0; kh < kernel_h; ++kh) {
+        for (int kw = 0; kw < kernel_w; ++kw) {
+            int input_h = window_start_h + kh;
+            int input_w = window_start_w + kw;
+            if ((input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) || (input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w)) {
+                pool_size++;
+            }
+        }
+    }
+
+    if (pool_size == 0) {
+        return;
+    }
+
+    float grad_per_input = grad / static_cast<float>(pool_size);
+
+    for (int kh = 0; kh < kernel_h; ++kh) {
+        for (int kw = 0; kw < kernel_w; ++kw) {
+            int input_h = window_start_h + kh;
+            int input_w = window_start_w + kw;
+
+            if (input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) {
+                int input_idx = input_h * input_width + input_w;
+                atomicAdd(&grad_input_ptr[input_idx], static_cast<T>(grad_per_input));
+            }
+        }
+    }
+}
+
+template <typename T>
+__global__ void avgpool3d_pytorch_backward_kernel(
+    const T *grad_output, T *grad_input, int batch_size, int channels,
+    int input_depth, int input_height, int input_width, int output_depth,
+    int output_height, int output_width, int kernel_d, int kernel_h,
+    int kernel_w, int stride_d, int stride_h, int stride_w, int pad_d,
+    int pad_h, int pad_w) {
+
+    int batch_idx = blockIdx.x;
+    int channel_idx = blockIdx.y;
+    int output_idx = blockIdx.z * blockDim.x + threadIdx.x;
+
+    int total_output_elements = output_depth * output_height * output_width;
+    if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) {
+        return;
+    }
+
+    // Convert the linear output index to 3D coordinates
+    int out_d = output_idx / (output_height * output_width);
+    int remaining = output_idx % (output_height * output_width);
+    int out_h = remaining / output_width;
+    int out_w = remaining % output_width;
+
+    int input_spatial_size = input_depth * input_height * input_width;
+    const T *grad_output_ptr = grad_output + batch_idx * channels * total_output_elements + channel_idx * total_output_elements;
+    T *grad_input_ptr = grad_input + batch_idx * channels * input_spatial_size + channel_idx * input_spatial_size;
+
+    float grad = static_cast<float>(grad_output_ptr[output_idx]);
+    int window_start_d = out_d * stride_d - pad_d;
+    int window_start_h = out_h * stride_h - pad_h;
+    int window_start_w = out_w * stride_w - pad_w;
+
+    int pool_size = 0;
+    for (int kd = 0; kd < kernel_d; ++kd) {
+        for (int kh = 0; kh < kernel_h; ++kh) {
+            for (int kw = 0; kw < kernel_w; ++kw) {
+                int input_d = window_start_d + kd;
+                int input_h = window_start_h + kh;
+                int input_w = window_start_w + kw;
+
+                if ((input_d >= 0 && input_d < input_depth && input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) || (input_d >= -pad_d && input_d < input_depth + pad_d && input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w)) {
+                    pool_size++;
+                }
+            }
+        }
+    }
+
+    if (pool_size == 0) {
+        return;
+    }
+
+    float grad_per_input = grad / static_cast<float>(pool_size);
+
+    for (int kd = 0; kd < kernel_d; ++kd) {
+        for (int kh = 0; kh < kernel_h; ++kh) {
+            for (int kw = 0; kw < kernel_w; ++kw) {
+                int input_d = window_start_d + kd;
+                int input_h = window_start_h + kh;
+                int input_w = window_start_w + kw;
+
+                if (input_d >= 0 && input_d < input_depth && input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) {
+                    int input_idx = (input_d * input_height + input_h) * input_width + input_w;
+                    atomicAdd(&grad_input_ptr[input_idx], static_cast<T>(grad_per_input));
+                }
+            }
+        }
+    }
+}
+
+#endif // __AVERAGEPOOL_BACKWARD_KERNEL_H__
diff --git a/src/infiniop/ops/averagepool_backward/info.h b/src/infiniop/ops/averagepool_backward/info.h
new file mode 100644
index 000000000..8927864b8
--- /dev/null
+++ b/src/infiniop/ops/averagepool_backward/info.h
@@ -0,0 +1,100 @@
+#ifndef __AVERAGEPOOL_BACKWARD_INFO_H__
+#define __AVERAGEPOOL_BACKWARD_INFO_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include <vector>
+
+namespace op::averagepool_backward {
+
+// Check whether ceil_mode introduces implicit padding in one spatial dimension
+inline bool hasImplicitPadding(
+    size_t input_size,
+    size_t kernel_size,
+    size_t stride,
+    size_t padding,
+    bool ceil_mode) {
+
+    if (!ceil_mode) {
+        return false;
+    }
+    return ((input_size + 2 * padding) - kernel_size) % stride != 0;
+}
+
+class AvgPoolBackwardInfo {
+    AvgPoolBackwardInfo() = default;
+
+public:
+    std::vector<size_t> input_dims;  // original input dimensions
+    std::vector<size_t> output_dims; // pooled output dimensions
+    std::vector<size_t> kernel_sizes;
+    std::vector<size_t> strides;
+    std::vector<size_t> pads;
+    bool ceil_mode;
+    size_t ndim;
+    size_t batch;
+    size_t channels;
+    bool has_implicit_padding = false;
+
+    static utils::Result<AvgPoolBackwardInfo> create(
+        infiniopTensorDescriptor_t grad_input_desc,  // gradient w.r.t. input
+        infiniopTensorDescriptor_t grad_output_desc, // gradient w.r.t. output
+        infiniopTensorDescriptor_t input_desc,       // original input from forward pass
+        void *kernel_size,
+        void *strides,
+        void *pads,
+        bool ceil_mode) {
+
+        AvgPoolBackwardInfo info;
+
+        if (input_desc->ndim() < 3 || input_desc->ndim() > 5) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        if (input_desc->ndim() != grad_input_desc->ndim() || grad_output_desc->ndim() != grad_input_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        if (input_desc->dim(0) != grad_input_desc->dim(0) || input_desc->dim(1) != grad_input_desc->dim(1) || grad_output_desc->dim(0) != grad_input_desc->dim(0) || grad_output_desc->dim(1) != grad_input_desc->dim(1)) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        for (size_t i = 2; i < input_desc->ndim(); ++i) {
+            if (input_desc->dim(i) != grad_input_desc->dim(i)) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        info.ndim = input_desc->ndim() - 2;
+        info.batch = input_desc->dim(0);
+        info.channels = input_desc->dim(1);
+        info.ceil_mode = ceil_mode;
+
+        auto kernel_ptr = reinterpret_cast<const size_t *>(kernel_size);
+        auto stride_ptr = reinterpret_cast<const size_t *>(strides);
+        auto pad_ptr = reinterpret_cast<const size_t *>(pads);
+
+        // Initialize the implicit-padding flag
+        info.has_implicit_padding = false;
+        for (size_t i = 0; i < info.ndim; ++i) {
+            info.input_dims.push_back(input_desc->dim(i + 2));
+            info.output_dims.push_back(grad_output_desc->dim(i + 2));
+            info.kernel_sizes.push_back(kernel_ptr[i]);
+            info.strides.push_back(stride_ptr[i]);
+            info.pads.push_back(pad_ptr[i]);
+
+            // Check whether the current dimension has implicit padding
+            if (hasImplicitPadding(info.input_dims[i], info.kernel_sizes[i],
+                                   info.strides[i], info.pads[i], info.ceil_mode)) {
+                info.has_implicit_padding = true;
+            }
+        }
+
+        return utils::Result<AvgPoolBackwardInfo>(std::move(info));
+    }
+};
+
+} // namespace op::averagepool_backward
+
+#endif // __AVERAGEPOOL_BACKWARD_INFO_H__
diff --git a/src/infiniop/ops/averagepool_backward/metax/averagepool_backward_metax.h b/src/infiniop/ops/averagepool_backward/metax/averagepool_backward_metax.h
new file mode 100644
index 000000000..65d1f25fc
--- /dev/null
+++ b/src/infiniop/ops/averagepool_backward/metax/averagepool_backward_metax.h
@@ -0,0 +1,8 @@
+#ifndef __AVERAGEPOOL_BACKWARD_METAX_H__
+#define __AVERAGEPOOL_BACKWARD_METAX_H__
+
+#include "../averagepool_backward.h"
+
+DESCRIPTOR(metax)
+
+#endif // __AVERAGEPOOL_BACKWARD_METAX_H__
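Aside on the has_implicit_padding flag defined in info.h above: it fires when ceil_mode makes the pooled output include a final window that is only partially covered by the input plus its explicit padding, and the Metax backend further below only builds its hcDNN descriptors when the flag is false, dispatching to the hand-written kernels otherwise. The standalone sketch below is not part of the patch; the helper name pooled_len, the example numbers, and the exact ceil-mode rule are assumptions based on PyTorch's documented pooling output-size formula, included only to illustrate what the ((input + 2 * pad) - kernel) % stride != 0 test detects.

#include <cstddef>
#include <cstdio>

// Pooled output length of one spatial dimension, PyTorch-style AvgPool
// (assumes kernel <= input + 2 * pad so the subtraction cannot underflow).
static size_t pooled_len(size_t in, size_t k, size_t s, size_t p, bool ceil_mode) {
    size_t num = in + 2 * p - k;
    size_t out = num / s + 1;          // floor mode
    if (ceil_mode && num % s != 0) {
        ++out;                         // ceil mode adds one partial window...
        if ((out - 1) * s >= in + p) { // ...unless it would start past input + pad
            --out;
        }
    }
    return out;
}

int main() {
    // in=6, k=3, s=2, p=1: floor gives 3 windows, ceil gives 4; the extra
    // window is what hasImplicitPadding() detects ((6 + 2 - 3) % 2 != 0).
    std::printf("floor: %zu, ceil: %zu\n",
                pooled_len(6, 3, 2, 1, false), pooled_len(6, 3, 2, 1, true));
    return 0;
}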
diff --git a/src/infiniop/ops/averagepool_backward/metax/averagepool_backward_metax.maca b/src/infiniop/ops/averagepool_backward/metax/averagepool_backward_metax.maca new file mode 100644 index 000000000..b11f24b99 --- /dev/null +++ b/src/infiniop/ops/averagepool_backward/metax/averagepool_backward_metax.maca @@ -0,0 +1,415 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "averagepool_backward_metax.h" +#include "../cuda/averagepool_backward_kernel.cuh" +#include + +// 自定义核函数 +infiniStatus_t launch_avgpool_pytorch_backward_kernel( + const op::averagepool_backward::AvgPoolBackwardInfo& info, + const void* grad_output, void* grad_input, + infiniDtype_t data_type, hcStream_t stream) { + + // 在累加梯度之前,必须将grad_input张量清零 + size_t grad_input_nelem = info.batch * info.channels; + for (size_t i = 0; i < info.ndim; ++i) grad_input_nelem *= info.input_dims[i]; + + size_t dtype_size = 0; + switch (data_type) { + case INFINI_DTYPE_F32: + dtype_size = sizeof(float); + break; + case INFINI_DTYPE_F16: + dtype_size = sizeof(half); + break; + case INFINI_DTYPE_BF16: + dtype_size = sizeof(__hpcc_bfloat16); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; // Or handle error + } + + size_t grad_input_bytes = grad_input_nelem * dtype_size; + hcMemsetAsync(grad_input, 0, grad_input_bytes, stream); + + int batch_size = static_cast(info.batch); + int channels = static_cast(info.channels); + + if (info.ndim == 1) { + int input_length = static_cast(info.input_dims[0]); + int output_length = static_cast(info.output_dims[0]); + int kernel_size = static_cast(info.kernel_sizes[0]); + int stride = static_cast(info.strides[0]); + int padding = static_cast(info.pads[0]); + + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (output_length + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool1d_pytorch_backward_kernel<<>>( + static_cast(grad_output), static_cast(grad_input), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + case INFINI_DTYPE_F16: + avgpool1d_pytorch_backward_kernel<<>>( + static_cast(grad_output), static_cast(grad_input), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + case INFINI_DTYPE_BF16: + avgpool1d_pytorch_backward_kernel<__hpcc_bfloat16><<>>( + static_cast(grad_output), static_cast<__hpcc_bfloat16*>(grad_input), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else if (info.ndim == 2) { + // 2D平均池化 - 后向 + int input_height = static_cast(info.input_dims[0]); + int input_width = static_cast(info.input_dims[1]); + int output_height = static_cast(info.output_dims[0]); + int output_width = static_cast(info.output_dims[1]); + int kernel_h = static_cast(info.kernel_sizes[0]); + int kernel_w = static_cast(info.kernel_sizes[1]); + int stride_h = static_cast(info.strides[0]); + int stride_w = static_cast(info.strides[1]); + int pad_h = static_cast(info.pads[0]); + int pad_w = static_cast(info.pads[1]); + + int total_output_elements = output_height * output_width; + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (total_output_elements + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool2d_pytorch_backward_kernel<<>>( + static_cast(grad_output), static_cast(grad_input), + batch_size, channels, input_height, input_width, + output_height, 
output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + case INFINI_DTYPE_F16: + avgpool2d_pytorch_backward_kernel<<>>( + static_cast(grad_output), static_cast(grad_input), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + case INFINI_DTYPE_BF16: + avgpool2d_pytorch_backward_kernel<__hpcc_bfloat16><<>>( + static_cast(grad_output), static_cast<__hpcc_bfloat16*>(grad_input), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else if (info.ndim == 3) { + // 3D平均池化 - 后向 + int input_depth = static_cast(info.input_dims[0]); + int input_height = static_cast(info.input_dims[1]); + int input_width = static_cast(info.input_dims[2]); + int output_depth = static_cast(info.output_dims[0]); + int output_height = static_cast(info.output_dims[1]); + int output_width = static_cast(info.output_dims[2]); + int kernel_d = static_cast(info.kernel_sizes[0]); + int kernel_h = static_cast(info.kernel_sizes[1]); + int kernel_w = static_cast(info.kernel_sizes[2]); + int stride_d = static_cast(info.strides[0]); + int stride_h = static_cast(info.strides[1]); + int stride_w = static_cast(info.strides[2]); + int pad_d = static_cast(info.pads[0]); + int pad_h = static_cast(info.pads[1]); + int pad_w = static_cast(info.pads[2]); + + int total_output_elements = output_depth * output_height * output_width; + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (total_output_elements + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool3d_pytorch_backward_kernel<<>>( + static_cast(grad_output), static_cast(grad_input), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + case INFINI_DTYPE_F16: + avgpool3d_pytorch_backward_kernel<<>>( + static_cast(grad_output), static_cast(grad_input), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + case INFINI_DTYPE_BF16: + avgpool3d_pytorch_backward_kernel<__hpcc_bfloat16><<>>( + static_cast(grad_output), static_cast<__hpcc_bfloat16*>(grad_input), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else { + return INFINI_STATUS_BAD_PARAM; + } + + return INFINI_STATUS_SUCCESS; +} + +#define DESTROY_hcdnn_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_hcdnn_DESCRIPTORS() \ + do { \ + DESTROY_hcdnn_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(grad_input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(grad_output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(pooling_backward_desc, \ + hcdnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool_backward::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + 
+#ifdef ENABLE_HCDNN_API + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t grad_input_desc = nullptr; + hcdnnTensorDescriptor_t grad_output_desc = nullptr; + hcdnnPoolingDescriptor_t pooling_backward_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + void calculateStrides(const std::vector &dims, std::vector &strides, + int ndim) const { + strides[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + strides[d] = strides[d + 1] * dims[d + 1]; + } + } + + infiniStatus_t createPoolingDescriptors(const AvgPoolBackwardInfo &info, + hcdnnDataType_t hcdnn_data_type) { + // 创建hcdnn描述符 + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_output_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&pooling_backward_desc)); + + // 构建输入、输出梯度维度(NCHW格式) + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + while (input_dims_vec.size() < 5) input_dims_vec.push_back(1); + while (output_dims_vec.size() < 5) output_dims_vec.push_back(1); + + // 计算内存步幅 + std::vector input_strides_vec(input_dims_vec.size()); + std::vector output_strides_vec(output_dims_vec.size()); + calculateStrides(input_dims_vec, input_strides_vec, input_dims_vec.size()); + calculateStrides(output_dims_vec, output_strides_vec, output_dims_vec.size()); + + // 设置张量描述符(带步幅) + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + input_desc, hcdnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_input_desc, hcdnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_output_desc, hcdnn_data_type, output_dims_vec.size(), + output_dims_vec.data(), output_strides_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolBackwardInfo &info) { + // 构建池化参数 + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + while (kernel_vec.size() < 3) kernel_vec.push_back(1); + while (stride_vec.size() < 3) stride_vec.push_back(1); + while (pad_vec.size() < 3) pad_vec.push_back(0); + + // 设置平均池化反向描述符 + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor( + pooling_backward_desc, + HCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, // 平均池化模式 + HCDNN_NOT_PROPAGATE_NAN, // 不传播NaN + kernel_vec.size(), + kernel_vec.data(), + pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeHcdnnContext(AvgPoolBackwardInfo &info, + infiniDtype_t data_type) { + hcdnnDataType_t hcdnn_data_type = device::metax::getHcdnnDtype(data_type); + + CHECK_STATUS(createPoolingDescriptors(info, hcdnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // 计算工作空间大小(需要存储前向输出用于反向计算) + CHECK_MCDNN(hcdnnGetTensorSizeInBytes(grad_output_desc, &workspace_size)); + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque 
&&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , input_desc(other.input_desc) + , grad_input_desc(other.grad_input_desc) + , grad_output_desc(other.grad_output_desc) + , pooling_backward_desc(other.pooling_backward_desc) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.grad_input_desc = nullptr; + other.grad_output_desc = nullptr; + other.pooling_backward_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_hcdnn_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolBackwardInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + if (!info.has_implicit_padding) { + auto status = opaque.initializeHcdnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + auto result = + AvgPoolBackwardInfo::create(grad_input_desc, grad_output_desc, input_desc, + kernel_size, strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, const void *grad_output, + const void *input, void *stream) const { +#ifdef ENABLE_HCDNN_API + if (_info.has_implicit_padding) { + return launch_avgpool_pytorch_backward_kernel( + _info, grad_output, grad_input, _dtype, (hcStream_t)stream); + } else { + const float alpha = 1.0f, beta = 0.0f; + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + CHECK_STATUS(_opaque->internal->useMcdnn( + (hcStream_t)stream, [&](hcdnnHandle_t handle) { + void *temp_output = workspace; + CHECK_MCDNN(hcdnnPoolingForward( + handle, _opaque->pooling_backward_desc, &alpha, + _opaque->input_desc, input, + &beta, + _opaque->grad_output_desc, temp_output)); + CHECK_MCDNN(hcdnnPoolingBackward( + handle, _opaque->pooling_backward_desc, &alpha, + _opaque->grad_output_desc, temp_output, + _opaque->grad_output_desc, grad_output, + _opaque->input_desc, input, + &beta, + _opaque->grad_input_desc, grad_input + )); + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; + } +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool_backward::metax diff --git a/src/infiniop/ops/averagepool_backward/nvidia/averagepool_backward_nvidia.cu b/src/infiniop/ops/averagepool_backward/nvidia/averagepool_backward_nvidia.cu new file mode 100644 index 000000000..71fcf95d5 --- /dev/null +++ 
b/src/infiniop/ops/averagepool_backward/nvidia/averagepool_backward_nvidia.cu @@ -0,0 +1,260 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "averagepool_backward_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_backward_desc, \ + cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool_backward::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t grad_input_desc = nullptr; + cudnnTensorDescriptor_t grad_output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_backward_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + void calculateStrides(const std::vector &dims, std::vector &strides, + int ndim) const { + strides[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + strides[d] = strides[d + 1] * dims[d + 1]; + } + } + + infiniStatus_t createPoolingDescriptors(const AvgPoolBackwardInfo &info, + cudnnDataType_t cudnn_data_type) { + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_backward_desc)); + + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + std::vector input_strides_vec(input_dims_vec.size()); + std::vector output_strides_vec(output_dims_vec.size()); + calculateStrides(input_dims_vec, input_strides_vec, input_dims_vec.size()); + calculateStrides(output_dims_vec, output_strides_vec, + output_dims_vec.size()); + + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + input_desc, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + grad_input_desc, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + grad_output_desc, cudnn_data_type, 
output_dims_vec.size(), + output_dims_vec.data(), output_strides_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolBackwardInfo &info) { + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_backward_desc, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, CUDNN_NOT_PROPAGATE_NAN, + kernel_vec.size(), kernel_vec.data(), pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(AvgPoolBackwardInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + CHECK_CUDNN(cudnnGetTensorSizeInBytes(grad_output_desc, &workspace_size)); + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_CUDNN_API + , + input_desc(other.input_desc), grad_input_desc(other.grad_input_desc), grad_output_desc(other.grad_output_desc), pooling_backward_desc(other.pooling_backward_desc) +#endif + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.grad_input_desc = nullptr; + other.grad_output_desc = nullptr; + other.pooling_backward_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolBackwardInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = AvgPoolBackwardInfo::create(grad_input_desc, grad_output_desc, input_desc, + kernel_size, strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, const void *grad_output, + const void *input, void *stream) 
const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + size_t grad_input_size = 0; + CHECK_CUDNN(cudnnGetTensorSizeInBytes(_opaque->grad_input_desc, + &grad_input_size)); + CHECK_CUDA(cudaMemset(grad_input, 0, grad_input_size)); + CHECK_CUDA(cudaMemset(workspace, 0, _workspace_size)); + + void *temp_output = workspace; + CHECK_CUDNN(cudnnPoolingForward( + handle, _opaque->pooling_backward_desc, &alpha, _opaque->input_desc, + input, &beta, _opaque->grad_output_desc, temp_output)); + + CHECK_CUDNN(cudnnPoolingBackward( + handle, _opaque->pooling_backward_desc, &alpha, + _opaque->grad_output_desc, temp_output, + _opaque->grad_output_desc, grad_output, + _opaque->input_desc, input, + &beta, + _opaque->grad_input_desc, grad_input)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool_backward::nvidia diff --git a/src/infiniop/ops/averagepool_backward/nvidia/averagepool_backward_nvidia.cuh b/src/infiniop/ops/averagepool_backward/nvidia/averagepool_backward_nvidia.cuh new file mode 100644 index 000000000..b4fa6661e --- /dev/null +++ b/src/infiniop/ops/averagepool_backward/nvidia/averagepool_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_BACKWARD_NVIDIA_CUH__ +#define __AVERAGEPOOL_BACKWARD_NVIDIA_CUH__ + +#include "../averagepool_backward.h" + +DESCRIPTOR(nvidia) + +#endif // __AVERAGEPOOL_BACKWARD_NVIDIA_CUH__ diff --git a/src/infiniop/ops/averagepool_backward/operator.cc b/src/infiniop/ops/averagepool_backward/operator.cc new file mode 100644 index 000000000..844c68601 --- /dev/null +++ b/src/infiniop/ops/averagepool_backward/operator.cc @@ -0,0 +1,159 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/averagepool_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/averagepool_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/averagepool_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/averagepool_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateAvgPoolBackwardDescriptor( + infiniopHandle_t handle, + infiniopAvgPoolBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::averagepool_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + grad_output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAvgPoolBackwardWorkspaceSize( + infiniopAvgPoolBackwardDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); 
\ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopAvgPoolBackward( + infiniopAvgPoolBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *grad_output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, \ + grad_input, \ + grad_output, \ + input, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyAvgPoolBackwardDescriptor(infiniopAvgPoolBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/conv_backward/conv_backward.h b/src/infiniop/ops/conv_backward/conv_backward.h new file mode 100644 index 000000000..1c105af8c --- /dev/null +++ b/src/infiniop/ops/conv_backward/conv_backward.h @@ -0,0 +1,47 @@ +#ifndef __CONV_BACKWARD_H__ +#define __CONV_BACKWARD_H__ + +#include "../../operator.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::conv_backward::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t grad_output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t weight_desc, \ + infiniopTensorDescriptor_t bias_desc, \ + void *pads, \ + void *strides, \ + void *dilations, \ + size_t groups); \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *grad_input, void *grad_weight, void *grad_bias, \ + const void *grad_output, \ + const void *input, const void *weight, \ + void *stream) const; \ + }; \ + } + +#endif // __CONV_BACKWARD_H__ diff --git a/src/infiniop/ops/conv_backward/cpu/conv_backward_cpu.cc 
b/src/infiniop/ops/conv_backward/cpu/conv_backward_cpu.cc new file mode 100644 index 000000000..e20ee140c --- /dev/null +++ b/src/infiniop/ops/conv_backward/cpu/conv_backward_cpu.cc @@ -0,0 +1,517 @@ +#include "conv_backward_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::conv_backward::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + op::conv_backward::ConvBackwardInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, + const op::conv_backward::ConvBackwardInfo &conv_info) + : handle(handle_ptr), info(conv_info) { + workspace_size = 0; + } + + // 递归函数:计算数据梯度的N维卷积反向传播 + template + void _applyDataGradient( + size_t grad_out_index, size_t weight_index, size_t grad_in_index, + size_t ndim, const GradOutData *grad_output, const WeightData *weight, + GradInData *grad_input, const size_t *grad_in_shape) const { + + if (ndim >= info.ndim + 2) { + // 到达最深层,执行实际计算 + // 始终使用float精度进行计算,避免半精度累积误差 + float grad_out_f32 = utils::cast(grad_output[grad_out_index]); + float weight_f32 = utils::cast(weight[weight_index]); + float current_grad_in = utils::cast(grad_input[grad_in_index]); + float result = current_grad_in + grad_out_f32 * weight_f32; + grad_input[grad_in_index] = utils::cast(result); + return; + } + + size_t dim_idx = ndim - 2; + size_t grad_out_dim = info.grad_output_dims[dim_idx]; + size_t weight_dim = info.weight_dims[dim_idx]; + size_t grad_in_dim = grad_in_shape[ndim]; + size_t stride = info.strides[dim_idx]; + size_t pad = info.pads[dim_idx]; + size_t dilation = info.dilations[dim_idx]; + + // 遍历输出维度 + for (size_t oh = 0; oh < grad_out_dim; ++oh) { + size_t curr_grad_out_index = grad_out_index * grad_out_dim + oh; + + // 遍历卷积核维度 + for (size_t kh = 0; kh < weight_dim; ++kh) { + size_t curr_weight_index = weight_index * weight_dim + kh; + + // 计算对应的输入位置 + int ih = static_cast(oh * stride + kh * dilation) - static_cast(pad); + + if (ih >= 0 && ih < static_cast(grad_in_dim)) { + size_t curr_grad_in_index = grad_in_index * grad_in_dim + ih; + + _applyDataGradient(curr_grad_out_index, curr_weight_index, curr_grad_in_index, + ndim + 1, grad_output, weight, grad_input, grad_in_shape); + } + } + } + } + + // 递归函数:计算权重梯度的N维卷积反向传播 + template + void _applyWeightGradient( + size_t input_index, size_t grad_out_index, size_t grad_weight_index, + size_t ndim, const InputData *input, const GradOutData *grad_output, + GradWeightData *grad_weight, const size_t *input_shape) const { + + if (ndim >= info.ndim + 2) { + // 到达最深层,执行实际计算 + // 始终使用float精度进行计算,避免半精度累积误差 + float input_f32 = utils::cast(input[input_index]); + float grad_out_f32 = utils::cast(grad_output[grad_out_index]); + float current_grad_weight = utils::cast(grad_weight[grad_weight_index]); + float result = current_grad_weight + input_f32 * grad_out_f32; + grad_weight[grad_weight_index] = utils::cast(result); + return; + } + + size_t dim_idx = ndim - 2; + size_t input_dim = input_shape[ndim]; + size_t grad_out_dim = info.grad_output_dims[dim_idx]; + size_t weight_dim = info.weight_dims[dim_idx]; + size_t stride = info.strides[dim_idx]; + size_t pad = info.pads[dim_idx]; + size_t dilation = info.dilations[dim_idx]; + + // 遍历卷积核维度 + for (size_t kh = 0; kh < weight_dim; ++kh) { + size_t curr_grad_weight_index = grad_weight_index * weight_dim + kh; + + // 遍历输出维度 + for (size_t oh = 0; oh < grad_out_dim; ++oh) { + size_t curr_grad_out_index = grad_out_index * 
grad_out_dim + oh; + + // 计算对应的输入位置 + int ih = static_cast(oh * stride + kh * dilation) - static_cast(pad); + + if (ih >= 0 && ih < static_cast(input_dim)) { + size_t curr_input_index = input_index * input_dim + ih; + + _applyWeightGradient(curr_input_index, curr_grad_out_index, curr_grad_weight_index, + ndim + 1, input, grad_output, grad_weight, input_shape); + } + } + } + } + + // 获取零值 + template + static T get_zero() { + if constexpr (std::is_same::value) { + return 0.0f; + } else if constexpr (std::is_same::value) { + return _f32_to_f16(0.0f); + } else if constexpr (std::is_same::value) { + return _f32_to_bf16(0.0f); + } else { + return T{}; + } + } + + // 计算数据梯度 (grad_input) - 使用更直接的实现避免递归 + template + void compute_data_gradient(GradInData *grad_input, const GradOutData *grad_output, + const WeightData *weight) const { + + size_t batch_size = info.batch; + size_t in_channels = info.in_channels; + size_t out_channels = info.out_channels; + size_t groups = info.groups; + size_t channels_per_group = in_channels / groups; + size_t out_channels_per_group = out_channels / groups; + + // 计算空间大小 + size_t input_spatial_size = 1; + size_t output_spatial_size = 1; + for (size_t i = 0; i < info.ndim; ++i) { + input_spatial_size *= info.input_dims[i]; + output_spatial_size *= info.grad_output_dims[i]; + } + + // 初始化为零 + size_t total_grad_input_size = batch_size * in_channels * input_spatial_size; + GradInData zero_val = get_zero(); + std::fill(grad_input, grad_input + total_grad_input_size, zero_val); + + // 对每个批次和组并行处理 +#pragma omp parallel for collapse(2) schedule(dynamic) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t g = 0; g < groups; ++g) { + // 对每个输出通道 + for (size_t oc = 0; oc < out_channels_per_group; ++oc) { + size_t abs_oc = g * out_channels_per_group + oc; + + // 对每个输入通道 + for (size_t ic = 0; ic < channels_per_group; ++ic) { + size_t abs_ic = g * channels_per_group + ic; + + // 对每个输出空间位置 + for (size_t out_spatial = 0; out_spatial < output_spatial_size; ++out_spatial) { + + // 将一维空间索引转换为多维坐标 + std::vector out_coords(info.ndim); + size_t temp = out_spatial; + for (int d = info.ndim - 1; d >= 0; --d) { + out_coords[d] = temp % info.grad_output_dims[d]; + temp /= info.grad_output_dims[d]; + } + + // 对每个卷积核空间位置 + size_t kernel_spatial_size = 1; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_spatial_size *= info.weight_dims[i]; + } + + for (size_t kernel_spatial = 0; kernel_spatial < kernel_spatial_size; ++kernel_spatial) { + + // 将一维卷积核索引转换为多维坐标 + std::vector kernel_coords(info.ndim); + temp = kernel_spatial; + for (int d = info.ndim - 1; d >= 0; --d) { + kernel_coords[d] = temp % info.weight_dims[d]; + temp /= info.weight_dims[d]; + } + + // 计算对应的输入坐标 + std::vector input_coords(info.ndim); + bool valid = true; + + for (size_t d = 0; d < info.ndim; ++d) { + input_coords[d] = static_cast(out_coords[d] * info.strides[d] + kernel_coords[d] * info.dilations[d]) - static_cast(info.pads[d]); + + if (input_coords[d] < 0 || input_coords[d] >= static_cast(info.input_dims[d])) { + valid = false; + break; + } + } + + if (valid) { + // 计算线性索引 + size_t grad_out_idx = b * out_channels * output_spatial_size + abs_oc * output_spatial_size + out_spatial; + + size_t weight_idx = abs_oc * channels_per_group * kernel_spatial_size + ic * kernel_spatial_size + kernel_spatial; + + size_t input_spatial_idx = 0; + size_t multiplier = 1; + for (int d = info.ndim - 1; d >= 0; --d) { + input_spatial_idx += input_coords[d] * multiplier; + multiplier *= info.input_dims[d]; + } + + size_t grad_in_idx = b * 
in_channels * input_spatial_size + abs_ic * input_spatial_size + input_spatial_idx; + + // 执行计算 + float grad_out_f32 = utils::cast(grad_output[grad_out_idx]); + float weight_f32 = utils::cast(weight[weight_idx]); + float current_grad_in = utils::cast(grad_input[grad_in_idx]); + float result = current_grad_in + grad_out_f32 * weight_f32; + grad_input[grad_in_idx] = utils::cast(result); + } + } + } + } + } + } + } + } + + // 计算权重梯度 (grad_weight) - 使用更直接的实现 + template + void compute_weight_gradient(GradWeightData *grad_weight, const GradOutData *grad_output, + const InputData *input) const { + + size_t batch_size = info.batch; + size_t in_channels = info.in_channels; + size_t out_channels = info.out_channels; + size_t groups = info.groups; + size_t channels_per_group = in_channels / groups; + size_t out_channels_per_group = out_channels / groups; + + // 计算空间大小 + size_t input_spatial_size = 1; + size_t output_spatial_size = 1; + size_t kernel_spatial_size = 1; + for (size_t i = 0; i < info.ndim; ++i) { + input_spatial_size *= info.input_dims[i]; + output_spatial_size *= info.grad_output_dims[i]; + kernel_spatial_size *= info.weight_dims[i]; + } + + // 初始化为零 + size_t total_weight_size = out_channels * channels_per_group * kernel_spatial_size; + GradWeightData zero_val = get_zero(); + std::fill(grad_weight, grad_weight + total_weight_size, zero_val); + + // 对每个权重元素并行处理 +#pragma omp parallel for collapse(3) schedule(dynamic) + for (size_t abs_oc = 0; abs_oc < out_channels; ++abs_oc) { + for (size_t ic = 0; ic < channels_per_group; ++ic) { + for (size_t kernel_spatial = 0; kernel_spatial < kernel_spatial_size; ++kernel_spatial) { + + size_t g = abs_oc / out_channels_per_group; + size_t abs_ic = g * channels_per_group + ic; + + // 将一维卷积核索引转换为多维坐标 + std::vector kernel_coords(info.ndim); + size_t temp = kernel_spatial; + for (int d = info.ndim - 1; d >= 0; --d) { + kernel_coords[d] = temp % info.weight_dims[d]; + temp /= info.weight_dims[d]; + } + + float accumulator = 0.0f; + + // 对所有批次和输出位置累积梯度 + for (size_t b = 0; b < batch_size; ++b) { + for (size_t out_spatial = 0; out_spatial < output_spatial_size; ++out_spatial) { + + // 将一维输出空间索引转换为多维坐标 + std::vector out_coords(info.ndim); + temp = out_spatial; + for (int d = info.ndim - 1; d >= 0; --d) { + out_coords[d] = temp % info.grad_output_dims[d]; + temp /= info.grad_output_dims[d]; + } + + // 计算对应的输入坐标 + std::vector input_coords(info.ndim); + bool valid = true; + + for (size_t d = 0; d < info.ndim; ++d) { + input_coords[d] = static_cast(out_coords[d] * info.strides[d] + kernel_coords[d] * info.dilations[d]) - static_cast(info.pads[d]); + + if (input_coords[d] < 0 || input_coords[d] >= static_cast(info.input_dims[d])) { + valid = false; + break; + } + } + + if (valid) { + // 计算线性索引 + size_t grad_out_idx = b * out_channels * output_spatial_size + abs_oc * output_spatial_size + out_spatial; + + size_t input_spatial_idx = 0; + size_t multiplier = 1; + for (int d = info.ndim - 1; d >= 0; --d) { + input_spatial_idx += input_coords[d] * multiplier; + multiplier *= info.input_dims[d]; + } + + size_t input_idx = b * in_channels * input_spatial_size + abs_ic * input_spatial_size + input_spatial_idx; + + // 累积梯度 + float input_f32 = utils::cast(input[input_idx]); + float grad_out_f32 = utils::cast(grad_output[grad_out_idx]); + accumulator += input_f32 * grad_out_f32; + } + } + } + + // 写入结果 + size_t weight_idx = abs_oc * channels_per_group * kernel_spatial_size + ic * kernel_spatial_size + kernel_spatial; + grad_weight[weight_idx] = utils::cast(accumulator); + } 
+ } + } + } + + // 计算偏置梯度 (grad_bias) + template + void compute_bias_gradient(GradBiasData *grad_bias, const GradOutData *grad_output) const { + size_t batch_size = info.batch; + size_t out_channels = info.out_channels; + + size_t output_spatial_size = 1; + for (size_t i = 0; i < info.ndim; ++i) { + output_spatial_size *= info.grad_output_dims[i]; + } + + // 并行处理每个输出通道 +#pragma omp parallel for + for (ptrdiff_t c = 0; c < static_cast(out_channels); ++c) { + float sum = 0.0f; + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t s = 0; s < output_spatial_size; ++s) { + size_t idx = b * out_channels * output_spatial_size + c * output_spatial_size + s; + sum += utils::cast(grad_output[idx]); + } + } + + grad_bias[c] = utils::cast(sum); + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + const op::conv_backward::ConvBackwardInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + // CPU 实现的卷积反向传播 + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *grad_input, void *grad_weight, void *grad_bias, + const void *grad_output, const void *input, + const void *weight, infiniDtype_t dtype) const { + + if (!grad_output || !input || !weight) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + const float *grad_output_f32 = static_cast(grad_output); + const float *input_f32 = static_cast(input); + const float *weight_f32 = static_cast(weight); + + if (grad_input) { + float *grad_input_f32 = static_cast(grad_input); + compute_data_gradient(grad_input_f32, grad_output_f32, weight_f32); + } + + if (grad_weight) { + float *grad_weight_f32 = static_cast(grad_weight); + compute_weight_gradient(grad_weight_f32, grad_output_f32, input_f32); + } + + if (grad_bias) { + float *grad_bias_f32 = static_cast(grad_bias); + compute_bias_gradient(grad_bias_f32, grad_output_f32); + } + break; + } + + case INFINI_DTYPE_F16: { + const fp16_t *grad_output_f16 = static_cast(grad_output); + const fp16_t *input_f16 = static_cast(input); + const fp16_t *weight_f16 = static_cast(weight); + + if (grad_input) { + fp16_t *grad_input_f16 = static_cast(grad_input); + compute_data_gradient(grad_input_f16, grad_output_f16, weight_f16); + } + + if (grad_weight) { + fp16_t *grad_weight_f16 = static_cast(grad_weight); + compute_weight_gradient(grad_weight_f16, grad_output_f16, input_f16); + } + + if (grad_bias) { + fp16_t *grad_bias_f16 = static_cast(grad_bias); + compute_bias_gradient(grad_bias_f16, grad_output_f16); + } + break; + } + + case INFINI_DTYPE_BF16: { + const bf16_t *grad_output_bf16 = static_cast(grad_output); + const bf16_t *input_bf16 = static_cast(input); + const bf16_t *weight_bf16 = static_cast(weight); + + if (grad_input) { + bf16_t *grad_input_bf16 = static_cast(grad_input); + compute_data_gradient(grad_input_bf16, grad_output_bf16, weight_bf16); + } + + if (grad_weight) { + bf16_t *grad_weight_bf16 = static_cast(grad_weight); + compute_weight_gradient(grad_weight_bf16, grad_output_bf16, input_bf16); + } + + if (grad_bias) { + bf16_t *grad_bias_bf16 = static_cast(grad_bias); + 
compute_bias_gradient(grad_bias_bf16, grad_output_bf16); + } + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + void *pads, void *strides, void *dilations, + size_t groups) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto info_result = op::conv_backward::ConvBackwardInfo::create( + grad_output_desc, input_desc, weight_desc, pads, strides, dilations, groups); + CHECK_RESULT(info_result); + auto info = info_result.take(); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, opaque->workspace_size, opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, void *grad_weight, + void *grad_bias, const void *grad_output, + const void *input, const void *weight, + void *stream) const { + return _opaque->calculate(workspace, workspace_size, grad_input, grad_weight, + grad_bias, grad_output, input, weight, _dtype); +} + +} // namespace op::conv_backward::cpu diff --git a/src/infiniop/ops/conv_backward/cpu/conv_backward_cpu.h b/src/infiniop/ops/conv_backward/cpu/conv_backward_cpu.h new file mode 100644 index 000000000..6adb485bf --- /dev/null +++ b/src/infiniop/ops/conv_backward/cpu/conv_backward_cpu.h @@ -0,0 +1,8 @@ +#ifndef __CONV_BACKWARD_CPU_H__ +#define __CONV_BACKWARD_CPU_H__ + +#include "../conv_backward.h" + +DESCRIPTOR(cpu) + +#endif // __CONV_BACKWARD_CPU_H__ diff --git a/src/infiniop/ops/conv_backward/cuda/bias_grad_kernel.cuh b/src/infiniop/ops/conv_backward/cuda/bias_grad_kernel.cuh new file mode 100644 index 000000000..63aa9967c --- /dev/null +++ b/src/infiniop/ops/conv_backward/cuda/bias_grad_kernel.cuh @@ -0,0 +1,27 @@ +#ifndef __GRAD_CUDA_H__ +#define __GRAD_CUDA_H__ + +#include + +// 特化模板:对于 bf16 类型,使用 float 进行累加以保持精度 +template +__global__ void compute_bias_grad_kernel(const T *grad_output, T *grad_bias, + int batch_size, int channels, + int spatial_size) { + int c = blockIdx.x * blockDim.x + threadIdx.x; + if (c >= channels) { + return; + } + + // 使用 float 进行累加以保持精度 + float sum = 0.0f; + for (int n = 0; n < batch_size; n++) { + for (int s = 0; s < spatial_size; s++) { + int idx = n * channels * spatial_size + c * spatial_size + s; + sum += static_cast(grad_output[idx]); + } + } + grad_bias[c] = static_cast(sum); +} + +#endif // __GRAD_CUDA_H__ diff --git a/src/infiniop/ops/conv_backward/info.h b/src/infiniop/ops/conv_backward/info.h new file mode 100644 index 000000000..2e412f0c2 --- /dev/null +++ b/src/infiniop/ops/conv_backward/info.h @@ -0,0 +1,70 @@ +#ifndef __CONV_BACKWARD_INFO_H__ +#define __CONV_BACKWARD_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +namespace op::conv_backward { + +class ConvBackwardInfo { + ConvBackwardInfo() = default; + +public: + size_t ndim; + size_t batch; + size_t in_channels; + size_t out_channels; + size_t groups; + std::vector 
input_dims; + std::vector weight_dims; + std::vector grad_output_dims; + std::vector pads; + std::vector strides; + std::vector dilations; + + static utils::Result create( + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + void *pads, + void *strides, + void *dilations, + size_t groups) { + ConvBackwardInfo info; + info.ndim = input_desc->ndim() - 2; + info.batch = input_desc->dim(0); + info.in_channels = input_desc->dim(1); + info.out_channels = weight_desc->dim(0); + info.groups = groups; + // 校验维度 + if (input_desc->ndim() != weight_desc->ndim() || input_desc->ndim() != grad_output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (input_desc->dim(0) != grad_output_desc->dim(0) || weight_desc->dim(0) != grad_output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + for (size_t i = 2; i < input_desc->ndim(); ++i) { + info.input_dims.push_back(input_desc->dim(i)); + info.weight_dims.push_back(weight_desc->dim(i)); + info.grad_output_dims.push_back(grad_output_desc->dim(i)); + } + + auto pad_ptr = reinterpret_cast(pads); + auto stride_ptr = reinterpret_cast(strides); + auto dilation_ptr = reinterpret_cast(dilations); + + for (size_t i = 0; i < info.ndim; ++i) { + info.pads.push_back(pad_ptr ? static_cast(pad_ptr[i]) : 0); + info.strides.push_back(stride_ptr ? static_cast(stride_ptr[i]) : 1); + info.dilations.push_back(dilation_ptr ? static_cast(dilation_ptr[i]) : 1); + } + return utils::Result(std::move(info)); + } +}; + +} // namespace op::conv_backward + +#endif // __CONV_BACKWARD_INFO_H__ diff --git a/src/infiniop/ops/conv_backward/metax/conv_backward_metax.h b/src/infiniop/ops/conv_backward/metax/conv_backward_metax.h new file mode 100644 index 000000000..d0fd76b9c --- /dev/null +++ b/src/infiniop/ops/conv_backward/metax/conv_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __CONV_BACKWARD_METAX_H__ +#define __CONV_BACKWARD_METAX_H__ + +#include "../conv_backward.h" + +DESCRIPTOR(metax) + +#endif // __CONV_BACKWARD_METAX_H__ diff --git a/src/infiniop/ops/conv_backward/metax/conv_backward_metax.maca b/src/infiniop/ops/conv_backward/metax/conv_backward_metax.maca new file mode 100644 index 000000000..597fc6e08 --- /dev/null +++ b/src/infiniop/ops/conv_backward/metax/conv_backward_metax.maca @@ -0,0 +1,451 @@ +#include "conv_backward_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "../cuda/bias_grad_kernel.cuh" +#include "../info.h" + +infiniStatus_t launch_bias_grad_kernel(const void *grad_output, void *grad_bias, + const int *grad_output_dims, + size_t conv_ndim, + hcdnnDataType_t data_type, + hcStream_t stream) { + // 只处理 bf16 类型 + if (data_type != HCDNN_DATA_BFLOAT16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + int batch_size = grad_output_dims[0]; + int channels = grad_output_dims[1]; + int spatial_size = 1; + + // 计算空间维度大小 + for (size_t i = 2; i < conv_ndim + 2; ++i) { + spatial_size *= grad_output_dims[i]; + } + + dim3 block(256); + dim3 grid((channels + block.x - 1) / block.x); + + // 直接调用 bf16 kernel + compute_bias_grad_kernel<__hpcc_bfloat16><<>>( + reinterpret_cast(grad_output), + reinterpret_cast<__hpcc_bfloat16 *>(grad_bias), batch_size, channels, + spatial_size); + + return INFINI_STATUS_SUCCESS; +} + +#define DESTROY_HCDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_HCDNN_DESCRIPTORS() 
\ + do { \ + DESTROY_HCDNN_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(grad_output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(weight_desc, hcdnnDestroyFilterDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(grad_input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(grad_weight_desc, hcdnnDestroyFilterDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(grad_bias_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(conv_desc, hcdnnDestroyConvolutionDescriptor); \ + } while (0) + +namespace op::conv_backward::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_HCDNN_API + // hcdnn描述符(对应cudnn描述符) + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t grad_output_desc = nullptr; + hcdnnFilterDescriptor_t weight_desc = nullptr; + hcdnnTensorDescriptor_t grad_input_desc = nullptr; + hcdnnFilterDescriptor_t grad_weight_desc = nullptr; + hcdnnTensorDescriptor_t grad_bias_desc = nullptr; + hcdnnConvolutionDescriptor_t conv_desc = nullptr; + + // 反向数据和滤波器算法 + hcdnnConvolutionBwdDataAlgo_t bwd_data_algo; + hcdnnConvolutionBwdFilterAlgo_t bwd_filter_algo; + size_t bwd_data_workspace_size = 0; + size_t bwd_filter_workspace_size = 0; + size_t conv_ndim = 0; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + infiniStatus_t gethcdnnDataType(infiniDtype_t data_type, + hcdnnDataType_t &hcdnn_data_type) const { + switch (data_type) { + case INFINI_DTYPE_F16: + hcdnn_data_type = HCDNN_DATA_HALF; + break; + case INFINI_DTYPE_F32: + hcdnn_data_type = HCDNN_DATA_FLOAT; + break; + case INFINI_DTYPE_BF16: + hcdnn_data_type = HCDNN_DATA_BFLOAT16; + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + // 计算张量步幅(与cuDNN逻辑一致,从最后一维开始计算) + infiniStatus_t calculateStrides(int ndim, const int *dims, std::vector &strides) const { + strides.resize(ndim); + strides[ndim - 1] = 1; // 最后一维步幅为1 + for (int i = ndim - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[i + 1]; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createTensorAndFilterDescriptors( + const op::conv_backward::ConvBackwardInfo &info, + hcdnnDataType_t hcdnn_data_type, + infiniopTensorDescriptor_t bias_desc) { + + int ndim = static_cast(info.ndim + 2); + + std::vector input_dims = {static_cast(info.batch), static_cast(info.in_channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims.push_back(static_cast(info.input_dims[i])); + } + std::vector input_strides; + CHECK_STATUS(calculateStrides(ndim, input_dims.data(), input_strides)); + + std::vector grad_output_dims = {static_cast(info.batch), static_cast(info.out_channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + grad_output_dims.push_back(static_cast(info.grad_output_dims[i])); + } + std::vector grad_output_strides; + CHECK_STATUS(calculateStrides(ndim, grad_output_dims.data(), grad_output_strides)); + + std::vector weight_dims = {static_cast(info.out_channels), static_cast(info.in_channels / info.groups)}; + for (size_t i = 0; i < info.ndim; ++i) { + weight_dims.push_back(static_cast(info.weight_dims[i])); + } + + if (info.ndim == 1) { + input_dims.push_back(1); + input_strides.push_back(1); + grad_output_dims.push_back(1); + grad_output_strides.push_back(1); + weight_dims.push_back(1); + } + + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + input_desc, 
hcdnn_data_type, input_dims.size(), input_dims.data(), input_strides.data())); + + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_output_desc)); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_output_desc, hcdnn_data_type, grad_output_dims.size(), grad_output_dims.data(), grad_output_strides.data())); + + CHECK_MCDNN(hcdnnCreateFilterDescriptor(&weight_desc)); + CHECK_MCDNN(hcdnnSetFilterNdDescriptor( + weight_desc, hcdnn_data_type, HCDNN_TENSOR_NCHW, weight_dims.size(), weight_dims.data())); + + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_input_desc)); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_input_desc, hcdnn_data_type, input_dims.size(), input_dims.data(), input_strides.data())); + + CHECK_MCDNN(hcdnnCreateFilterDescriptor(&grad_weight_desc)); + CHECK_MCDNN(hcdnnSetFilterNdDescriptor( + grad_weight_desc, hcdnn_data_type, HCDNN_TENSOR_NCHW, weight_dims.size(), weight_dims.data())); + + if (bias_desc) { + int bias_ndim = (info.ndim == 1) ? 4 : ndim; + std::vector bias_dims(bias_ndim, 1); + bias_dims[1] = static_cast(bias_desc->dim(0)); + + std::vector bias_strides(bias_ndim, 1); + for (int i = bias_ndim - 2; i >= 0; --i) { + bias_strides[i] = bias_strides[i + 1] * bias_dims[i + 1]; + } + + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_bias_desc)); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_bias_desc, hcdnn_data_type, bias_ndim, bias_dims.data(), bias_strides.data())); + } + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createConvDescriptor(const op::conv_backward::ConvBackwardInfo &info, + hcdnnDataType_t hcdnn_data_type) { + int conv_dim = (info.ndim == 1) ? 2 : static_cast(info.ndim); // 1D卷积按2D处理 + std::vector pad_vec(info.pads.begin(), info.pads.end()); + std::vector stride_vec(info.strides.begin(), info.strides.end()); + std::vector dilation_vec(info.dilations.begin(), info.dilations.end()); + + if (info.ndim == 1) { + pad_vec.push_back(0); + stride_vec.push_back(1); + dilation_vec.push_back(1); + } + + CHECK_MCDNN(hcdnnCreateConvolutionDescriptor(&conv_desc)); + hcdnnDataType_t compute_type = (hcdnn_data_type == HCDNN_DATA_HALF || hcdnn_data_type == HCDNN_DATA_BFLOAT16) + ? HCDNN_DATA_FLOAT + : hcdnn_data_type; + CHECK_MCDNN(hcdnnSetConvolutionNdDescriptor( + conv_desc, conv_dim, pad_vec.data(), stride_vec.data(), + dilation_vec.data(), HCDNN_CROSS_CORRELATION, compute_type)); + CHECK_MCDNN(hcdnnSetConvolutionGroupCount(conv_desc, static_cast(info.groups))); + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializehcdnnContext( + const op::conv_backward::ConvBackwardInfo &info, + infiniDtype_t data_type, + infiniopTensorDescriptor_t bias_desc) { + hcdnnDataType_t hcdnn_data_type; + CHECK_STATUS(gethcdnnDataType(data_type, hcdnn_data_type)); + CHECK_STATUS(createTensorAndFilterDescriptors(info, hcdnn_data_type, bias_desc)); + CHECK_STATUS(createConvDescriptor(info, hcdnn_data_type)); + + internal->useMcdnn(nullptr, [&](hcdnnHandle_t h) { + // 1. 
查找反向数据算法 + int requested_algo_count = 8; + int returned_algo_count = 0; + hcdnnConvolutionBwdDataAlgoPerf_t bwd_data_perf[8]; + + hcdnnStatus_t status = hcdnnFindConvolutionBackwardDataAlgorithm( + h, weight_desc, grad_output_desc, conv_desc, grad_input_desc, + requested_algo_count, &returned_algo_count, bwd_data_perf); + + bool found = false; + if (status == HCDNN_STATUS_SUCCESS && returned_algo_count > 0) { + for (int i = 0; i < returned_algo_count; i++) { + if (bwd_data_perf[i].status == HCDNN_STATUS_SUCCESS) { + bwd_data_algo = bwd_data_perf[i].algo; + bwd_data_workspace_size = bwd_data_perf[i].memory; + found = true; + break; + } + } + } + if (!found) { + // 未找到有效算法,使用默认算法 + bwd_data_algo = HCDNN_CONVOLUTION_BWD_DATA_ALGO_1; + CHECK_MCDNN(hcdnnGetConvolutionBackwardDataWorkspaceSize( + h, weight_desc, grad_output_desc, conv_desc, grad_input_desc, + bwd_data_algo, &bwd_data_workspace_size)); + } + + // 2. 查找反向权重算法 + hcdnnConvolutionBwdFilterAlgoPerf_t bwd_filter_perf[8]; + status = hcdnnFindConvolutionBackwardFilterAlgorithm( + h, input_desc, grad_output_desc, conv_desc, grad_weight_desc, + requested_algo_count, &returned_algo_count, bwd_filter_perf); + + found = false; + if (status == HCDNN_STATUS_SUCCESS && returned_algo_count > 0) { + for (int i = 0; i < returned_algo_count; i++) { + if (bwd_filter_perf[i].status == HCDNN_STATUS_SUCCESS) { + bwd_filter_algo = bwd_filter_perf[i].algo; + bwd_filter_workspace_size = bwd_filter_perf[i].memory; + found = true; + break; + } + } + } + if (!found) { + // 未找到有效算法,使用默认算法 + bwd_filter_algo = HCDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + CHECK_MCDNN(hcdnnGetConvolutionBackwardFilterWorkspaceSize( + h, input_desc, grad_output_desc, conv_desc, grad_weight_desc, + bwd_filter_algo, &bwd_filter_workspace_size)); + } + return INFINI_STATUS_SUCCESS; + }); + + // 工作空间大小取两者最大值 + workspace_size = std::max(bwd_data_workspace_size, bwd_filter_workspace_size); + conv_ndim = info.ndim; + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , + input_desc(other.input_desc), grad_output_desc(other.grad_output_desc), weight_desc(other.weight_desc), grad_input_desc(other.grad_input_desc), grad_weight_desc(other.grad_weight_desc), grad_bias_desc(other.grad_bias_desc), conv_desc(other.conv_desc), bwd_data_algo(other.bwd_data_algo), bwd_filter_algo(other.bwd_filter_algo), bwd_data_workspace_size(other.bwd_data_workspace_size), bwd_filter_workspace_size(other.bwd_filter_workspace_size) + , conv_ndim(other.conv_ndim) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.grad_output_desc = nullptr; + other.weight_desc = nullptr; + other.grad_input_desc = nullptr; + other.grad_weight_desc = nullptr; + other.grad_bias_desc = nullptr; + other.conv_desc = nullptr; + other.bwd_data_algo = static_cast(0); + other.bwd_filter_algo = static_cast(0); + other.bwd_data_workspace_size = 0; + other.bwd_filter_workspace_size = 0; + other.conv_ndim = 0; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_HCDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + const op::conv_backward::ConvBackwardInfo &info, + infiniDtype_t data_type, infiniopTensorDescriptor_t bias_desc) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializehcdnnContext(info, data_type, bias_desc); + if (status != INFINI_STATUS_SUCCESS) { + 
return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + void *pads, void *strides, void *dilations, + size_t groups) { +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto info_result = op::conv_backward::ConvBackwardInfo::create( + grad_output_desc, input_desc, weight_desc, pads, strides, dilations, groups); + CHECK_RESULT(info_result); + auto info = info_result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype, bias_desc); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, opaque->workspace_size, opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, void *grad_weight, + void *grad_bias, const void *grad_output, + const void *input, const void *weight, + void *stream) const { +#ifdef ENABLE_HCDNN_API + const float alpha = 1.0f, beta = 0.0f; + auto internal = _opaque->internal; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + CHECK_STATUS(_opaque->internal->useMcdnn((hcStream_t)stream, [&](hcdnnHandle_t handle) { + if (!grad_input || !grad_weight || !grad_output || !input || !weight) { + return INFINI_STATUS_BAD_PARAM; + } + + CHECK_MCDNN(hcdnnConvolutionBackwardData( + handle, + &alpha, + _opaque->weight_desc, + weight, + _opaque->grad_output_desc, + grad_output, + _opaque->conv_desc, + _opaque->bwd_data_algo, + workspace, + _opaque->bwd_data_workspace_size, + &beta, + _opaque->grad_input_desc, + grad_input)); + + CHECK_MCDNN(hcdnnConvolutionBackwardFilter( + handle, + &alpha, + _opaque->input_desc, + input, + _opaque->grad_output_desc, + grad_output, + _opaque->conv_desc, + _opaque->bwd_filter_algo, + workspace, + _opaque->bwd_filter_workspace_size, + &beta, + _opaque->grad_weight_desc, + grad_weight)); + + if (_opaque->grad_bias_desc && grad_bias) { + hcdnnDataType_t grad_output_type; + int grad_output_nbDims; + int grad_output_dims[5], grad_output_strides[5]; + + int query_ndim = (_opaque->conv_ndim == 3) ? 
5 : 4; + + hcdnnStatus_t status = hcdnnGetTensorNdDescriptor( + _opaque->grad_output_desc, query_ndim, &grad_output_type, + &grad_output_nbDims, grad_output_dims, grad_output_strides); + if (grad_output_type == HCDNN_DATA_BFLOAT16) { + CHECK_STATUS(launch_bias_grad_kernel( + grad_output, grad_bias, grad_output_dims, _opaque->conv_ndim, + grad_output_type, (hcStream_t)stream)); + } else { + CHECK_MCDNN(hcdnnConvolutionBackwardBias( + handle, + &alpha, + _opaque->grad_output_desc, + grad_output, + &beta, + _opaque->grad_bias_desc, + grad_bias)); + } + } + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::conv_backward::metax diff --git a/src/infiniop/ops/conv_backward/nvidia/conv_backward_nvidia.cu b/src/infiniop/ops/conv_backward/nvidia/conv_backward_nvidia.cu new file mode 100644 index 000000000..29e810a4c --- /dev/null +++ b/src/infiniop/ops/conv_backward/nvidia/conv_backward_nvidia.cu @@ -0,0 +1,452 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../cuda/bias_grad_kernel.cuh" +#include "../info.h" +#include "conv_backward_nvidia.cuh" + +infiniStatus_t launch_bias_grad_kernel(const void *grad_output, void *grad_bias, + const int *grad_output_dims, + size_t conv_ndim, + cudnnDataType_t data_type, + cudaStream_t stream) { + int batch_size = grad_output_dims[0]; + int channels = grad_output_dims[1]; + int spatial_size = 1; + + for (size_t i = 2; i < conv_ndim + 2; ++i) { + spatial_size *= grad_output_dims[i]; + } + + dim3 block(256); + dim3 grid((channels + block.x - 1) / block.x); + + // 直接调用 bf16 kernel + compute_bias_grad_kernel<__nv_bfloat16><<>>( + reinterpret_cast(grad_output), + reinterpret_cast<__nv_bfloat16 *>(grad_bias), batch_size, channels, + spatial_size); + + return INFINI_STATUS_SUCCESS; +} + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(weight_desc, cudnnDestroyFilterDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_weight_desc, cudnnDestroyFilterDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_bias_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(conv_desc, cudnnDestroyConvolutionDescriptor); \ + } while (0) + +namespace op::conv_backward::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t grad_output_desc = nullptr; + cudnnFilterDescriptor_t weight_desc = nullptr; + cudnnTensorDescriptor_t grad_input_desc = nullptr; + cudnnFilterDescriptor_t grad_weight_desc = nullptr; + cudnnTensorDescriptor_t grad_bias_desc = nullptr; + cudnnConvolutionDescriptor_t conv_desc = nullptr; + + cudnnConvolutionBwdDataAlgo_t bwd_data_algo; + cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo; + size_t bwd_data_workspace_size = 0; + size_t bwd_filter_workspace_size = 0; + size_t conv_ndim = 0; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t 
data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16 || data_type == INFINI_DTYPE_F32 || data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + return INFINI_STATUS_SUCCESS; + } + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + infiniStatus_t calculateStrides(int ndim, const int *input_dims, + std::vector &input_strides) const { + input_strides.resize(ndim); + input_strides[ndim - 1] = 1; // 最后一维 stride = 1 + for (int i = ndim - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createTensorAndFilterDescriptors( + const op::conv_backward::ConvBackwardInfo &info, + cudnnDataType_t cudnn_data_type, infiniopTensorDescriptor_t bias_desc) { + + int ndim = static_cast(info.ndim + 2); + + // input + std::vector input_dims = {static_cast(info.batch), + static_cast(info.in_channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims.push_back(static_cast(info.input_dims[i])); + } + std::vector input_strides; + CHECK_STATUS(calculateStrides(ndim, input_dims.data(), input_strides)); + + // grad_output + std::vector grad_output_dims = {static_cast(info.batch), + static_cast(info.out_channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + grad_output_dims.push_back(static_cast(info.grad_output_dims[i])); + } + std::vector grad_output_strides; + CHECK_STATUS( + calculateStrides(ndim, grad_output_dims.data(), grad_output_strides)); + + // weight + size_t in_channels_per_group = info.in_channels / info.groups; + std::vector weight_dims = {static_cast(info.out_channels), + static_cast(in_channels_per_group)}; + for (size_t i = 0; i < info.ndim; ++i) { + weight_dims.push_back(static_cast(info.weight_dims[i])); + } + + if (info.ndim == 1) { + input_dims.push_back(1); + input_strides.push_back(1); + grad_output_dims.push_back(1); + grad_output_strides.push_back(1); + weight_dims.push_back(1); + } + + // input + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnSetTensorNdDescriptor(input_desc, cudnn_data_type, + input_dims.size(), input_dims.data(), + input_strides.data())); + + // grad_output + CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_output_desc)); + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + grad_output_desc, cudnn_data_type, grad_output_dims.size(), + grad_output_dims.data(), grad_output_strides.data())); + + // weight + CHECK_CUDNN(cudnnCreateFilterDescriptor(&weight_desc)); + CHECK_CUDNN(cudnnSetFilterNdDescriptor( + weight_desc, cudnn_data_type, CUDNN_TENSOR_NCHW, weight_dims.size(), + weight_dims.data())); + + // grad_input + CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_input_desc)); + CHECK_CUDNN(cudnnSetTensorNdDescriptor(grad_input_desc, cudnn_data_type, + input_dims.size(), input_dims.data(), + input_strides.data())); + + // grad_weight + CHECK_CUDNN(cudnnCreateFilterDescriptor(&grad_weight_desc)); + CHECK_CUDNN(cudnnSetFilterNdDescriptor( + grad_weight_desc, cudnn_data_type, CUDNN_TENSOR_NCHW, + weight_dims.size(), weight_dims.data())); + + // grad_bias (optional) + if (bias_desc) { + int bias_ndim = (info.ndim == 1) ? 
4 : ndim; + + std::vector bias_dims(bias_ndim, 1); + bias_dims[1] = static_cast(bias_desc->dim(0)); // out_channels + + std::vector bias_strides(bias_ndim, 1); + for (int i = bias_ndim - 2; i >= 0; --i) { + bias_strides[i] = bias_strides[i + 1] * bias_dims[i + 1]; + } + + CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_bias_desc)); + CHECK_CUDNN(cudnnSetTensorNdDescriptor(grad_bias_desc, cudnn_data_type, + bias_ndim, bias_dims.data(), + bias_strides.data())); + } + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t + createConvDescriptor(const op::conv_backward::ConvBackwardInfo &info, + cudnnDataType_t cudnn_data_type) { + int conv_dim = (info.ndim == 1) ? 2 : static_cast(info.ndim); + std::vector pad_vec(info.pads.begin(), info.pads.end()); + std::vector stride_vec(info.strides.begin(), info.strides.end()); + std::vector dilation_vec(info.dilations.begin(), info.dilations.end()); + + if (info.ndim == 1) { + pad_vec.push_back(0); + stride_vec.push_back(1); + dilation_vec.push_back(1); + } + + CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_desc)); + cudnnDataType_t compute_type = (cudnn_data_type == CUDNN_DATA_BFLOAT16 || cudnn_data_type == CUDNN_DATA_HALF) + ? CUDNN_DATA_FLOAT + : cudnn_data_type; + CHECK_CUDNN(cudnnSetConvolutionNdDescriptor( + conv_desc, conv_dim, pad_vec.data(), stride_vec.data(), + dilation_vec.data(), CUDNN_CROSS_CORRELATION, compute_type)); + CHECK_CUDNN(cudnnSetConvolutionGroupCount(conv_desc, + static_cast(info.groups))); + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t + initializeCudnnContext(const op::conv_backward::ConvBackwardInfo &info, + infiniDtype_t data_type, + infiniopTensorDescriptor_t bias_desc) { + + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + CHECK_STATUS( + createTensorAndFilterDescriptors(info, cudnn_data_type, bias_desc)); + CHECK_STATUS(createConvDescriptor(info, cudnn_data_type)); + + // Query workspace size + internal->useCudnn(nullptr, [&](cudnnHandle_t h) { + // 1. 查找适合的反向数据算法 + int requestedAlgoCount = 8; + int returnedAlgoCount = 0; + cudnnConvolutionBwdDataAlgoPerf_t bwd_data_perf[8]; + + cudnnStatus_t status = cudnnFindConvolutionBackwardDataAlgorithm( + h, weight_desc, grad_output_desc, conv_desc, grad_input_desc, + requestedAlgoCount, &returnedAlgoCount, bwd_data_perf); + bool found = false; + if (status == CUDNN_STATUS_SUCCESS && returnedAlgoCount > 0) { + for (int i = 0; i < returnedAlgoCount; i++) { + if (bwd_data_perf[i].status == CUDNN_STATUS_SUCCESS) { + bwd_data_algo = bwd_data_perf[i].algo; + bwd_data_workspace_size = bwd_data_perf[i].memory; + found = true; + break; + } + } + if (!found) { + // 如果没找到成功的算法,用默认的 + bwd_data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + cudnnGetConvolutionBackwardDataWorkspaceSize( + h, weight_desc, grad_output_desc, conv_desc, grad_input_desc, + bwd_data_algo, &bwd_data_workspace_size); + } + } else { + // 查找失败,回退到默认算法 + bwd_data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + cudnnGetConvolutionBackwardDataWorkspaceSize( + h, weight_desc, grad_output_desc, conv_desc, grad_input_desc, + bwd_data_algo, &bwd_data_workspace_size); + } + + // 2. 
查找适合的反向权重算法 + cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_perf[8]; + + status = cudnnFindConvolutionBackwardFilterAlgorithm( + h, input_desc, grad_output_desc, conv_desc, grad_weight_desc, + requestedAlgoCount, &returnedAlgoCount, bwd_filter_perf); + + if (status == CUDNN_STATUS_SUCCESS && returnedAlgoCount > 0) { + found = false; + for (int i = 0; i < returnedAlgoCount; i++) { + if (bwd_filter_perf[i].status == CUDNN_STATUS_SUCCESS) { + bwd_filter_algo = bwd_filter_perf[i].algo; + bwd_filter_workspace_size = bwd_filter_perf[i].memory; + found = true; + break; + } + } + if (!found) { + // 如果没找到成功的算法,用默认的 + bwd_filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + cudnnGetConvolutionBackwardFilterWorkspaceSize( + h, input_desc, grad_output_desc, conv_desc, grad_weight_desc, + bwd_filter_algo, &bwd_filter_workspace_size); + } + } else { + bwd_filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + cudnnGetConvolutionBackwardFilterWorkspaceSize( + h, input_desc, grad_output_desc, conv_desc, grad_weight_desc, + bwd_filter_algo, &bwd_filter_workspace_size); + } + return INFINI_STATUS_SUCCESS; + }); + workspace_size = std::max(bwd_data_workspace_size, bwd_filter_workspace_size); + + conv_ndim = info.ndim; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_CUDNN_API + , + input_desc(other.input_desc), grad_output_desc(other.grad_output_desc), + weight_desc(other.weight_desc), grad_input_desc(other.grad_input_desc), + grad_weight_desc(other.grad_weight_desc), + grad_bias_desc(other.grad_bias_desc), conv_desc(other.conv_desc), + bwd_data_algo(other.bwd_data_algo), + bwd_filter_algo(other.bwd_filter_algo), + bwd_data_workspace_size(other.bwd_data_workspace_size), + bwd_filter_workspace_size(other.bwd_filter_workspace_size), + conv_ndim(other.conv_ndim) +#endif + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.grad_output_desc = nullptr; + other.weight_desc = nullptr; + other.grad_input_desc = nullptr; + other.grad_weight_desc = nullptr; + other.grad_bias_desc = nullptr; + other.conv_desc = nullptr; + other.bwd_data_algo = static_cast(0); + other.bwd_filter_algo = static_cast(0); + other.bwd_data_workspace_size = 0; + other.bwd_filter_workspace_size = 0; + other.conv_ndim = 0; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + const op::conv_backward::ConvBackwardInfo &info, + infiniDtype_t data_type, infiniopTensorDescriptor_t bias_desc) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type, bias_desc); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + void *pads, void *strides, void *dilations, + size_t groups) { +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + 
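// --- Illustrative usage sketch, not part of the patch -------------------------
// Seen from the public C API ("infiniop/ops/conv_backward.h", as included by
// operator.cc below), a caller drives this backend by creating the descriptor,
// querying the workspace size, launching, and destroying. The helper below is
// only a sketch: the handle, tensor descriptors, device buffers, workspace and
// stream are assumed to exist already, and the element type of
// pads/strides/dilations is assumed to be size_t (the API takes them as void *).
#include <cstddef>

inline infiniStatus_t conv_backward_example(
    infiniopHandle_t handle,
    infiniopTensorDescriptor_t grad_output_desc,
    infiniopTensorDescriptor_t input_desc,
    infiniopTensorDescriptor_t weight_desc,
    infiniopTensorDescriptor_t bias_desc,
    void *grad_input, void *grad_weight, void *grad_bias,
    const void *grad_output, const void *input, const void *weight,
    void *workspace, size_t workspace_capacity, void *stream) {
    size_t pads[2] = {1, 1}, strides[2] = {1, 1}, dilations[2] = {1, 1};

    infiniopConvBackwardDescriptor_t desc = nullptr;
    infiniStatus_t status = infiniopCreateConvBackwardDescriptor(
        handle, &desc, grad_output_desc, input_desc, weight_desc, bias_desc,
        pads, strides, dilations, /*groups=*/1);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    size_t workspace_size = 0;
    status = infiniopGetConvBackwardWorkspaceSize(desc, &workspace_size);
    if (status == INFINI_STATUS_SUCCESS) {
        if (workspace_size > workspace_capacity) {
            status = INFINI_STATUS_INSUFFICIENT_WORKSPACE;
        } else {
            status = infiniopConvBackward(desc, workspace, workspace_size,
                                          grad_input, grad_weight, grad_bias,
                                          grad_output, input, weight, stream);
        }
    }

    infiniopDestroyConvBackwardDescriptor(desc);
    return status;
}
// -------------------------------------------------------------------------------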
auto info_result = op::conv_backward::ConvBackwardInfo::create( + grad_output_desc, input_desc, weight_desc, pads, strides, dilations, + groups); + CHECK_RESULT(info_result); + auto info = info_result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype, bias_desc); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, opaque->workspace_size, opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, void *grad_weight, + void *grad_bias, const void *grad_output, + const void *input, const void *weight, + void *stream) const { +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + auto internal = _opaque->internal; + + return internal->useCudnn((cudaStream_t)stream, [&](cudnnHandle_t h) { + if (!grad_input || !grad_weight || !grad_output || !input || !weight) { + printf("Error: Null pointer in calculate function\n"); + return INFINI_STATUS_BAD_PARAM; + } + + CHECK_CUDNN(cudnnConvolutionBackwardData( + h, &alpha, _opaque->weight_desc, weight, _opaque->grad_output_desc, + grad_output, _opaque->conv_desc, _opaque->bwd_data_algo, workspace, + _opaque->bwd_data_workspace_size, &beta, _opaque->grad_input_desc, + grad_input)); + + CHECK_CUDNN(cudnnConvolutionBackwardFilter( + h, &alpha, _opaque->input_desc, input, _opaque->grad_output_desc, + grad_output, _opaque->conv_desc, _opaque->bwd_filter_algo, workspace, + _opaque->bwd_filter_workspace_size, &beta, _opaque->grad_weight_desc, + grad_weight)); + + // grad_bias = conv_bwd_bias(grad_output) + if (_opaque->grad_bias_desc && grad_bias) { + cudnnDataType_t grad_output_type; + int grad_output_nbDims; + int grad_output_dims[5], grad_output_strides[5]; + + int query_ndim = (_opaque->conv_ndim == 3) ? 
5 : 4; + + CHECK_CUDNN(cudnnGetTensorNdDescriptor( + _opaque->grad_output_desc, query_ndim, &grad_output_type, + &grad_output_nbDims, grad_output_dims, grad_output_strides)); + if (grad_output_type == CUDNN_DATA_BFLOAT16) { + CHECK_STATUS(launch_bias_grad_kernel( + grad_output, grad_bias, grad_output_dims, _opaque->conv_ndim, + grad_output_type, (cudaStream_t)stream)); + } else { + CHECK_CUDNN(cudnnConvolutionBackwardBias( + h, &alpha, _opaque->grad_output_desc, grad_output, &beta, + _opaque->grad_bias_desc, grad_bias)); + } + } + return INFINI_STATUS_SUCCESS; + }); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::conv_backward::nvidia diff --git a/src/infiniop/ops/conv_backward/nvidia/conv_backward_nvidia.cuh b/src/infiniop/ops/conv_backward/nvidia/conv_backward_nvidia.cuh new file mode 100644 index 000000000..363e979e1 --- /dev/null +++ b/src/infiniop/ops/conv_backward/nvidia/conv_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CONV_BACKWARD_NVIDIA_CUH__ +#define __CONV_BACKWARD_NVIDIA_CUH__ + +#include "../conv_backward.h" + +DESCRIPTOR(nvidia) + +#endif // __CONV_BACKWARD_NVIDIA_CUH__ diff --git a/src/infiniop/ops/conv_backward/operator.cc b/src/infiniop/ops/conv_backward/operator.cc new file mode 100644 index 000000000..f02e31cb3 --- /dev/null +++ b/src/infiniop/ops/conv_backward/operator.cc @@ -0,0 +1,135 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/conv_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/conv_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/conv_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/conv_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateConvBackwardDescriptor( + infiniopHandle_t handle, + infiniopConvBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + void *pads, + void *strides, + void *dilations, + size_t groups) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::conv_backward::NAMESPACE::Descriptor::create( \ + handle, reinterpret_cast(desc_ptr), \ + grad_output_desc, input_desc, weight_desc, bias_desc, pads, strides, dilations, groups) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetConvBackwardWorkspaceSize( + infiniopConvBackwardDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopConvBackward( + infiniopConvBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + void *grad_weight, + void 
*grad_bias, + const void *grad_output, + const void *input, + const void *weight, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, grad_weight, grad_bias, grad_output, input, weight, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyConvBackwardDescriptor(infiniopConvBackwardDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} diff --git a/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc new file mode 100644 index 000000000..af97c1d09 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc @@ -0,0 +1,321 @@ +#include "cross_entropy_loss_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::cross_entropy_loss::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + std::vector logits_shape; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const std::vector &shape) + : handle(handle_ptr), logits_shape(shape) { + // 计算workspace大小:需要存储per-sample loss + size_t N = logits_shape[0]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + workspace_size = N * inner_size * sizeof(float); + } + + void cross_entropy_f16_as_float(float *workspace, float *loss_result, + const fp16_t *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + // 转换F16 logits为float + size_t total_logits_size = N * C * inner_size; + std::vector float_logits(total_logits_size); + for (size_t i = 0; i < total_logits_size; ++i) { + float_logits[i] = utils::cast(logits[i]); + } + + // 使用float精度计算 + cross_entropy_cpu_float(workspace, loss_result, float_logits.data(), target); + } + + // 通用的float版本交叉熵计算 + void cross_entropy_cpu_float(float *workspace, float *loss_result, + const float *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + const int64_t ignore_index = -100; + float *per_sample_loss = workspace; + + // 计算每个样本的损失 + for (size_t n = 0; n < N; ++n) { + for (size_t inner = 0; inner < inner_size; ++inner) { + size_t sample_idx = n * inner_size + inner; + int64_t t = 
target[sample_idx]; + + // 检查ignore_index或无效target + if (t == ignore_index || t < 0 || t >= static_cast(C)) { + per_sample_loss[sample_idx] = 0.0f; + continue; + } + + // 计算这个位置的logits基址 + size_t base_offset = n * C * inner_size + inner; + + // 数值稳定的softmax计算:先找最大值 + float max_logit = -std::numeric_limits::infinity(); + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + max_logit = std::max(max_logit, logits[logit_idx]); + } + + // 计算exp的和(减去最大值保证数值稳定) + float sum_exp = 0.0f; + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + sum_exp += std::exp(logits[logit_idx] - max_logit); + } + + // 计算目标类别的logit + size_t target_logit_idx = base_offset + static_cast(t) * inner_size; + float target_logit = logits[target_logit_idx]; + + // 计算交叉熵损失:log_softmax[target] = logit[target] - log(sum_exp) - max_logit + // 所以 -log_softmax[target] = log(sum_exp) + max_logit - logit[target] + per_sample_loss[sample_idx] = std::log(sum_exp) + max_logit - target_logit; + } + } + + // 计算平均损失(忽略ignore_index的样本) + double total_loss = 0.0; + size_t valid_count = 0; + size_t total_samples = N * inner_size; + + for (size_t i = 0; i < total_samples; ++i) { + if (target[i] != ignore_index && target[i] >= 0 && target[i] < static_cast(C)) { + total_loss += static_cast(per_sample_loss[i]); + valid_count++; + } + } + + *loss_result = valid_count > 0 ? static_cast(total_loss / valid_count) : 0.0f; + } + + // 通用模板版本(用于F32和BF16) + template + void cross_entropy_cpu_generic(float *workspace, T *loss_result, + const T *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + const int64_t ignore_index = -100; + float *per_sample_loss = workspace; + + // 计算每个样本的损失 + for (size_t n = 0; n < N; ++n) { + for (size_t inner = 0; inner < inner_size; ++inner) { + size_t sample_idx = n * inner_size + inner; + int64_t t = target[sample_idx]; + + // 检查ignore_index或无效target + if (t == ignore_index || t < 0 || t >= static_cast(C)) { + per_sample_loss[sample_idx] = 0.0f; + continue; + } + + // 计算这个位置的logits基址 + size_t base_offset = n * C * inner_size + inner; + + // 数值稳定的softmax计算:先找最大值 + float max_logit = -std::numeric_limits::infinity(); + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + float logit_val; + if constexpr (std::is_same::value) { + logit_val = utils::cast(logits[logit_idx]); + } else { + logit_val = logits[logit_idx]; + } + max_logit = std::max(max_logit, logit_val); + } + + // 计算exp的和 + float sum_exp = 0.0f; + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + float logit_val; + if constexpr (std::is_same::value) { + logit_val = utils::cast(logits[logit_idx]); + } else { + logit_val = logits[logit_idx]; + } + sum_exp += std::exp(logit_val - max_logit); + } + + // 计算目标类别的logit + size_t target_logit_idx = base_offset + static_cast(t) * inner_size; + float target_logit; + if constexpr (std::is_same::value) { + target_logit = utils::cast(logits[target_logit_idx]); + } else { + target_logit = logits[target_logit_idx]; + } + + // 计算交叉熵损失 + per_sample_loss[sample_idx] = std::log(sum_exp) + max_logit - target_logit; + } + } + + // 计算平均损失 + double total_loss = 0.0; + size_t valid_count = 0; + size_t total_samples = N * inner_size; + + for (size_t i = 0; i < total_samples; ++i) { + if (target[i] != ignore_index && target[i] >= 0 && target[i] < 
static_cast(C)) { + total_loss += static_cast(per_sample_loss[i]); + valid_count++; + } + } + + float mean_loss = valid_count > 0 ? static_cast(total_loss / valid_count) : 0.0f; + + // 转换回输出类型 + if constexpr (std::is_same::value) { + *loss_result = utils::cast(mean_loss); + } else { + *loss_result = static_cast(mean_loss); + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + logits_shape(std::move(other.logits_shape)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, const std::vector &shape) { + Opaque opaque(handle_ptr, shape); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, const void *target, + infiniDtype_t dtype) const { + if (!workspace || !loss || !logits || !target) { + return INFINI_STATUS_BAD_PARAM; + } + + if (workspace_size < this->workspace_size) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + float *workspace_ptr = static_cast(workspace); + const int64_t *target_ptr = static_cast(target); + + switch (dtype) { + case INFINI_DTYPE_F32: { + const float *logits_ptr = static_cast(logits); + float *loss_ptr = static_cast(loss); + cross_entropy_cpu_generic(workspace_ptr, loss_ptr, logits_ptr, target_ptr); + break; + } + + case INFINI_DTYPE_F16: { + const fp16_t *logits_ptr = static_cast(logits); + fp16_t *loss_ptr = static_cast(loss); + + // F16特殊处理:使用float计算 + float temp_loss; + cross_entropy_f16_as_float(workspace_ptr, &temp_loss, logits_ptr, target_ptr); + *loss_ptr = utils::cast(temp_loss); + break; + } + + case INFINI_DTYPE_BF16: { + const bf16_t *logits_ptr = static_cast(logits); + bf16_t *loss_ptr = static_cast(loss); + cross_entropy_cpu_generic(workspace_ptr, loss_ptr, logits_ptr, target_ptr); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } + + size_t get_workspace_size() const { + return workspace_size; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + const auto &orig_shape = logits_desc->shape(); + std::vector logits_shape; + + if (orig_shape.size() == 1) { + logits_shape = {1, orig_shape[0]}; + } else { + logits_shape = orig_shape; + } + + if (logits_shape.size() < 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + auto opaque_result = Opaque::create(handle, logits_shape); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, opaque->get_workspace_size(), opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, + const void *target, void *stream) const { + return _opaque->calculate(workspace, workspace_size, loss, logits, target, _dtype); +} + +} // namespace op::cross_entropy_loss::cpu diff --git a/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h 
b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h new file mode 100644 index 000000000..8afec63d0 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_CPU_H__ +#define __CROSS_ENTROPY_LOSS_CPU_H__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(cpu) + +#endif // __CROSS_ENTROPY_LOSS_CPU_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h b/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h new file mode 100644 index 000000000..dad108d78 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h @@ -0,0 +1,48 @@ +#ifndef __CROSS_ENTROPY_LOSS_H__ +#define __CROSS_ENTROPY_LOSS_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::cross_entropy_loss::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t loss_desc, \ + infiniopTensorDescriptor_t logits_desc, \ + infiniopTensorDescriptor_t target_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *loss, \ + const void *logits, \ + const void *target, \ + void *stream) const; \ + }; \ + } + +#endif // __CROSS_ENTROPY_LOSS_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/cuda/kernel.cuh b/src/infiniop/ops/cross_entropy_loss/cuda/kernel.cuh new file mode 100644 index 000000000..5279011ef --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cuda/kernel.cuh @@ -0,0 +1,67 @@ +#ifndef __CROSS_ENTROPY_KERNEL_CUH__ +#define __CROSS_ENTROPY_KERNEL_CUH__ + +#include +#include +#include + +__device__ __forceinline__ float to_float(float val) { return val; } + +__device__ __forceinline__ float to_float(half val) { + return __half2float(val); +} + +__device__ __forceinline__ float to_float(__hpcc_bfloat16 val) { + return __bfloat162float(val); +} + +template +__global__ void cross_entropy_loss_kernel(T_out *loss, const T_in *logits, + const int64_t *target, int N, int C, + long long inner_size, + int64_t ignore_index) { + + long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; + long long total = (long long)N * inner_size; + if (idx >= total) { + return; + } + + int n = (int)(idx / inner_size); + int inner = (int)(idx % inner_size); + + int64_t t = target[idx]; + + if (t == ignore_index) { + loss[idx] = (T_out)0.0f; + return; + } + if (t < 0 || t >= C) { + loss[idx] = (T_out)0.0f; + return; + } + + const long long base_offset = ((long long)n * C * inner_size) + inner; + + // 1. 找到 logits 中的最大值 + float max_val = -HUGE_VALF; // 使用浮点数的最大负值 + for (int c = 0; c < C; ++c) { + long long offset = base_offset + (long long)c * inner_size; + max_val = fmaxf(max_val, to_float(logits[offset])); + } + + // 2. 计算 sum(exp(x - max_val)) + float sum_exp = 0.0f; + for (int c = 0; c < C; ++c) { + long long offset = base_offset + (long long)c * inner_size; + sum_exp += expf(to_float(logits[offset]) - max_val); + } + + // 3. 
计算最终 loss + long long target_offset = base_offset + (long long)t * inner_size; + float logit_tgt = to_float(logits[target_offset]); + + loss[idx] = (T_out)(logf(sum_exp) + max_val - logit_tgt); +} + +#endif // __CROSS_ENTROPY_KERNEL_CUH__ diff --git a/src/infiniop/ops/cross_entropy_loss/info.h b/src/infiniop/ops/cross_entropy_loss/info.h new file mode 100644 index 000000000..5278bf912 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/info.h @@ -0,0 +1,36 @@ +#ifndef __CROSS_ENTROPY_LOSS_INFO_H__ +#define __CROSS_ENTROPY_LOSS_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::cross_entropy_loss { + +class CrossEntropyInfo { +public: + CrossEntropyInfo() = default; + size_t batch = 0; + size_t num_classes = 0; + infiniDtype_t dtype; + + static utils::Result create( + infiniopTensorDescriptor_t loss, + infiniopTensorDescriptor_t logits, + infiniopTensorDescriptor_t target) { + + if (logits->ndim() != 2 || loss->ndim() != 1 || target->ndim() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + CrossEntropyInfo info; + info.batch = logits->dim(0); + info.num_classes = logits->dim(1); + info.dtype = logits->dtype(); + return utils::Result(std::move(info)); + } +}; + +} // namespace op::cross_entropy_loss + +#endif // __CROSS_ENTROPY_LOSS_INFO_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.h b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.h new file mode 100644 index 000000000..382d555e0 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_METAX_H__ +#define __CROSS_ENTROPY_METAX_H__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(metax) + +#endif // __CROSS_ENTROPY_METAX_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.maca b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.maca new file mode 100644 index 000000000..94f611e7a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.maca @@ -0,0 +1,145 @@ +#include +#include +#include +#include +#include +#include +#include +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "cross_entropy_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::cross_entropy_loss::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + std::vector logits_shape; + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + ~Opaque() = default; +}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { + + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + if (dtype != INFINI_DTYPE_F32 && dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto opaque = new Opaque(handle->internal()); + const auto &orig_shape = logits_desc->shape(); + + if (orig_shape.size() == 1) { + opaque->logits_shape = {1, orig_shape[0]}; + } else { + opaque->logits_shape = orig_shape; + } + + if (opaque->logits_shape.size() < 2) return INFINI_STATUS_BAD_TENSOR_SHAPE; + + const auto &s = opaque->logits_shape; + long long N = (long long)s[0]; + long long inner = 1; + for (size_t i = 2; i < s.size(); ++i) inner *= (long long)s[i]; + + size_t 
workspace_size = (size_t)(N * inner) * sizeof(float); + *desc_ptr = new Descriptor(dtype, workspace_size, opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, void *loss, + const void *logits, const void *target, void *stream_) const { + + const auto &shape = _opaque->logits_shape; + int N = (int)shape[0]; + int C = (int)shape[1]; + long long inner_size = 1; + for (size_t i = 2; i < shape.size(); ++i) + inner_size *= shape[i]; + + long long total = (long long)N * inner_size; + + size_t need_ws = (size_t)total * sizeof(float); + if (workspace_size < need_ws) return INFINI_STATUS_INTERNAL_ERROR; + float* per_sample_loss = reinterpret_cast(workspace); + + const int64_t *typed_target = reinterpret_cast(target); + const int64_t ignore_index = -100; + hcStream_t stream = (hcStream_t)stream_; + + dim3 blockSize(256); + dim3 gridSize((total + blockSize.x - 1) / blockSize.x); + + if (_dtype == INFINI_DTYPE_F32) { + cross_entropy_loss_kernel + <<>>( + per_sample_loss, (const float*)logits, typed_target, + N, C, inner_size, ignore_index); + } else if (_dtype == INFINI_DTYPE_F16) { + cross_entropy_loss_kernel + <<>>( + per_sample_loss, (const half*)logits, typed_target, + N, C, inner_size, ignore_index); + } else if (_dtype == INFINI_DTYPE_BF16) { + cross_entropy_loss_kernel<__hpcc_bfloat16, float> + <<>>( + per_sample_loss, (const __hpcc_bfloat16*)logits, typed_target, + N, C, inner_size, ignore_index); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + if (hcGetLastError() != hcSuccess) return INFINI_STATUS_INTERNAL_ERROR; + + std::vector h_loss((size_t)total); + std::vector h_target((size_t)total); + if (hcMemcpyAsync(h_loss.data(), per_sample_loss, need_ws, hcMemcpyDeviceToHost, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + if (hcMemcpyAsync(h_target.data(), typed_target, (size_t)total * sizeof(int64_t), hcMemcpyDeviceToHost, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + if (hcStreamSynchronize(stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + + double acc = 0.0; + long long count = 0; + for (long long i = 0; i < total; ++i) { + if (h_target[i] != ignore_index) { + acc += (double)h_loss[i]; + count++; + } + } + double mean = (count > 0) ? 
(acc / (double)count) : 0.0; + + if (_dtype == INFINI_DTYPE_F32) { + float v = (float)mean; + if (hcMemcpyAsync(loss, &v, sizeof(float), hcMemcpyHostToDevice, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + } else if (_dtype == INFINI_DTYPE_F16) { + half v = __float2half((float)mean); + if (hcMemcpyAsync(loss, &v, sizeof(half), hcMemcpyHostToDevice, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + } else if (_dtype == INFINI_DTYPE_BF16) { + __hpcc_bfloat16 v = __float2bfloat16_rn((float)mean); + if (hcMemcpyAsync(loss, &v, sizeof(__hpcc_bfloat16), hcMemcpyHostToDevice, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + } + if (hcStreamSynchronize(stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::cross_entropy_loss::metax diff --git a/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu new file mode 100644 index 000000000..3d795a67a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "cross_entropy_loss_nvidia.cuh" + +namespace op::cross_entropy_loss::nvidia { +namespace cuda { + +__device__ __forceinline__ float to_float(float v) { return v; } +__device__ __forceinline__ float to_float(double v) { return (float)v; } +__device__ __forceinline__ float to_float(half v) { return __half2float(v); } +__device__ __forceinline__ float to_float(__nv_bfloat16 v) { + return __bfloat162float(v); +} + +template +__global__ void +softmaxCrossEntropy_per_sample(T_out *__restrict__ loss, + const T_in *__restrict__ logits, + const int64_t *__restrict__ target, int N, int C, + long long inner_size, int64_t ignore_index) { + long long total = (long long)N * inner_size; + long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total) { + return; + } + + int n = (int)(idx / inner_size); + int inr = (int)(idx % inner_size); + + int64_t t = target[(long long)n * inner_size + inr]; + if (ignore_index != LLONG_MIN && t == ignore_index) { + loss[idx] = (T_out)0; + return; + } + if (t < 0 || t >= C) { + loss[idx] = (T_out)0; + return; + } + + const long long base = ((long long)n * C * inner_size) + inr; + + // 数值稳定 LSE:lse = log(sum exp(x - m)) + m + float m = -CUDART_INF_F; + for (int c = 0; c < C; ++c) { + m = fmaxf(m, to_float(logits[base + (long long)c * inner_size])); + } + + float sum_exp = 0.f; + for (int c = 0; c < C; ++c) { + sum_exp += expf(to_float(logits[base + (long long)c * inner_size]) - m); + } + + float lse = logf(sum_exp) + m; + float logit_t = to_float(logits[base + (long long)(int)t * inner_size]); + loss[idx] = (T_out)(lse - logit_t); +} + +} // namespace cuda + +struct Descriptor::Opaque { + std::shared_ptr internal; + std::vector logits_shape; + Opaque(std::shared_ptr p) : internal(p) {} + ~Opaque() = default; +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) + auto handle = reinterpret_cast(handle_); + auto 
dtype = logits_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + const auto &orig = logits_desc->shape(); + auto opaque = new Opaque(handle->internal()); + + if (orig.size() == 1) { + opaque->logits_shape = {1, orig[0]}; + } else { + opaque->logits_shape = orig; + } + + const auto &s = opaque->logits_shape; + long long N = (long long)s[0]; + long long inner = 1; + for (size_t i = 2; i < s.size(); ++i) { + inner *= (long long)s[i]; + } + + size_t workspace_size = (size_t)(N * inner) * sizeof(float); + *desc_ptr = new Descriptor(dtype, workspace_size, opaque, handle->device, + handle->device_id); + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, + const void *target, void *stream) const { +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) + const auto &s = _opaque->logits_shape; + int N = (int)s[0]; + int C = (int)s[1]; + long long inner = 1; + for (size_t i = 2; i < s.size(); ++i) { + inner *= (long long)s[i]; + } + long long total = (long long)N * inner; + + size_t need_ws = (size_t)total * sizeof(float); + if (workspace_size < need_ws) { + return INFINI_STATUS_INTERNAL_ERROR; + } + float *per_sample = reinterpret_cast(workspace); + + const int64_t *tgt_i64 = reinterpret_cast(target); + const int64_t ignore_index = -100; + + // 1) 写 per-sample loss -> workspace(float) + dim3 block(256); + dim3 grid((total + block.x - 1) / block.x); + cudaStream_t st = (cudaStream_t)stream; + + if (_dtype == INFINI_DTYPE_F32) { + cuda::softmaxCrossEntropy_per_sample<<>>( + per_sample, (const float *)logits, tgt_i64, N, C, inner, ignore_index); + } else if (_dtype == INFINI_DTYPE_F16) { + cuda::softmaxCrossEntropy_per_sample<<>>( + per_sample, (const half *)logits, tgt_i64, N, C, inner, ignore_index); + } else if (_dtype == INFINI_DTYPE_BF16) { + cuda::softmaxCrossEntropy_per_sample<__nv_bfloat16, float> + <<>>(per_sample, (const __nv_bfloat16 *)logits, + tgt_i64, N, C, inner, ignore_index); + } + { + auto err = cudaGetLastError(); + if (err != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } + + // 2) host 侧 mean(仅统计 target != ignore_index) + std::vector h_loss((size_t)total); + std::vector h_tgt((size_t)total); + if (cudaMemcpyAsync(h_loss.data(), per_sample, need_ws, + cudaMemcpyDeviceToHost, st) + != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + if (cudaMemcpyAsync(h_tgt.data(), tgt_i64, (size_t)total * sizeof(int64_t), + cudaMemcpyDeviceToHost, st) + != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + if (cudaStreamSynchronize(st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + double acc = 0.0; + long long cnt = 0; + for (long long i = 0; i < total; ++i) { + if (h_tgt[i] != ignore_index) { + acc += (double)h_loss[i]; + ++cnt; + } + } + double mean = (cnt > 0) ? 
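The per-sample term produced by the kernel is the numerically stable log-sum-exp form, loss = logsumexp(logits) - logits[target], with the row maximum subtracted before exponentiation. A host-side reference sketch of that computation (ce_per_sample is illustrative; it assumes the pointer is already offset to the base of the class axis for one (n, inner) position, with classes inner_size apart and C >= 1):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Reference for one output position: loss = logsumexp(logits) - logits[t],
    // using max-subtraction so exp never overflows for large logits.
    inline float ce_per_sample(const float *logits, int C, long long inner_size,
                               int64_t t) {
        float m = logits[0];
        for (int c = 1; c < C; ++c) {
            m = std::max(m, logits[(long long)c * inner_size]);
        }
        float sum_exp = 0.f;
        for (int c = 0; c < C; ++c) {
            sum_exp += std::exp(logits[(long long)c * inner_size] - m);
        }
        float lse = std::log(sum_exp) + m;
        return lse - logits[t * inner_size];
    }
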
(acc / (double)cnt) : 0.0; + + // 3) 把标量 mean 写回 device 的 loss 指针(按输入 dtype 写 1 个元素) + if (_dtype == INFINI_DTYPE_F32) { + float v = (float)mean; + if (cudaMemcpyAsync(loss, &v, sizeof(float), cudaMemcpyHostToDevice, st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } else if (_dtype == INFINI_DTYPE_F16) { + half v = __float2half((float)mean); + if (cudaMemcpyAsync(loss, &v, sizeof(half), cudaMemcpyHostToDevice, st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } else if (_dtype == INFINI_DTYPE_BF16) { + __nv_bfloat16 v = __float2bfloat16((float)mean); + if (cudaMemcpyAsync(loss, &v, sizeof(__nv_bfloat16), cudaMemcpyHostToDevice, + st) + != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } + if (cudaStreamSynchronize(st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} +} // namespace op::cross_entropy_loss::nvidia diff --git a/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh new file mode 100644 index 000000000..843fc943d --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_CUDA_CUH__ +#define __CROSS_ENTROPY_LOSS_CUDA_CUH__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(nvidia) + +#endif // __CROSS_ENTROPY_LOSS_CUDA_CUH__ diff --git a/src/infiniop/ops/cross_entropy_loss/operator.cc b/src/infiniop/ops/cross_entropy_loss/operator.cc new file mode 100644 index 000000000..e9a47558f --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/operator.cc @@ -0,0 +1,143 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cross_entropy_loss.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cross_entropy_loss_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cross_entropy_loss_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cross_entropy_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor( + infiniopHandle_t handle, + infiniopCrossEntropyLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t loss_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cross_entropy_loss::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast< \ + op::cross_entropy_loss::NAMESPACE::Descriptor **>(desc_ptr), \ + loss_desc, logits_desc, target_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize( + infiniopCrossEntropyLossDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast< \ + const op::cross_entropy_loss::NAMESPACE::Descriptor *>(desc) \ + ->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + 
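Each backend entry in these switches is a single macro expansion; for instance, GET(INFINI_DEVICE_NVIDIA, nvidia) a few lines up expands to the equivalent of:

    case INFINI_DEVICE_NVIDIA:
        *size = reinterpret_cast<const op::cross_entropy_loss::nvidia::Descriptor *>(desc)
                    ->workspaceSize();
        return INFINI_STATUS_SUCCESS;
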
GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopCrossEntropyLoss( + infiniopCrossEntropyLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *loss, + const void *logits, + const void *target, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast< \ + const op::cross_entropy_loss::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, loss, logits, target, \ + stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor( + infiniopCrossEntropyLossDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast< \ + const op::cross_entropy_loss::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc new file mode 100644 index 000000000..508dcecc6 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc @@ -0,0 +1,284 @@ +#include "interpolate_nearest_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include + +namespace op::interpolate_nearest::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + InterpolateNearestInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const InterpolateNearestInfo &interpolate_info) + : handle(handle_ptr), info(interpolate_info) { + workspace_size = 0; + } + + template + size_t compute_input_index_1d(size_t idx) const { + size_t temp = idx; + + // 1D插值:3D张量 (N, C, W) + size_t w = temp % info.output_size[0]; + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale)), + info.input_size[0] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_w * info.input_stride[2]; + } + + // 计算2D插值的输入索引 + template + size_t compute_input_index_2d(size_t idx) const { + size_t temp = idx; + + // 2D插值:4D张量 (N, C, H, W) + size_t w = temp % info.output_size[1]; // width在索引1 + temp /= info.output_size[1]; + size_t h = temp % info.output_size[0]; // height在索引0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_h = 
static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_w = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + + size_t input_h = std::min(static_cast(std::floor(static_cast(h) * inv_scale_h)), + info.input_size[0] - 1); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale_w)), + info.input_size[1] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_h * info.input_stride[2] + input_w * info.input_stride[3]; + } + + // 计算3D插值的输入索引 + template + size_t compute_input_index_3d(size_t idx) const { + size_t temp = idx; + + // 3D插值:5D张量 (N, C, D, H, W) + size_t w = temp % info.output_size[2]; // width在索引2 + temp /= info.output_size[2]; + size_t h = temp % info.output_size[1]; // height在索引1 + temp /= info.output_size[1]; + size_t d = temp % info.output_size[0]; // depth在索引0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_d = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_h = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + float inv_scale_w = static_cast(info.input_size[2]) / static_cast(info.output_size[2]); + + size_t input_d = std::min(static_cast(std::floor(static_cast(d) * inv_scale_d)), + info.input_size[0] - 1); + size_t input_h = std::min(static_cast(std::floor(static_cast(h) * inv_scale_h)), + info.input_size[1] - 1); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale_w)), + info.input_size[2] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_d * info.input_stride[2] + input_h * info.input_stride[3] + input_w * info.input_stride[4]; + } + + // 计算输出索引 + template + size_t compute_output_index(size_t idx) const { + size_t temp = idx; + size_t w, h, d, c, b; + + switch (info.dim) { + case INTERPOLATE_1D: { + // 3D张量 (N, C, W) + w = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + w * info.output_stride[2]; + } + + case INTERPOLATE_2D: { + // 4D张量 (N, C, H, W) + w = temp % info.output_size[1]; + temp /= info.output_size[1]; + h = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + h * info.output_stride[2] + w * info.output_stride[3]; + } + + case INTERPOLATE_3D: { + // 5D张量 (N, C, D, H, W) + w = temp % info.output_size[2]; + temp /= info.output_size[2]; + h = temp % info.output_size[1]; + temp /= info.output_size[1]; + d = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + d * info.output_stride[2] + h * info.output_stride[3] + w * info.output_stride[4]; + } + + default: + return 0; + } + } + + // 计算总元素数 + size_t calculate_total_elements() const { + size_t total = info.batch_size * info.channels; + switch (info.dim) { + case INTERPOLATE_1D: + total *= info.output_size[0]; // width + break; + case INTERPOLATE_2D: + total *= info.output_size[0] * info.output_size[1]; // height * width + break; + case INTERPOLATE_3D: + total *= info.output_size[0] * info.output_size[1] * info.output_size[2]; // depth * height * width + break; + } + return total; + } + + // 主要的插值计算函数 + template + void interpolate_nearest_cpu(T *output, const T *input) 
const { + size_t total_elements = calculate_total_elements(); + +#pragma omp parallel for schedule(static) + for (ptrdiff_t idx = 0; idx < static_cast(total_elements); ++idx) { + size_t input_idx; + + switch (info.dim) { + case INTERPOLATE_1D: + input_idx = compute_input_index_1d(idx); + break; + case INTERPOLATE_2D: + input_idx = compute_input_index_2d(idx); + break; + case INTERPOLATE_3D: + input_idx = compute_input_index_3d(idx); + break; + default: + continue; + } + + size_t output_idx = compute_output_index(idx); + output[output_idx] = input[input_idx]; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + const InterpolateNearestInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16 && data_type != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_F16: { + fp16_t *typed_output = static_cast(output); + const fp16_t *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_BF16: { + bf16_t *typed_output = static_cast(output); + const bf16_t *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_I8: { + int8_t *typed_output = static_cast(output); + const int8_t *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + // 检查数据类型支持 + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_I8); + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, info, opaque->workspace_size, opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::interpolate_nearest::cpu diff --git a/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h 
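The nearest-neighbor mapping used by compute_input_index_1d/2d/3d above is output index -> floor(out_idx * in_size / out_size), clamped to in_size - 1, i.e. the same asymmetric mapping PyTorch uses for mode="nearest". A small worked example (nearest_src is illustrative, not part of the patch), upsampling a width-4 row to width 9:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    // Map one output coordinate to its nearest-neighbor source coordinate.
    inline size_t nearest_src(size_t out_idx, size_t in_size, size_t out_size) {
        float inv_scale = static_cast<float>(in_size) / static_cast<float>(out_size);
        size_t src = static_cast<size_t>(std::floor(out_idx * inv_scale));
        return std::min(src, in_size - 1);
    }

    int main() {
        // in_size = 4, out_size = 9 -> sources: 0 0 0 1 1 2 2 3 3
        for (size_t w = 0; w < 9; ++w) {
            std::printf("%zu ", nearest_src(w, 4, 9));
        }
        std::printf("\n");
    }
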
new file mode 100644 index 000000000..78dd3ff97 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h @@ -0,0 +1,8 @@ +#ifndef __INTERPOLATE_NEAREST_CPU_H__ +#define __INTERPOLATE_NEAREST_CPU_H__ + +#include "../interpolate_nearest.h" + +DESCRIPTOR(cpu) + +#endif // __INTERPOLATE_NEAREST_CPU_H__ diff --git a/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh b/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh new file mode 100644 index 000000000..60c798792 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh @@ -0,0 +1,168 @@ +#ifndef INTERPOLATE_NEAREST_KERNEL_CUH +#define INTERPOLATE_NEAREST_KERNEL_CUH + +#include "../info.h" +#include + +template +__device__ inline size_t +compute_input_index_1d(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + + // 1D 插值:3D 张量 (N, C, W) + size_t w = temp % info.output_size[0]; // width 在索引 0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + size_t input_w = min(static_cast(floorf(static_cast(w) * inv_scale)), + info.input_size[0] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_w * info.input_stride[2]; +} + +template +__device__ inline size_t +compute_input_index_2d(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + + // 2D 插值:4D 张量 (N, C, H, W) + size_t w = temp % info.output_size[1]; // width 在索引 1 + temp /= info.output_size[1]; + size_t h = temp % info.output_size[0]; // height 在索引 0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_h = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_w = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + + size_t input_h = min(static_cast(floorf(static_cast(h) * inv_scale_h)), + info.input_size[0] - 1); + size_t input_w = min(static_cast(floorf(static_cast(w) * inv_scale_w)), + info.input_size[1] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_h * info.input_stride[2] + input_w * info.input_stride[3]; +} + +template +__device__ inline size_t +compute_input_index_3d(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + + // 3D 插值:5D 张量 (N, C, D, H, W) + size_t w = temp % info.output_size[2]; // width 在索引 2 + temp /= info.output_size[2]; + size_t h = temp % info.output_size[1]; // height 在索引 1 + temp /= info.output_size[1]; + size_t d = temp % info.output_size[0]; // depth 在索引 0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_d = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_h = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + float inv_scale_w = static_cast(info.input_size[2]) / static_cast(info.output_size[2]); + + size_t input_d = min(static_cast(floorf(static_cast(d) * inv_scale_d)), + info.input_size[0] - 1); + size_t input_h = min(static_cast(floorf(static_cast(h) * inv_scale_h)), + info.input_size[1] - 1); + size_t input_w = min(static_cast(floorf(static_cast(w) * inv_scale_w)), + info.input_size[2] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_d * info.input_stride[2] + input_h * info.input_stride[3] + input_w * info.input_stride[4]; +} + +template +__device__ inline size_t +compute_output_index(size_t 
idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + size_t w, h, d, c, b; + + switch (info.dim) { + case INTERPOLATE_1D: { + // 3D 张量 (N, C, W) + w = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + w * info.output_stride[2]; + } + + case INTERPOLATE_2D: { + // 4D 张量 (N, C, H, W) + w = temp % info.output_size[1]; + temp /= info.output_size[1]; + h = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + h * info.output_stride[2] + w * info.output_stride[3]; + } + + case INTERPOLATE_3D: { + // 5D 张量 (N, C, D, H, W) + w = temp % info.output_size[2]; + temp /= info.output_size[2]; + h = temp % info.output_size[1]; + temp /= info.output_size[1]; + d = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + d * info.output_stride[2] + h * info.output_stride[3] + w * info.output_stride[4]; + } + + default: + return 0; + } +} + +__host__ __device__ inline size_t +calculate_total_elements(const InterpolateNearestInfo &info) { + size_t total = info.batch_size * info.channels; + switch (info.dim) { + case INTERPOLATE_1D: + total *= info.output_size[0]; // width + break; + case INTERPOLATE_2D: + total *= info.output_size[0] * info.output_size[1]; // height * width + break; + case INTERPOLATE_3D: + total *= info.output_size[0] * info.output_size[1] * info.output_size[2]; // depth * height * width + break; + } + return total; +} + +template +__global__ void interpolate_nearest_kernel(T *output, const T *input, + InterpolateNearestInfo info) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total_elements = calculate_total_elements(info); + + if (idx < total_elements) { + size_t input_idx; + + switch (info.dim) { + case INTERPOLATE_1D: + input_idx = compute_input_index_1d(idx, info); + break; + case INTERPOLATE_2D: + input_idx = compute_input_index_2d(idx, info); + break; + case INTERPOLATE_3D: + input_idx = compute_input_index_3d(idx, info); + break; + default: + return; + } + + size_t output_idx = compute_output_index(idx, info); + output[output_idx] = input[input_idx]; + } +} + +#endif // INTERPOLATE_NEAREST_KERNEL_CUH diff --git a/src/infiniop/ops/interpolate_nearest/info.h b/src/infiniop/ops/interpolate_nearest/info.h new file mode 100644 index 000000000..162d6eb02 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/info.h @@ -0,0 +1,118 @@ +#ifndef __INTERPOLATE_NEAREST_INFO_H__ +#define __INTERPOLATE_NEAREST_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +enum InterpolateDim { + INTERPOLATE_1D = 1, // 3D 张量 (N, C, W) + INTERPOLATE_2D = 2, // 4D 张量 (N, C, H, W) + INTERPOLATE_3D = 3 // 5D 张量 (N, C, D, H, W) +}; + +struct InterpolateNearestInfo { + size_t batch_size; + size_t channels; + + // 输入和输出的空间维度大小 + size_t input_size[3]; // [depth/height/width] 根据维度使用不同数量 + size_t output_size[3]; // [depth/height/width] 根据维度使用不同数量 + + InterpolateDim dim; // 插值维度:1D, 2D, 3D + infiniDtype_t dtype; + + // 张量步长(最多支持 5D 张量) + size_t input_stride[5]; + size_t output_stride[5]; + + static infiniStatus_t create( + InterpolateNearestInfo *info, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + // 检查数据类型 + if 
(input_desc->dtype() != output_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto input_shape = input_desc->shape(); + auto output_shape = output_desc->shape(); + auto input_stride = input_desc->strides(); + auto output_stride = output_desc->strides(); + + // 根据张量维度确定插值类型 + if (input_desc->ndim() == 3 && output_desc->ndim() == 3) { + // 1D 插值:3D 张量 (N, C, W) + info->dim = INTERPOLATE_1D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // width + info->output_size[0] = output_shape[2]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 3; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else if (input_desc->ndim() == 4 && output_desc->ndim() == 4) { + // 2D 插值:4D 张量 (N, C, H, W) + info->dim = INTERPOLATE_2D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // height + info->input_size[1] = input_shape[3]; // width + info->output_size[0] = output_shape[2]; // height + info->output_size[1] = output_shape[3]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 4; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else if (input_desc->ndim() == 5 && output_desc->ndim() == 5) { + // 3D 插值:5D 张量 (N, C, D, H, W) + info->dim = INTERPOLATE_3D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // depth + info->input_size[1] = input_shape[3]; // height + info->input_size[2] = input_shape[4]; // width + info->output_size[0] = output_shape[2]; // depth + info->output_size[1] = output_shape[3]; // height + info->output_size[2] = output_shape[4]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 5; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info->dtype = input_desc->dtype(); + return INFINI_STATUS_SUCCESS; + } +}; + +#endif // __INTERPOLATE_NEAREST_INFO_H__ diff --git a/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h b/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h new file mode 100644 index 000000000..73499c2ff --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h @@ -0,0 +1,51 @@ +#ifndef __INTERPOLATE_NEAREST_H__ +#define __INTERPOLATE_NEAREST_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::interpolate_nearest::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + InterpolateNearestInfo _info; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + InterpolateNearestInfo info, \ + size_t workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _dtype(dtype), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + 
static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __INTERPOLATE_NEAREST_H__ diff --git a/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.h b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.h new file mode 100644 index 000000000..1619dbf2f --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.h @@ -0,0 +1,8 @@ +#ifndef __INTERPOLATE_NEAREST_METAX_H__ +#define __INTERPOLATE_NEAREST_METAX_H__ + +#include "../interpolate_nearest.h" + +DESCRIPTOR(metax) + +#endif // __INTERPOLATE_NEAREST_METAX_H__ diff --git a/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.maca b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.maca new file mode 100644 index 000000000..5cf0e5e66 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.maca @@ -0,0 +1,86 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "interpolate_nearest_metax.h" +#include +#include + +#include "../cuda/kernel.cuh" + +namespace op::interpolate_nearest::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + Opaque(std::shared_ptr internal_) : internal(internal_) {} +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32 && + dtype != INFINI_DTYPE_BF16 && dtype != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + *desc_ptr = new Descriptor(dtype, info, 0, new Opaque{handle->internal()}, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +inline void launch_interpolate_nearest_kernel(T *output, const T *input, InterpolateNearestInfo info, int grid_size, int block_size, hcStream_t stream) { + interpolate_nearest_kernel<<>>(output, input, info); +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + + int total_elements = calculate_total_elements(_info); + int block_size = 256; + int grid_size = (total_elements + block_size - 1) / block_size; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + case INFINI_DTYPE_F16: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + case INFINI_DTYPE_BF16: { + auto typed_output = reinterpret_cast<__hpcc_bfloat16 *>(output); + auto typed_input = 
reinterpret_cast(input); + launch_interpolate_nearest_kernel<__hpcc_bfloat16>(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + case INFINI_DTYPE_I8: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::interpolate_nearest::metax diff --git a/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu new file mode 100644 index 000000000..a7b63c6f4 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu @@ -0,0 +1,93 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "interpolate_nearest_nvidia.cuh" +#include +#include +#include + +namespace op::interpolate_nearest::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + + Opaque(std::shared_ptr internal_) + : internal(internal_) {} +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + // Check supported data types + if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32 && dtype != INFINI_DTYPE_BF16 && dtype != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + *desc_ptr = new Descriptor(dtype, info, 0, new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + + size_t total_elements = calculate_total_elements(_info); + + int block_size = 256; + int grid_size = (total_elements + block_size - 1) / block_size; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = reinterpret_cast(output); + const float *typed_input = reinterpret_cast(input); + interpolate_nearest_kernel + <<>>(typed_output, typed_input, + _info); + } break; + + case INFINI_DTYPE_F16: { + half *typed_output = reinterpret_cast(output); + const half *typed_input = reinterpret_cast(input); + interpolate_nearest_kernel<<>>( + typed_output, typed_input, _info); + } break; + + case INFINI_DTYPE_BF16: { + auto typed_output = reinterpret_cast<__nv_bfloat16 *>(output); + auto typed_input = reinterpret_cast(input); + interpolate_nearest_kernel<__nv_bfloat16> + <<>>(typed_output, typed_input, + _info); + } break; + + case INFINI_DTYPE_I8: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + interpolate_nearest_kernel + <<>>(typed_output, typed_input, + _info); + } break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_CUDA(cudaGetLastError()); + CHECK_CUDA(cudaStreamSynchronize(cuda_stream)); + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::interpolate_nearest::nvidia diff --git 
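Both the METAX and NVIDIA launches above size the grid as ceil(total_elements / block) with a fixed block of 256 threads; out-of-range threads are masked by the idx < total_elements guard inside the kernel. A host-side helper mirroring that arithmetic (grid_for is illustrative):

    #include <cstddef>

    // Grid sizing used by the element-wise interpolate kernels (block = 256).
    inline unsigned int grid_for(size_t total_elements, unsigned int block = 256) {
        return static_cast<unsigned int>((total_elements + block - 1) / block);
    }
    // e.g. total_elements = 1000 -> 4 blocks of 256 threads; the last 24
    // threads fall outside the range and simply return.
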
a/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh new file mode 100644 index 000000000..aab5f7882 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh @@ -0,0 +1,9 @@ +#ifndef __INTERPOLATE_NEAREST_NVIDIA_CUH__ +#define __INTERPOLATE_NEAREST_NVIDIA_CUH__ + +#include "../../../devices/nvidia/nvidia_handle.h" +#include "../interpolate_nearest.h" + +DESCRIPTOR(nvidia) + +#endif // __INTERPOLATE_NEAREST_NVIDIA_CUH__ diff --git a/src/infiniop/ops/interpolate_nearest/operator.cc b/src/infiniop/ops/interpolate_nearest/operator.cc new file mode 100644 index 000000000..0a0f99ee1 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/interpolate_nearest.h" + +#ifdef ENABLE_CPU_API +#include "cpu/interpolate_nearest_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/interpolate_nearest_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/interpolate_nearest_metax.h" +#endif + +__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor( + infiniopHandle_t handle, + infiniopInterpolateNearestDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::interpolate_nearest::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize( + infiniopInterpolateNearestDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopInterpolateNearest( + infiniopInterpolateNearestDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyInterpolateNearestDescriptor( + 
infiniopInterpolateNearestDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc new file mode 100644 index 000000000..5c729e7e8 --- /dev/null +++ b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc @@ -0,0 +1,322 @@ +#include "maxpool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::maxpool::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + MaxPoolInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const MaxPoolInfo &maxpool_info) + : handle(handle_ptr), info(maxpool_info) { + // CPU实现通常不需要额外的工作空间 + workspace_size = 0; + } + + // 获取数据类型的最小值 + template + static T get_min_value() { + if constexpr (std::is_same::value) { + return -std::numeric_limits::infinity(); + } else if constexpr (std::is_same::value) { + return _f32_to_f16(-std::numeric_limits::infinity()); + } else if constexpr (std::is_same::value) { + return _f32_to_bf16(-std::numeric_limits::infinity()); + } else { + return std::numeric_limits::lowest(); + } + } + + // 比较两个值的大小(处理半精度类型) + template + static bool is_greater(const T &a, const T &b) { + if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else { + return a > b; + } + } + + // 1D最大池化 + template + void maxpool_1d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + // 并行处理每个批次和通道 +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_width + c * input_width; + size_t output_offset = b * channels * output_width + c * output_width; + + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kw = start_w; kw < end_w; ++kw) { + if (kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + + output[output_offset + ow] = max_val; + } + } + } + } + + // 2D最大池化 + template + void maxpool_2d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = 
info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + // 并行处理每个批次和通道 +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_height * input_width + c * input_height * input_width; + size_t output_offset = b * channels * output_height * output_width + c * output_height * output_width; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kh * input_width + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + } + + output[output_offset + oh * output_width + ow] = max_val; + } + } + } + } + } + + // 3D最大池化 + template + void maxpool_3d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + // 并行处理每个批次和通道 +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_depth * input_height * input_width + c * input_depth * input_height * input_width; + size_t output_offset = b * channels * output_depth * output_height * output_width + c * output_depth * output_height * output_width; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + int end_d = start_d + static_cast(kernel_depth); + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kd = start_d; kd < end_d; ++kd) { + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kd >= 0 && kd < static_cast(input_depth) && kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kd * input_height * 
input_width + kh * input_width + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + } + } + + output[output_offset + od * output_height * output_width + oh * output_width + ow] = max_val; + } + } + } + } + } + } + + // 主要的最大池化计算函数 + template + void maxpool_cpu(T *output, const T *input) const { + switch (info.ndim) { + case 1: + maxpool_1d(output, input); + break; + case 2: + maxpool_2d(output, input); + break; + case 3: + maxpool_3d(output, input); + break; + default: + break; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + MaxPoolInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_F16: { + fp16_t *typed_output = static_cast(output); + const fp16_t *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_BF16: { + bf16_t *typed_output = static_cast(output); + const bf16_t *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::maxpool::cpu diff --git a/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h new file mode 100644 index 000000000..f3ecd349d --- /dev/null +++ b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_CPU_H__ +#define __MAX_POOL_CPU_H__ + +#include "../maxpool.h" + +DESCRIPTOR(cpu) + +#endif // __MAX_POOL_CPU_H__ diff --git 
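The CPU path above treats padding by skipping out-of-range taps, so padded positions never contribute a value to the max. A compact single-row reference with the same behaviour (maxpool1d_ref is illustrative, not part of the patch):

    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <vector>

    // Reference 1D max pooling over one (batch, channel) row, mirroring
    // maxpool_1d above: taps outside [0, in.size()) are skipped.
    inline std::vector<float> maxpool1d_ref(const std::vector<float> &in,
                                            size_t kernel, size_t stride,
                                            size_t pad, size_t out_width) {
        std::vector<float> out(out_width, -std::numeric_limits<float>::infinity());
        for (size_t ow = 0; ow < out_width; ++ow) {
            int start = static_cast<int>(ow * stride) - static_cast<int>(pad);
            for (int kw = start; kw < start + static_cast<int>(kernel); ++kw) {
                if (kw >= 0 && kw < static_cast<int>(in.size())) {
                    out[ow] = std::max(out[ow], in[static_cast<size_t>(kw)]);
                }
            }
        }
        return out;
    }
    // e.g. in = {1, 5, 2, 4}, kernel = 2, stride = 2, pad = 1, out_width = 3
    //      windows [-1,1) [1,3) [3,5)  ->  out = {1, 5, 4}
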
a/src/infiniop/ops/maxpool/info.h b/src/infiniop/ops/maxpool/info.h new file mode 100644 index 000000000..ff56fe28c --- /dev/null +++ b/src/infiniop/ops/maxpool/info.h @@ -0,0 +1,113 @@ +#ifndef __MAX_POOL_INFO_H__ +#define __MAX_POOL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +namespace op::maxpool { + +inline utils::Result calculateMaxPoolOutputSize( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding = 0, + bool ceil_mode = false) { + + if (stride == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + if (kernel_size == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + + // 理论最大输出数 + size_t max_output = 0; + if (ceil_mode) { + max_output = (input_size + 2 * padding - kernel_size + stride - 1) / stride + 1; + } else { + max_output = (input_size + 2 * padding - kernel_size) / stride + 1; + } + + size_t valid_output = 0; + for (size_t i = 0; i < max_output; ++i) { + int64_t start = static_cast(i) * stride - padding; + int64_t end = start + kernel_size; + // 判断区间 [start, end) 和 [0, input_size) 是否有交集 + int64_t real_start = std::max(start, int64_t(0)); + int64_t real_end = std::min(end, int64_t(input_size)); + if (real_end > real_start) { + ++valid_output; + } + } + return utils::Result(valid_output); +} + +class MaxPoolInfo { + MaxPoolInfo() = default; + +public: + std::vector input_dims; + std::vector output_dims; + std::vector kernel_sizes; + std::vector strides; + std::vector pads; + bool ceil_mode; + size_t ndim; + size_t batch; + size_t channels; + + static utils::Result create( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + MaxPoolInfo info; + + if (input_desc->ndim() < 3 || input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->dim(0) != output_desc->dim(0) || input_desc->dim(1) != output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.ndim = input_desc->ndim() - 2; // spatial dimensions + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // Get spatial dimensions + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + auto output_size = calculateMaxPoolOutputSize( + info.input_dims[i], info.kernel_sizes[i], info.strides[i], info.pads[i], info.ceil_mode); + CHECK_RESULT(output_size); + size_t expected_size = output_size.take(); + if (expected_size != output_desc->dim(i + 2)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.output_dims.push_back(output_desc->dim(i + 2)); + } + return utils::Result(std::move(info)); + } +}; +} // namespace op::maxpool + +#endif // __MAX_POOL_INFO_H__ diff --git a/src/infiniop/ops/maxpool/maxpool.h b/src/infiniop/ops/maxpool/maxpool.h new file mode 100644 index 000000000..5ee7703c5 --- /dev/null +++ b/src/infiniop/ops/maxpool/maxpool.h @@ -0,0 +1,53 @@ +#ifndef __MAX_POOL_H__ +#define __MAX_POOL_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::maxpool::NAMESPACE { \ 
+ class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + MaxPoolInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + MaxPoolInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __MAX_POOL_H__ diff --git a/src/infiniop/ops/maxpool/metax/maxpool_metax.cc b/src/infiniop/ops/maxpool/metax/maxpool_metax.cc new file mode 100644 index 000000000..b70286abd --- /dev/null +++ b/src/infiniop/ops/maxpool/metax/maxpool_metax.cc @@ -0,0 +1,217 @@ +#include "maxpool_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +#define DESTROY_hcdnn_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_hcdnn_DESCRIPTORS() \ + do { \ + DESTROY_hcdnn_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(pooling_desc, hcdnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_HCDNN_API + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t output_desc = nullptr; + hcdnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + infiniStatus_t createPoolingDescriptors(const MaxPoolInfo &info, + hcdnnDataType_t hcdnn_data_type) { + // 创建输入输出张量描述符 + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&output_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&pooling_desc)); + + // 构建输入输出维度(NCHW格式) + std::vector input_dims = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims = {static_cast(info.batch), + static_cast(info.channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims.push_back(static_cast(info.input_dims[i])); + output_dims.push_back(static_cast(info.output_dims[i])); + } + + // 1D池化补充维度 + if (info.ndim == 1) { + input_dims.push_back(1); + output_dims.push_back(1); + } + + // 计算输入输出张量的步幅 + std::vector input_strides(input_dims.size(), 1); + std::vector output_strides(output_dims.size(), 1); + for (int i = input_dims.size() - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + output_strides[i] = output_strides[i + 1] * output_dims[i + 1]; + } + + // 设置张量描述符(NCHW格式) + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + input_desc, hcdnn_data_type, input_dims.size(), input_dims.data(), input_strides.data())); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + output_desc, hcdnn_data_type, 
output_dims.size(), output_dims.data(), + output_strides.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const MaxPoolInfo &info) { + // 构建池化参数 + std::vector kernel_size, strides, pads; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_size.push_back(static_cast(info.kernel_sizes[i])); + strides.push_back(static_cast(info.strides[i])); + pads.push_back(static_cast(info.pads[i])); + } + + // 1D池化补充维度 + if (info.ndim == 1) { + kernel_size.push_back(1); + strides.push_back(1); + pads.push_back(0); + } + + // 设置最大池化描述符(确定性模式) + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor( + pooling_desc, HCDNN_POOLING_MAX_DETERMINISTIC, // 确定性最大池化 + HCDNN_NOT_PROPAGATE_NAN, // 不传播NaN + kernel_size.size(), + kernel_size.data(), + pads.data(), + strides.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializehcdnnContext(MaxPoolInfo &info, + infiniDtype_t data_type) { + hcdnnDataType_t hcdnn_data_type = device::metax::getHcdnnDtype(data_type); + CHECK_STATUS(createPoolingDescriptors(info, hcdnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // 最大池化通常不需要工作空间 + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , + input_desc(other.input_desc), output_desc(other.output_desc), pooling_desc(other.pooling_desc) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_hcdnn_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializehcdnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_HCDNN_API + const float alpha = 1.0f, beta = 0.0f; + + // 执行最大池化前向计算 + CHECK_STATUS(_opaque->internal->useMcdnn( + (hcStream_t)stream, [&](hcdnnHandle_t handle) { + CHECK_MCDNN(hcdnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, 
input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool::metax diff --git a/src/infiniop/ops/maxpool/metax/maxpool_metax.h b/src/infiniop/ops/maxpool/metax/maxpool_metax.h new file mode 100644 index 000000000..5051358de --- /dev/null +++ b/src/infiniop/ops/maxpool/metax/maxpool_metax.h @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_METAX_H__ +#define __MAX_POOL_METAX_H__ + +#include "../maxpool.h" + +DESCRIPTOR(metax) + +#endif // __MAX_POOL_METAX_CUH__ diff --git a/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu new file mode 100644 index 000000000..8b94a29c1 --- /dev/null +++ b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu @@ -0,0 +1,240 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "maxpool_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_desc, cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createPoolingDescriptors(const MaxPoolInfo &info, + cudnnDataType_t cudnn_data_type) { + // Create CUDNN descriptors + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_desc)); + + // Setup tensor descriptors + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + input_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + output_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t 
setupPoolingDescriptor(const MaxPoolInfo &info) { + // Setup pooling descriptor + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_desc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, + kernel_vec.size(), kernel_vec.data(), pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(MaxPoolInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Max pooling typically doesn't need workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + // 打印input展平后的前十个数据 + // printf("MaxPool input (first 10 elements): "); + // const uint16_t *input_data = static_cast(input); + // for (int i = 0; i < 10; ++i) { + // // 将BF16转换为float显示 + // union { + // uint32_t bits; + // float 
value; + // } converter; + // uint16_t bf16_val = input_data[i]; + // converter.bits = static_cast(bf16_val) << 16; + // printf("%f ", converter.value); + // } + // printf("\n"); + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + CHECK_CUDNN(cudnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool::nvidia diff --git a/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh new file mode 100644 index 000000000..539ad5a1a --- /dev/null +++ b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_CUDA_CUH__ +#define __MAX_POOL_CUDA_CUH__ + +#include "../maxpool.h" + +DESCRIPTOR(nvidia) + +#endif // __MAX_POOL_CUDA_CUH__ diff --git a/src/infiniop/ops/maxpool/operator.cc b/src/infiniop/ops/maxpool/operator.cc new file mode 100644 index 000000000..aedfc0585 --- /dev/null +++ b/src/infiniop/ops/maxpool/operator.cc @@ -0,0 +1,155 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/maxpool.h" + +#ifdef ENABLE_CPU_API +#include "cpu/maxpool_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/maxpool_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/maxpool_metax.h" +#endif + +__C infiniStatus_t infiniopCreateMaxPoolDescriptor( + infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::maxpool::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize( + infiniopMaxPoolDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopMaxPool( + infiniopMaxPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, \ + output, \ + input, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef 
ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/maxpool_backward/cpu/maxpool_backward_cpu.cc b/src/infiniop/ops/maxpool_backward/cpu/maxpool_backward_cpu.cc new file mode 100644 index 000000000..ee2ba3628 --- /dev/null +++ b/src/infiniop/ops/maxpool_backward/cpu/maxpool_backward_cpu.cc @@ -0,0 +1,638 @@ +#include "maxpool_backward_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include +#include + +namespace op::maxpool_backward::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + MaxPoolBackwardInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const MaxPoolBackwardInfo &maxpool_info) + : handle(handle_ptr), info(maxpool_info) { + workspace_size = 0; + } + + // F16专用:使用float计算的最大池化反向传播 + void maxpool_backward_f16_as_float(fp16_t *grad_input, const fp16_t *grad_output, const fp16_t *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + + // 计算总的输入和输出大小 + size_t total_input_size = batch_size * channels; + size_t total_output_size = batch_size * channels; + + for (size_t i = 0; i < info.ndim; ++i) { + total_input_size *= info.input_dims[i]; + total_output_size *= info.output_dims[i]; + } + + // 分配float临时缓冲区 + std::vector float_input(total_input_size); + std::vector float_grad_output(total_output_size); + std::vector float_grad_input(total_input_size, 0.0f); + + // 转换输入数据为float + for (size_t i = 0; i < total_input_size; ++i) { + float_input[i] = utils::cast(input[i]); + } + for (size_t i = 0; i < total_output_size; ++i) { + float_grad_output[i] = utils::cast(grad_output[i]); + } + + // 使用float精度进行计算 + maxpool_backward_cpu_float(float_grad_input.data(), float_grad_output.data(), float_input.data()); + + // 转换结果回F16 + for (size_t i = 0; i < total_input_size; ++i) { + grad_input[i] = utils::cast(float_grad_input[i]); + } + } + + // Float版本的最大池化反向传播 + void maxpool_backward_cpu_float(float *grad_input, const float *grad_output, const float *input) const { + switch (info.ndim) { + case 1: + maxpool_backward_1d_float(grad_input, grad_output, input); + break; + case 2: + maxpool_backward_2d_float(grad_input, grad_output, input); + break; + case 3: + maxpool_backward_3d_float(grad_input, grad_output, input); + break; + default: + break; + } + } + + // 1D float版本 + void maxpool_backward_1d_float(float *grad_input, const float *grad_output, const float *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = 
info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + // 初始化梯度输入为零 + size_t total_input_size = batch_size * channels * input_width; + std::fill(grad_input, grad_input + total_input_size, 0.0f); + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_width + c * input_width; + size_t output_offset = b * channels * output_width + c * output_width; + + for (size_t ow = 0; ow < output_width; ++ow) { + float max_val = -std::numeric_limits::infinity(); + size_t max_idx = 0; + bool found_max = false; + + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kw = start_w; kw < end_w; ++kw) { + if (kw >= 0 && kw < static_cast(input_width)) { + size_t real_kw = static_cast(kw); + float val = input[input_offset + real_kw]; + + if (!found_max || val > max_val || (val == max_val && real_kw < max_idx)) { + max_val = val; + max_idx = real_kw; + found_max = true; + } + } + } + + if (found_max) { + grad_input[input_offset + max_idx] += grad_output[output_offset + ow]; + } + } + } + } + } + + // 2D float版本 + void maxpool_backward_2d_float(float *grad_input, const float *grad_output, const float *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + // 初始化梯度输入为零 + size_t total_input_size = batch_size * channels * input_height * input_width; + std::fill(grad_input, grad_input + total_input_size, 0.0f); + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_height * input_width + c * input_height * input_width; + size_t output_offset = b * channels * output_height * output_width + c * output_height * output_width; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float max_val = -std::numeric_limits::infinity(); + size_t max_h = 0, max_w = 0; + bool found_max = false; + + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + size_t real_kh = static_cast(kh); + size_t real_kw = static_cast(kw); + float val = input[input_offset + real_kh * input_width + real_kw]; + + size_t linear_idx = real_kh * input_width + real_kw; + size_t old_linear_idx = found_max ? 
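+                                // tie-break toward the smallest linear index so the gradient is routed to the first maximum in the window (intended to match PyTorch's convention)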
max_h * input_width + max_w : SIZE_MAX; + + if (!found_max || val > max_val || (val == max_val && linear_idx < old_linear_idx)) { + max_val = val; + max_h = real_kh; + max_w = real_kw; + found_max = true; + } + } + } + } + + if (found_max) { + size_t grad_input_idx = input_offset + max_h * input_width + max_w; + grad_input[grad_input_idx] += grad_output[output_offset + oh * output_width + ow]; + } + } + } + } + } + } + + // 3D float版本 + void maxpool_backward_3d_float(float *grad_input, const float *grad_output, const float *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + // 初始化梯度输入为零 + size_t total_input_size = batch_size * channels * input_depth * input_height * input_width; + std::fill(grad_input, grad_input + total_input_size, 0.0f); + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_depth * input_height * input_width + c * input_depth * input_height * input_width; + size_t output_offset = b * channels * output_depth * output_height * output_width + c * output_depth * output_height * output_width; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float max_val = -std::numeric_limits::infinity(); + size_t max_d = 0, max_h = 0, max_w = 0; + bool found_max = false; + + int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + int end_d = start_d + static_cast(kernel_depth); + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kd = start_d; kd < end_d; ++kd) { + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kd >= 0 && kd < static_cast(input_depth) && kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + + size_t real_kd = static_cast(kd); + size_t real_kh = static_cast(kh); + size_t real_kw = static_cast(kw); + + float val = input[input_offset + real_kd * input_height * input_width + real_kh * input_width + real_kw]; + + size_t linear_idx = real_kd * input_height * input_width + real_kh * input_width + real_kw; + size_t old_linear_idx = found_max ? 
max_d * input_height * input_width + max_h * input_width + max_w : SIZE_MAX; + + if (!found_max || val > max_val || (val == max_val && linear_idx < old_linear_idx)) { + max_val = val; + max_d = real_kd; + max_h = real_kh; + max_w = real_kw; + found_max = true; + } + } + } + } + } + + if (found_max) { + size_t grad_input_idx = input_offset + max_d * input_height * input_width + max_h * input_width + max_w; + grad_input[grad_input_idx] += grad_output[output_offset + od * output_height * output_width + oh * output_width + ow]; + } + } + } + } + } + } + } + + // 获取数据类型的最小值 + template + static T get_min_value() { + if constexpr (std::is_same::value) { + return -std::numeric_limits::infinity(); + } else if constexpr (std::is_same::value) { + return _f32_to_f16(-std::numeric_limits::infinity()); + } else if constexpr (std::is_same::value) { + return _f32_to_bf16(-std::numeric_limits::infinity()); + } else { + return std::numeric_limits::lowest(); + } + } + + // 比较两个值的大小(处理半精度类型) + template + static bool is_greater(const T &a, const T &b) { + if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else { + return a > b; + } + } + + // 检查两个值是否相等(处理半精度类型) + template + static bool values_equal(const T &a, const T &b) { + if constexpr (std::is_same::value) { + return utils::cast(a) == utils::cast(b); + } else if constexpr (std::is_same::value) { + return utils::cast(a) == utils::cast(b); + } else { + return a == b; + } + } + + // 原始的通用实现(用于F32和BF16) + template + void maxpool_backward_cpu(T *grad_input, const T *grad_output, const T *input) const { + switch (info.ndim) { + case 1: + maxpool_backward_1d_generic(grad_input, grad_output, input); + break; + case 2: + maxpool_backward_2d_generic(grad_input, grad_output, input); + break; + case 3: + maxpool_backward_3d_generic(grad_input, grad_output, input); + break; + default: + break; + } + } + + template + void maxpool_backward_1d_generic(T *grad_input, const T *grad_output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + size_t total_input_size = batch_size * channels * input_width; + std::fill(grad_input, grad_input + total_input_size, T{}); + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_width + c * input_width; + size_t output_offset = b * channels * output_width + c * output_width; + + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + size_t max_idx = 0; + bool found_max = false; + + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kw = start_w; kw < end_w; ++kw) { + if (kw >= 0 && kw < static_cast(input_width)) { + size_t real_kw = static_cast(kw); + T val = input[input_offset + real_kw]; + + if (!found_max || is_greater(val, max_val) || (values_equal(val, max_val) && real_kw < max_idx)) { + max_val = val; + max_idx = real_kw; + found_max = true; + } + } + } + + if (found_max) { + if constexpr (std::is_same::value) { + float current = utils::cast(grad_input[input_offset + max_idx]); + float to_add = utils::cast(grad_output[output_offset + ow]); + grad_input[input_offset + max_idx] = 
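+                            // accumulate in float and convert back once, avoiding a lossy in-place += on the reduced-precision type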
utils::cast(current + to_add); + } else { + grad_input[input_offset + max_idx] += grad_output[output_offset + ow]; + } + } + } + } + } + } + + template + void maxpool_backward_2d_generic(T *grad_input, const T *grad_output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + size_t total_input_size = batch_size * channels * input_height * input_width; + std::fill(grad_input, grad_input + total_input_size, T{}); + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_height * input_width + c * input_height * input_width; + size_t output_offset = b * channels * output_height * output_width + c * output_height * output_width; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + size_t max_h = 0, max_w = 0; + bool found_max = false; + + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + size_t real_kh = static_cast(kh); + size_t real_kw = static_cast(kw); + T val = input[input_offset + real_kh * input_width + real_kw]; + + size_t linear_idx = real_kh * input_width + real_kw; + size_t old_linear_idx = found_max ? 
max_h * input_width + max_w : SIZE_MAX; + + if (!found_max || is_greater(val, max_val) || (values_equal(val, max_val) && linear_idx < old_linear_idx)) { + max_val = val; + max_h = real_kh; + max_w = real_kw; + found_max = true; + } + } + } + } + + if (found_max) { + size_t grad_input_idx = input_offset + max_h * input_width + max_w; + if constexpr (std::is_same::value) { + float current = utils::cast(grad_input[grad_input_idx]); + float to_add = utils::cast(grad_output[output_offset + oh * output_width + ow]); + grad_input[grad_input_idx] = utils::cast(current + to_add); + } else { + grad_input[grad_input_idx] += grad_output[output_offset + oh * output_width + ow]; + } + } + } + } + } + } + } + + template + void maxpool_backward_3d_generic(T *grad_input, const T *grad_output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + size_t total_input_size = batch_size * channels * input_depth * input_height * input_width; + std::fill(grad_input, grad_input + total_input_size, T{}); + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_depth * input_height * input_width + c * input_depth * input_height * input_width; + size_t output_offset = b * channels * output_depth * output_height * output_width + c * output_depth * output_height * output_width; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + size_t max_d = 0, max_h = 0, max_w = 0; + bool found_max = false; + + int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + int end_d = start_d + static_cast(kernel_depth); + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kd = start_d; kd < end_d; ++kd) { + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kd >= 0 && kd < static_cast(input_depth) && kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + + size_t real_kd = static_cast(kd); + size_t real_kh = static_cast(kh); + size_t real_kw = static_cast(kw); + + T val = input[input_offset + real_kd * input_height * input_width + real_kh * input_width + real_kw]; + + size_t linear_idx = real_kd * input_height * input_width + real_kh * input_width + real_kw; + size_t old_linear_idx = found_max ? 
max_d * input_height * input_width + max_h * input_width + max_w : SIZE_MAX; + + if (!found_max || is_greater(val, max_val) || (values_equal(val, max_val) && linear_idx < old_linear_idx)) { + max_val = val; + max_d = real_kd; + max_h = real_kh; + max_w = real_kw; + found_max = true; + } + } + } + } + } + + if (found_max) { + size_t grad_input_idx = input_offset + max_d * input_height * input_width + max_h * input_width + max_w; + if constexpr (std::is_same::value) { + float current = utils::cast(grad_input[grad_input_idx]); + float to_add = utils::cast(grad_output[output_offset + od * output_height * output_width + oh * output_width + ow]); + grad_input[grad_input_idx] = utils::cast(current + to_add); + } else { + grad_input[grad_input_idx] += grad_output[output_offset + od * output_height * output_width + oh * output_width + ow]; + } + } + } + } + } + } + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + MaxPoolBackwardInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *grad_input, const void *grad_output, + const void *input, infiniDtype_t dtype) const { + + if (!grad_input || !grad_output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_grad_input = static_cast(grad_input); + const float *typed_grad_output = static_cast(grad_output); + const float *typed_input = static_cast(input); + maxpool_backward_cpu(typed_grad_input, typed_grad_output, typed_input); + break; + } + + case INFINI_DTYPE_F16: { + // F16特殊处理:转换为float计算 + fp16_t *typed_grad_input = static_cast(grad_input); + const fp16_t *typed_grad_output = static_cast(grad_output); + const fp16_t *typed_input = static_cast(input); + maxpool_backward_f16_as_float(typed_grad_input, typed_grad_output, typed_input); + break; + } + + case INFINI_DTYPE_BF16: { + bf16_t *typed_grad_input = static_cast(grad_input); + const bf16_t *typed_grad_output = static_cast(grad_output); + const bf16_t *typed_input = static_cast(input); + maxpool_backward_cpu(typed_grad_input, typed_grad_output, typed_input); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = MaxPoolBackwardInfo::create(grad_input_desc, grad_output_desc, input_desc, + kernel_size, strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new 
+    Opaque(opaque_result.take());
+
+    *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size,
+                               opaque, handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *grad_input, const void *grad_output,
+                                     const void *input, void *stream) const {
+    return _opaque->calculate(workspace, workspace_size, grad_input, grad_output, input, _dtype);
+}
+
+} // namespace op::maxpool_backward::cpu
diff --git a/src/infiniop/ops/maxpool_backward/cpu/maxpool_backward_cpu.h b/src/infiniop/ops/maxpool_backward/cpu/maxpool_backward_cpu.h
new file mode 100644
index 000000000..753a01f58
--- /dev/null
+++ b/src/infiniop/ops/maxpool_backward/cpu/maxpool_backward_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __MAXPOOL_BACKWARD_CPU_H__
+#define __MAXPOOL_BACKWARD_CPU_H__
+
+#include "../maxpool_backward.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __MAXPOOL_BACKWARD_CPU_H__
diff --git a/src/infiniop/ops/maxpool_backward/info.h b/src/infiniop/ops/maxpool_backward/info.h
new file mode 100644
index 000000000..e8d88c577
--- /dev/null
+++ b/src/infiniop/ops/maxpool_backward/info.h
@@ -0,0 +1,81 @@
+#ifndef __MAXPOOL_BACKWARD_INFO_H__
+#define __MAXPOOL_BACKWARD_INFO_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include <vector>
+
+namespace op::maxpool_backward {
+
+class MaxPoolBackwardInfo {
+    MaxPoolBackwardInfo() = default;
+
+public:
+    std::vector<size_t> input_dims;  // original input dimensions
+    std::vector<size_t> output_dims; // pooled output dimensions
+    std::vector<size_t> kernel_sizes;
+    std::vector<size_t> strides;
+    std::vector<size_t> pads;
+    bool ceil_mode;
+    size_t ndim;
+    size_t batch;
+    size_t channels;
+
+    static utils::Result<MaxPoolBackwardInfo> create(
+        infiniopTensorDescriptor_t grad_input_desc,  // gradient w.r.t. input
+        infiniopTensorDescriptor_t grad_output_desc, // gradient w.r.t.
output + infiniopTensorDescriptor_t input_desc, // original input + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + MaxPoolBackwardInfo info; + + // Validate tensor dimensions + if (input_desc->ndim() < 3 || input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != grad_input_desc->ndim() || grad_output_desc->ndim() != grad_input_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // Check batch and channel dimensions match + if (input_desc->dim(0) != grad_input_desc->dim(0) || input_desc->dim(1) != grad_input_desc->dim(1) || grad_output_desc->dim(0) != grad_input_desc->dim(0) || grad_output_desc->dim(1) != grad_input_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // Check spatial dimensions consistency + for (size_t i = 2; i < input_desc->ndim(); ++i) { + if (input_desc->dim(i) != grad_input_desc->dim(i)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + info.ndim = input_desc->ndim() - 2; // spatial dimensions + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // Store spatial dimensions and pooling parameters + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.output_dims.push_back(grad_output_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + } + + return utils::Result(std::move(info)); + } +}; + +} // namespace op::maxpool_backward + +#endif // __MAXPOOL_BACKWARD_INFO_H__ diff --git a/src/infiniop/ops/maxpool_backward/maxpool_backward.h b/src/infiniop/ops/maxpool_backward/maxpool_backward.h new file mode 100644 index 000000000..459559b4a --- /dev/null +++ b/src/infiniop/ops/maxpool_backward/maxpool_backward.h @@ -0,0 +1,55 @@ +#ifndef __MAXPOOL_BACKWARD_H__ +#define __MAXPOOL_BACKWARD_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::maxpool_backward::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + MaxPoolBackwardInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + MaxPoolBackwardInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t grad_input_desc, \ + infiniopTensorDescriptor_t grad_output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *grad_input, \ + const void *grad_output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __MAXPOOL_BACKWARD_H__ diff --git a/src/infiniop/ops/maxpool_backward/metax/maxpool_backward_metax.cc b/src/infiniop/ops/maxpool_backward/metax/maxpool_backward_metax.cc new file mode 100644 index 000000000..0a8b1a1ee --- /dev/null 
+++ b/src/infiniop/ops/maxpool_backward/metax/maxpool_backward_metax.cc @@ -0,0 +1,248 @@ +#include "maxpool_backward_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +#define DESTROY_HCDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_HCDNN_DESCRIPTORS() \ + do { \ + DESTROY_HCDNN_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(grad_input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(grad_output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(pooling_backward_desc, \ + hcdnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool_backward::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_HCDNN_API + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t grad_input_desc = nullptr; + hcdnnTensorDescriptor_t grad_output_desc = nullptr; + hcdnnPoolingDescriptor_t pooling_backward_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + void calculateStrides(const std::vector &dims, std::vector &strides, + int ndim) const { + strides[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + strides[d] = strides[d + 1] * dims[d + 1]; + } + } + + infiniStatus_t createPoolingDescriptors(const MaxPoolBackwardInfo &info, + hcdnnDataType_t hcdnn_data_type) { + // 创建hcdnn描述符 + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_output_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&pooling_backward_desc)); + + // 构建输入、输出梯度维度(NCHW格式) + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + // 1D池化补充维度 + if (info.ndim == 1) { + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + // 计算内存步幅 + std::vector input_strides_vec(input_dims_vec.size()); + std::vector output_strides_vec(output_dims_vec.size()); + calculateStrides(input_dims_vec, input_strides_vec, input_dims_vec.size()); + calculateStrides(output_dims_vec, output_strides_vec, output_dims_vec.size()); + + // 设置张量描述符(带步幅) + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + input_desc, hcdnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_input_desc, hcdnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_output_desc, hcdnn_data_type, output_dims_vec.size(), + output_dims_vec.data(), output_strides_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const MaxPoolBackwardInfo &info) { + // 构建池化参数 + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + // 1D池化补充维度 + if (info.ndim == 1) { + kernel_vec.push_back(1); + 
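+            // pad the 1D case with a unit dimension, presumably because the Nd pooling descriptor does not accept a single spatial dim (same trick as the cuDNN path)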
stride_vec.push_back(1); + pad_vec.push_back(0); + } + + // 设置最大池化反向描述符(确定性模式) + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor( + pooling_backward_desc, HCDNN_POOLING_MAX_DETERMINISTIC, // 确定性最大池化 + HCDNN_NOT_PROPAGATE_NAN, // 不传播NaN + kernel_vec.size(), + kernel_vec.data(), + pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeHcdnnContext(MaxPoolBackwardInfo &info, + infiniDtype_t data_type) { + hcdnnDataType_t hcdnn_data_type = device::metax::getHcdnnDtype(data_type); + + CHECK_STATUS(createPoolingDescriptors(info, hcdnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // 计算工作空间大小(需存储前向输出用于反向计算) + CHECK_MCDNN(hcdnnGetTensorSizeInBytes(grad_output_desc, &workspace_size)); + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , + input_desc(other.input_desc), grad_input_desc(other.grad_input_desc), grad_output_desc(other.grad_output_desc), pooling_backward_desc(other.pooling_backward_desc) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.grad_input_desc = nullptr; + other.grad_output_desc = nullptr; + other.pooling_backward_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_HCDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolBackwardInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeHcdnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolBackwardInfo::create(grad_input_desc, grad_output_desc, input_desc, + kernel_size, strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, const void *grad_output, + const void *input, void *stream) const { + +#ifdef ENABLE_HCDNN_API + const float alpha = 1.0f, beta = 0.0f; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + CHECK_STATUS(_opaque->internal->useMcdnn( + (hcStream_t)stream, [&](hcdnnHandle_t handle) { + void *temp_output = workspace; + CHECK_MCDNN(hcdnnPoolingForward( + handle, _opaque->pooling_backward_desc, &alpha, + _opaque->input_desc, input, &beta, _opaque->grad_output_desc, 
temp_output)); + + CHECK_MCDNN(hcdnnPoolingBackward( + handle, _opaque->pooling_backward_desc, &alpha, + _opaque->grad_output_desc, temp_output, // 前向输出(用于定位最大值) + _opaque->grad_output_desc, grad_output, // 输出梯度 + _opaque->input_desc, input, // 前向输入 + &beta, + _opaque->grad_input_desc, grad_input // 输入梯度(输出) + )); + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool_backward::metax diff --git a/src/infiniop/ops/maxpool_backward/metax/maxpool_backward_metax.h b/src/infiniop/ops/maxpool_backward/metax/maxpool_backward_metax.h new file mode 100644 index 000000000..5133090e2 --- /dev/null +++ b/src/infiniop/ops/maxpool_backward/metax/maxpool_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __MAXPOOL_BACKWARD_METAX_H__ +#define __MAXPOOL_BACKWARD_METAX_H__ + +#include "../maxpool_backward.h" + +DESCRIPTOR(metax) + +#endif // __MAXPOOL_BACKWARD_METAX_H__ diff --git a/src/infiniop/ops/maxpool_backward/nvidia/maxpool_backward_nvidia.cu b/src/infiniop/ops/maxpool_backward/nvidia/maxpool_backward_nvidia.cu new file mode 100644 index 000000000..7247ffede --- /dev/null +++ b/src/infiniop/ops/maxpool_backward/nvidia/maxpool_backward_nvidia.cu @@ -0,0 +1,270 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "maxpool_backward_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_backward_desc, \ + cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool_backward::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t grad_input_desc = nullptr; + cudnnTensorDescriptor_t grad_output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_backward_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + void calculateStrides(const std::vector &dims, std::vector &strides, + int ndim) const { + strides[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + strides[d] = strides[d + 1] * dims[d + 1]; + } + } + + infiniStatus_t createPoolingDescriptors(const MaxPoolBackwardInfo &info, + cudnnDataType_t cudnn_data_type) { + // Create CUDNN descriptors + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_input_desc)); + 
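+        // grad_input reuses the original input's shape and strides; grad_output follows the pooled output layout (both descriptors are filled in below)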
CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_backward_desc)); + + // Setup tensor descriptors + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + // Calculate memory strides + std::vector input_strides_vec(input_dims_vec.size()); + std::vector output_strides_vec(output_dims_vec.size()); + calculateStrides(input_dims_vec, input_strides_vec, input_dims_vec.size()); + calculateStrides(output_dims_vec, output_strides_vec, + output_dims_vec.size()); + + // Set tensor descriptors with strides + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + input_desc, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + grad_input_desc, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + grad_output_desc, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data(), output_strides_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const MaxPoolBackwardInfo &info) { + // Setup pooling descriptor + std::vector kernel_vec, stride_vec, pad_vec, pad_vec_backward; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_backward_desc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, + kernel_vec.size(), kernel_vec.data(), pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(MaxPoolBackwardInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Calculate workspace size, workspace is required for forward output + CHECK_CUDNN(cudnnGetTensorSizeInBytes(grad_output_desc, &workspace_size)); + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , grad_input_desc(other.grad_input_desc) + , grad_output_desc(other.grad_output_desc) + , pooling_backward_desc(other.pooling_backward_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.grad_input_desc = nullptr; + other.grad_output_desc = nullptr; + other.pooling_backward_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolBackwardInfo &info, infiniDtype_t 
data_type) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolBackwardInfo::create(grad_input_desc, grad_output_desc, input_desc, + kernel_size, strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, const void *grad_output, + const void *input, void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + size_t grad_input_size = 0; + CHECK_CUDNN(cudnnGetTensorSizeInBytes(_opaque->grad_input_desc, + &grad_input_size)); + CHECK_CUDA(cudaMemset(grad_input, 0, grad_input_size)); + CHECK_CUDA(cudaMemset(workspace, 0, _workspace_size)); + + void *temp_output = workspace; + CHECK_CUDNN(cudnnPoolingForward( + handle, _opaque->pooling_backward_desc, &alpha, _opaque->input_desc, + input, &beta, _opaque->grad_output_desc, temp_output)); + + CHECK_CUDNN(cudnnPoolingBackward( + handle, _opaque->pooling_backward_desc, &alpha, + _opaque->grad_output_desc, temp_output, _opaque->grad_output_desc, + grad_output, _opaque->input_desc, input, &beta, + _opaque->grad_input_desc, grad_input)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool_backward::nvidia diff --git a/src/infiniop/ops/maxpool_backward/nvidia/maxpool_backward_nvidia.cuh b/src/infiniop/ops/maxpool_backward/nvidia/maxpool_backward_nvidia.cuh new file mode 100644 index 000000000..f83fee580 --- /dev/null +++ b/src/infiniop/ops/maxpool_backward/nvidia/maxpool_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MAXPOOL_BACKWARD_NVIDIA_CUH__ +#define __MAXPOOL_BACKWARD_NVIDIA_CUH__ + +#include "../maxpool_backward.h" + +DESCRIPTOR(nvidia) + +#endif // __MAXPOOL_BACKWARD_NVIDIA_CUH__ diff --git a/src/infiniop/ops/maxpool_backward/operator.cc b/src/infiniop/ops/maxpool_backward/operator.cc new file mode 100644 index 000000000..386d9d3b3 --- /dev/null +++ b/src/infiniop/ops/maxpool_backward/operator.cc @@ -0,0 +1,159 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/maxpool_backward.h" + +#ifdef ENABLE_CPU_API 
+#include "cpu/maxpool_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/maxpool_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/maxpool_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateMaxPoolBackwardDescriptor( + infiniopHandle_t handle, + infiniopMaxPoolBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::maxpool_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + grad_output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMaxPoolBackwardWorkspaceSize( + infiniopMaxPoolBackwardDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopMaxPoolBackward( + infiniopMaxPoolBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *grad_output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, \ + grad_input, \ + grad_output, \ + input, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyMaxPoolBackwardDescriptor(infiniopMaxPoolBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/averagepool.py b/test/infiniop/averagepool.py new file mode 100644 index 000000000..79799ca6a --- /dev/null +++ 
b/test/infiniop/averagepool.py @@ -0,0 +1,239 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from typing import Tuple +import math +from torch.nn import functional as F + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +_TEST_CASES = [ + # ============ 1D Average Pooling Tests (converted to MaxPool format) ============ + # Basic cases + ((4, 8, 128), None, (3,), (1,), (0,), False), # kernel=3, stride=1, pad=0 + ((2, 16, 256), None, (5,), (2,), (2,), False), # kernel=5, stride=2, pad=2 + ((8, 4, 64), None, (7,), (3,), (1,), False), # kernel=7, stride=3, pad=1 + # ceil_mode variations + ((1, 3, 99), None, (4,), (3,), (1,), True), # kernel=4, stride=3, pad=1 + ((3, 2, 77), None, (6,), (4,), (0,), True), # kernel=6, stride=4, pad=0 + # ============ 2D Average Pooling Tests ============ + # Basic cases with square kernels + ((2, 3, 64, 64), None, (3, 3), (1, 1), (1, 1), False), + ((4, 16, 128, 128), None, (5, 5), (2, 2), (2, 2), False), + ((1, 8, 96, 96), None, (7, 7), (3, 3), (0, 0), False), + # Rectangular kernels + ((2, 4, 80, 120), None, (3, 5), (1, 2), (1, 2), False), + ((1, 6, 72, 48), None, (7, 3), (2, 1), (3, 1), False), + ((3, 2, 56, 84), None, (2, 4), (2, 3), (0, 2), False), + # ceil_mode variations + ((1, 1, 33, 33), None, (4, 4), (3, 3), (1, 1), True), + ((2, 5, 77, 89), None, (5, 3), (4, 2), (2, 1), True), + # ============ 3D Average Pooling Tests ============ + # Basic cubic kernels + ((1, 2, 32, 32, 32), None, (3, 3, 3), (1, 1, 1), (1, 1, 1), False), + ((2, 4, 48, 48, 48), None, (5, 5, 5), (2, 2, 2), (2, 2, 2), False), + ((1, 1, 64, 64, 64), None, (7, 7, 7), (3, 3, 3), (0, 0, 0), False), + # Non-cubic kernels + ((1, 3, 24, 36, 48), None, (2, 3, 4), (1, 2, 2), (0, 1, 2), False), + ((2, 2, 40, 32, 56), None, (5, 3, 7), (2, 1, 3), (2, 1, 3), False), + ((1, 1, 28, 44, 36), None, (3, 5, 2), (2, 3, 1), (1, 2, 1), False), + # ceil_mode variations + ((1, 1, 27, 27, 27), None, (4, 4, 4), (3, 3, 3), (1, 1, 1), True), + ((2, 2, 33, 45, 39), None, (5, 3, 4), (3, 2, 3), (2, 1, 1), True), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def averagepool(input_tensor, kernel_size, stride, padding, ceil_mode, output_tensor): + ndim = len(input_tensor.shape) - 2 + if ndim == 1: + result = F.avg_pool1d( + input_tensor.to(torch.float32), kernel_size[0], stride[0], padding[0], ceil_mode=ceil_mode + ) + elif ndim == 2: + result = F.avg_pool2d( + input_tensor.to(torch.float32), kernel_size, stride, padding, ceil_mode=ceil_mode + ) + elif ndim == 3: + result = F.avg_pool3d( + input_tensor.to(torch.float32), kernel_size, stride, padding, ceil_mode=ceil_mode + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + + # 将计算结果转换回原始数据类型 + output_tensor.copy_(result.to(output_tensor.dtype)) + + +def infer_output_shape(input_shape, kernel_size, stride, padding, ceil_mode): + def calc_output_size(input_size, k, s, p, ceil_mode): + return ( + math.ceil((input_size + 2 * p - k) / s + 1) + if ceil_mode + else math.floor((input_size + 2 * p - k) / s + 1) + ) + + batch, channel, 
*spatial = input_shape + output_spatial = [ + calc_output_size(spatial[i], kernel_size[i], stride[i], padding[i], ceil_mode) + for i in range(len(spatial)) + ] + return (batch, channel) + tuple(output_spatial) + + +def tuple_to_void_p(py_tuple: Tuple): + arr = (ctypes.c_uint64 * len(py_tuple))(*py_tuple) + return ctypes.cast(arr, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_shape = infer_output_shape( + input_shape, kernel_size, stride, padding, ceil_mode + ) + output_tensor = TestTensor(output_shape, None, dt=tensor_dtype, device=device) + + print( + f"Testing AvgPool on {InfiniDeviceNames[device]} with input_shape: {input_shape}, kernel_size: {kernel_size}, stride: {stride}, padding: {padding}, ceil_mode: {ceil_mode}, dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + averagepool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + output_tensor.torch_tensor(), + ) + + if sync: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAvgPoolDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + for tensor in [input_tensor, output_tensor]: + if tensor: + tensor.destroy_desc() + + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAvgPoolWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_averagepool(): + check_error( + LIBINFINIOP.infiniopAvgPool( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + lib_averagepool() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Mismatch for shape {input_shape}, kernel {kernel_size}" + + if PROFILE: + profile_operation( + "PyTorch", + lambda: averagepool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + output_tensor.torch_tensor(), + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lib_averagepool, device, NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroyAvgPoolDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/test/infiniop/averagepool_backward.py b/test/infiniop/averagepool_backward.py new file mode 100644 index 000000000..6dd6613ec --- /dev/null +++ b/test/infiniop/averagepool_backward.py @@ -0,0 +1,261 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + 
profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +import math +from torch.nn import functional as F +from typing import Tuple + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +_TEST_CASES = [ + # ============ 1D Average Pooling Tests (converted to MaxPool format) ============ + # Basic cases + ((4, 8, 128), None, (3,), (1,), (0,), False), # kernel=3, stride=1, pad=0 + ((2, 16, 256), None, (5,), (2,), (2,), False), # kernel=5, stride=2, pad=2 + ((8, 4, 64), None, (7,), (3,), (1,), False), # kernel=7, stride=3, pad=1 + # ceil_mode variations + ((1, 3, 99), None, (4,), (3,), (1,), True), # kernel=4, stride=3, pad=1 + ((3, 2, 77), None, (6,), (4,), (0,), True), # kernel=6, stride=4, pad=0 + # ============ 2D Average Pooling Tests ============ + # Basic cases with square kernels + ((2, 3, 64, 64), None, (3, 3), (1, 1), (1, 1), False), + ((4, 16, 128, 128), None, (5, 5), (2, 2), (2, 2), False), + ((1, 8, 96, 96), None, (7, 7), (3, 3), (0, 0), False), + # Rectangular kernels + ((2, 4, 80, 120), None, (3, 5), (1, 2), (1, 2), False), + ((1, 6, 72, 48), None, (7, 3), (2, 1), (3, 1), False), + ((3, 2, 56, 84), None, (2, 4), (2, 3), (0, 2), False), + # ceil_mode variations + ((1, 1, 33, 33), None, (4, 4), (3, 3), (1, 1), True), + ((2, 5, 77, 89), None, (5, 3), (4, 2), (2, 1), True), + # ============ 3D Average Pooling Tests ============ + # Basic cubic kernels + ((1, 2, 32, 32, 32), None, (3, 3, 3), (1, 1, 1), (1, 1, 1), False), + ((2, 4, 48, 48, 48), None, (5, 5, 5), (2, 2, 2), (2, 2, 2), False), + ((1, 1, 64, 64, 64), None, (7, 7, 7), (3, 3, 3), (0, 0, 0), False), + # Non-cubic kernels + ((1, 3, 24, 36, 48), None, (2, 3, 4), (1, 2, 2), (0, 1, 2), False), + ((2, 2, 40, 32, 56), None, (5, 3, 7), (2, 1, 3), (2, 1, 3), False), + ((1, 1, 28, 44, 36), None, (3, 5, 2), (2, 3, 1), (1, 2, 1), False), + # ceil_mode variations + ((1, 1, 27, 27, 27), None, (4, 4, 4), (3, 3, 3), (1, 1, 1), True), + ((2, 2, 33, 45, 39), None, (5, 3, 4), (3, 2, 3), (2, 1, 1), True), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def averagepool_backward( + input_tensor, + grad_output_tensor, + kernel_size, + stride, + padding, + ceil_mode, + grad_input_tensor, +): + input_tensor_f32 = input_tensor.to(torch.float32).detach().clone().requires_grad_(True) + grad_output_tensor_f32 = grad_output_tensor.to(torch.float32) + + ndim = len(input_tensor.shape) - 2 + if ndim == 1: + output = F.avg_pool1d( + input_tensor_f32, kernel_size[0], stride[0], padding[0], ceil_mode=ceil_mode + ) + elif ndim == 2: + output = F.avg_pool2d( + input_tensor_f32, kernel_size, stride, padding, ceil_mode=ceil_mode + ) + elif ndim == 3: + output = F.avg_pool3d( + input_tensor_f32, kernel_size, stride, padding, ceil_mode=ceil_mode + ) + else: + raise ValueError("Unsupported dimension") + + output.backward(grad_output_tensor_f32) + + # 将计算得到的梯度转换回原始数据类型,并复制到梯度输入张量中 + grad_input_tensor.copy_(input_tensor_f32.grad.to(grad_input_tensor.dtype)) + + +def infer_output_shape(input_shape, kernel_size, stride, padding, ceil_mode): + def calc_output_size(input_size, k, s, p, ceil_mode): + if ceil_mode: + return math.ceil((input_size + 2 * p - k) / s + 1) + else: + return math.floor((input_size + 2 * p - k) / s + 1) + + return (input_shape[0], input_shape[1]) + 
tuple( + calc_output_size( + input_shape[i + 2], kernel_size[i], stride[i], padding[i], ceil_mode + ) + for i in range(len(kernel_size)) + ) + + +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_uint64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_shape = infer_output_shape( + input_shape, kernel_size, stride, padding, ceil_mode + ) + grad_output_tensor = TestTensor( + output_shape, None, dt=tensor_dtype, device=device, scale=1.0 + ) + grad_input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device + ) + + print( + f"Testing AvgPoolBackward on {InfiniDeviceNames[device]} with input: {input_shape}, kernel: {kernel_size}, stride: {stride}, pad: {padding}, ceil_mode: {ceil_mode}" + ) + print( + f"Input Tensor: {input_tensor.shape}, Grad Output Tensor: {grad_output_tensor.shape}, Grad Input Tensor: {grad_input_tensor.shape}" + ) + + averagepool_backward( + input_tensor.torch_tensor(), + grad_output_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + grad_input_tensor.torch_tensor(), + ) + + if sync: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAvgPoolBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_input_tensor.descriptor, + grad_output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + for tensor in [input_tensor, grad_output_tensor, grad_input_tensor]: + if tensor: + tensor.destroy_desc() + + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAvgPoolBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_averagepool_backward(): + check_error( + LIBINFINIOP.infiniopAvgPoolBackward( + descriptor, + workspace.data(), + workspace_size.value, + grad_input_tensor.data(), + grad_output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + lib_averagepool_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + grad_input_tensor.actual_tensor(), + grad_input_tensor.torch_tensor(), + atol, + rtol, + ) + assert torch.allclose( + grad_input_tensor.actual_tensor(), + grad_input_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + if PROFILE: + profile_operation( + "PyTorch", + lambda: averagepool_backward( + input_tensor.torch_tensor(), + grad_output_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + grad_input_tensor.torch_tensor(), + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + "lib", lib_averagepool_backward, device, NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroyAvgPoolBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mAvgPoolBackward Test Passed!\033[0m") diff --git a/test/infiniop/conv_backward.py b/test/infiniop/conv_backward.py new file mode 
100644 index 000000000..e7a262963 --- /dev/null +++ b/test/infiniop/conv_backward.py @@ -0,0 +1,319 @@ +import torch +import ctypes +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from typing import List, Tuple +import math + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +_TEST_CASES = [ + # 1D Conv Backward Tests + # x_shape, x_stride, w_shape, w_stride, pads, strides, dilations, group + ((2, 4, 16), (64, 16, 1), (8, 4, 5), (20, 5, 1), (0,), (1,), (1,), 1), + ((2, 4, 32), (128, 32, 1), (8, 4, 3), (12, 3, 1), (1,), (2,), (1,), 1), + ((1, 2, 64), (128, 64, 1), (4, 2, 7), (14, 7, 1), (2,), (3,), (1,), 1), + # 2D Conv Backward Tests + ( + (2, 3, 10, 10), + (300, 100, 10, 1), + (6, 3, 3, 3), + (27, 9, 3, 1), + (0, 1), + (1, 2), + (2, 2), + 1, + ), + ( + (1, 2, 28, 14), + (784, 392, 14, 1), + (4, 2, 5, 3), + (30, 15, 3, 1), + (1, 2), + (3, 2), + (1, 1), + 1, + ), + # 3D Conv Backward Tests + ( + (1, 2, 5, 5, 5), + (250, 125, 25, 5, 1), + (4, 2, 2, 2, 2), + (16, 8, 4, 2, 1), + (0, 1, 1), + (1, 1, 1), + (1, 1, 1), + 1, + ), + # Grouped convolution test case + ((2, 4, 16), (64, 16, 1), (4, 2, 3), (6, 3, 1), (1,), (1,), (1,), 2), +] + + +def inferShapeStride( + x_shape: List[int], + w_shape: List[int], + pads: List[int], + strides: List[int], + dilations: List[int], +) -> Tuple[Tuple[int, ...], Tuple[int, ...]]: + assert ( + len(x_shape) + == len(w_shape) + == len(pads) + 2 + == len(dilations) + 2 + == len(strides) + 2 + ), "x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2" + output_dims = [ + math.floor( + (x_shape[i + 2] + 2 * pads[i] - dilations[i] * (w_shape[i + 2] - 1) - 1) + / strides[i] + + 1 + ) + for i in range(len(pads)) + ] + output_shape = (x_shape[0], w_shape[0]) + tuple(output_dims) + output_strides = [1] + for s in reversed(output_shape[1:]): + output_strides.insert(0, output_strides[0] * s) + output_strides = tuple(output_strides) + return output_shape, output_strides + + +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + weight_shape, + weight_stride, + pads, + strides, + dilations, + groups, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + assert len(pads) == len(strides) == len(dilations) + input = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=0.01 + ) + weight = TestTensor( + weight_shape, weight_stride, dt=tensor_dtype, device=device, scale=0.01 + ) + output_shape, output_stride = inferShapeStride( + input_shape, weight_shape, pads, strides, dilations + ) + # grad_output = TestTensor(output_shape, output_stride, dt=tensor_dtype, device=device) + bias = TestTensor( + (weight.shape[0],), (1,), dt=tensor_dtype, device=device, scale=0.01 + ) + # bias = None # Disable bias for now + # 1. 
PyTorch reference backward + input_torch = input.torch_tensor().detach().clone().requires_grad_(True) + weight_torch = weight.torch_tensor().detach().clone().requires_grad_(True) + bias_torch = ( + bias.torch_tensor().detach().clone().requires_grad_(True) + if bias is not None + else None + ) + grad_output_torch = torch.randn( + output_shape, dtype=input_torch.dtype, device=input_torch.device + ) + + # Define forward function for reuse + def forward_pass(input_t, weight_t, bias_t): + if len(input_shape) == 3: + return torch.nn.functional.conv1d( + input_t, + weight_t, + bias=bias_t, + stride=strides, + padding=pads, + dilation=dilations, + groups=groups, + ) + elif len(input_shape) == 4: + return torch.nn.functional.conv2d( + input_t, + weight_t, + bias=bias_t, + stride=strides, + padding=pads, + dilation=dilations, + groups=groups, + ) + elif len(input_shape) == 5: + return torch.nn.functional.conv3d( + input_t, + weight_t, + bias=bias_t, + stride=strides, + padding=pads, + dilation=dilations, + groups=groups, + ) + else: + raise NotImplementedError("Unsupported ndim") + + # Forward + y_ref = forward_pass(input_torch, weight_torch, bias_torch) + print( + f"PyTorch output shape: {y_ref.shape}, dtype: {y_ref.dtype}, device: {y_ref.device}" + ) + y_ref.backward(grad_output_torch) + grad_input_ref = input_torch.grad + grad_weight_ref = weight_torch.grad + grad_bias_ref = bias_torch.grad if bias is not None else None + + # 2. infiniop backward + grad_output_tensor = TestTensor( + output_shape, output_stride, dt=tensor_dtype, device=device + ) + grad_output_tensor.actual_tensor().copy_(grad_output_torch) + grad_input = TestTensor(input_shape, input_stride, dt=tensor_dtype, device=device) + grad_weight = TestTensor( + weight_shape, weight_stride, dt=tensor_dtype, device=device + ) + grad_bias = ( + TestTensor((weight.shape[0],), (1,), dt=tensor_dtype, device=device) + if bias is not None + else None + ) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateConvBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_output_tensor.descriptor, + input.descriptor, + weight.descriptor, + bias.descriptor if bias is not None else None, + tuple_to_void_p(pads), + tuple_to_void_p(strides), + tuple_to_void_p(dilations), + groups, + ) + ) + + for tensor in [ + input, + grad_output_tensor, + weight, + bias, + grad_input, + grad_weight, + grad_bias, + ]: + if tensor is not None: + tensor.destroy_desc() + + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetConvBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, input.device) + + def lib_conv_backward(): + check_error( + LIBINFINIOP.infiniopConvBackward( + descriptor, + workspace.data(), + workspace_size.value, + grad_input.data(), + grad_weight.data(), + grad_bias.data() if grad_bias is not None else None, + grad_output_tensor.data(), + input.data(), + weight.data(), + None, + ) + ) + + lib_conv_backward() + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + # Compare grad_input + if DEBUG: + debug(grad_input.actual_tensor(), grad_input_ref, atol=atol, rtol=rtol) + assert torch.allclose( + grad_input.actual_tensor(), grad_input_ref, atol=atol, rtol=rtol + ) + # Compare grad_weight + if DEBUG: + debug(grad_weight.actual_tensor(), grad_weight_ref, atol=atol, rtol=rtol) + assert torch.allclose( + grad_weight.actual_tensor(), grad_weight_ref, atol=atol, rtol=rtol + ) + # Compare grad_bias + if grad_bias is not 
None: + if DEBUG: + debug(grad_bias.actual_tensor(), grad_bias_ref, atol=atol, rtol=rtol) + assert torch.allclose( + grad_bias.actual_tensor(), grad_bias_ref, atol=atol, rtol=rtol + ) + + if PROFILE: + # PyTorch backward function that recreates the computation graph each time + def torch_conv_backward(): + # Recreate tensors with gradients for each call + input_t = input.torch_tensor().detach().clone().requires_grad_(True) + weight_t = weight.torch_tensor().detach().clone().requires_grad_(True) + bias_t = ( + bias.torch_tensor().detach().clone().requires_grad_(True) + if bias is not None + else None + ) + # Forward pass + y = forward_pass(input_t, weight_t, bias_t) + # Backward pass + y.backward(grad_output_torch) + + # fmt: off + profile_operation("PyTorch", torch_conv_backward, device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lib_conv_backward, device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyConvBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mConvBackward test passed!\033[0m") diff --git a/test/infiniop/cross_entropy_loss.py b/test/infiniop/cross_entropy_loss.py new file mode 100644 index 000000000..acc5cadc4 --- /dev/null +++ b/test/infiniop/cross_entropy_loss.py @@ -0,0 +1,213 @@ +import torch +import ctypes +from ctypes import c_uint64 +import numpy as np + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + infiniopOperatorDescriptor_t, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + TestWorkspace, + InfiniDeviceEnum, +) +from torch.nn import functional as F + +_TEST_CASES = [ + # Single sample classification + ((10,), 10), + ((200,), 200), + # 2D: (N, C) - batch classification + ((4, 10), 10), + ((8, 5), 5), + ((16, 100), 100), + ((32, 1000), 1000), + ((64, 21), 21), + ((128, 50), 50), + # 3D: (N, C, d1) - sequence classification + ((4, 10, 5), 10), + # 4D: (N, C, d1, d2) - image segmentation + ((2, 8, 8, 8), 8), + # 5D: (N, C, d1, d2, d3) - 3D segmentation + ((3, 10, 10, 20, 30), 10), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cross_entropy_loss_pytorch(logits, target): + return F.cross_entropy(logits.double(), target.long(), reduction="mean") + + +def test( + handle, + device, + input_shape, + num_classes, + tensor_dtype=InfiniDtype.F32, + sync=None, +): + # 根据输入形状确定logits和target的形状 + if len(input_shape) == 1: + # Shape (C,) - single sample classification + logits_shape = (num_classes,) + target_shape = (1,) # 修改:使用 (1,) 而不是标量 + else: + # Shape (N, C, [d1], [d2], ...) 
+ logits_shape = input_shape + target_shape = (input_shape[0],) + input_shape[2:] + + print( + f"Testing CrossEntropyLoss on {InfiniDeviceNames[device]} with logits_shape: {logits_shape}, target_shape: {target_shape}, dtype:{InfiniDtypeNames[tensor_dtype]}" + ) + + # 创建logits张量 + logits = TestTensor(logits_shape, None, dt=tensor_dtype, device=device) + + # 创建target张量 + target_torch = torch.randint( + 0, + num_classes, + target_shape, + dtype=torch.long, + device=logits.torch_tensor().device, + ) + target = TestTensor.from_torch(target_torch, dt=InfiniDtype.I64, device=device) + + # 创建loss张量 + loss = TestTensor((1,), None, dt=tensor_dtype, device=device) + + # 计算PyTorch参考损失 + if len(input_shape) == 1: + # 对于一维logits,target需要是标量 + target_scalar = target.torch_tensor()[0] + pytorch_loss = cross_entropy_loss_pytorch(logits.torch_tensor(), target_scalar) + else: + pytorch_loss = cross_entropy_loss_pytorch( + logits.torch_tensor(), target.torch_tensor() + ) + + # 将参考结果存储到loss张量 + loss.torch_tensor()[0] = pytorch_loss.to(loss.torch_tensor().dtype) + + if sync: + sync() + + # 创建算子描述符 + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCrossEntropyLossDescriptor( + handle, + ctypes.byref(descriptor), + loss.descriptor, + logits.descriptor, + target.descriptor, + ) + ) + + # 销毁tensor的描述符以防止内核直接使用 + for tensor in [logits, target, loss]: + tensor.destroy_desc() + + # 获取工作空间大小并创建工作空间 + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCrossEntropyLossWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + # PyTorch参考实现函数 + def torch_cross_entropy(): + if len(input_shape) == 1: + target_scalar = target.torch_tensor()[0] + result = cross_entropy_loss_pytorch(logits.torch_tensor(), target_scalar) + else: + result = cross_entropy_loss_pytorch( + logits.torch_tensor(), target.torch_tensor() + ) + loss.torch_tensor()[0] = result.to(loss.torch_tensor().dtype) + + # InfiniOP实现函数 + def lib_cross_entropy(): + check_error( + LIBINFINIOP.infiniopCrossEntropyLoss( + descriptor, + workspace.data(), + workspace_size.value, + loss.data(), + logits.data(), + target.data(), + None, + ) + ) + + # 执行InfiniOP算子 + lib_cross_entropy() + + if sync: + sync() + + # 验证结果 + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + actual_loss = loss.actual_tensor()[0] + expected_loss = loss.torch_tensor()[0] + + if DEBUG: + print(f"Expected loss: {expected_loss.item()}") + print(f"Actual loss: {actual_loss.item()}") + if target_shape: + print( + f"Target shape: {target_shape}, first few targets: {target.torch_tensor().flatten()[:5]}" + ) + else: + print(f"Target (scalar): {target.torch_tensor()[0].item()}") + debug(actual_loss, expected_loss, atol=atol, rtol=rtol) + + if not torch.allclose(actual_loss, expected_loss, atol=atol, rtol=rtol): + print("--- ERROR ANALYSIS ---") + print(f"Expected: {expected_loss.item()}, Actual: {actual_loss.item()}") + print(f"Difference: {abs(actual_loss - expected_loss).item()}") + print(f"Tolerance: atol={atol}, rtol={rtol}") + + assert torch.allclose(actual_loss, expected_loss, atol=atol, rtol=rtol) + + # Profile功能 + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_cross_entropy(), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cross_entropy(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCrossEntropyLossDescriptor(descriptor)) + + +if __name__ == "__main__": + args = 
get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mAll CrossEntropyLoss tests passed!\033[0m") diff --git a/test/infiniop/interpolate_nearest.py b/test/infiniop/interpolate_nearest.py new file mode 100644 index 000000000..335bcd7fd --- /dev/null +++ b/test/infiniop/interpolate_nearest.py @@ -0,0 +1,254 @@ +import torch +import ctypes +from ctypes import c_uint64 + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, output_shape, output_stride) +_TEST_CASES = [ + # 2D test cases - simplified to one line each + ((1, 1, 2, 2), None, (1, 1, 4, 4), None), # Simple contiguous case + ((1, 3, 4, 4), (48, 16, 4, 1), (1, 3, 8, 8), (192, 64, 8, 1)), # 2D upscaling 2x + ((1, 3, 8, 8), (192, 64, 8, 1), (1, 3, 4, 4), (48, 16, 4, 1)), # 2D downscaling 2x + ((2, 4, 2, 2), (16, 4, 2, 1), (2, 4, 6, 6), (144, 36, 6, 1)), # Batch upscaling + ( + (1, 1, 3, 5), + (15, 15, 5, 1), + (1, 1, 9, 10), + (90, 90, 10, 1), + ), # Different aspect ratio + ( + (4, 64, 16, 16), + (16384, 256, 16, 1), + (4, 64, 32, 32), + (65536, 1024, 32, 1), + ), # Large batch + ((1, 1, 1, 1), (1, 1, 1, 1), (1, 1, 7, 7), (49, 49, 7, 1)), # Small to large + ( + (1, 2, 3, 4), + (24, 1, 8, 2), + (1, 2, 6, 8), + (96, 1, 16, 2), + ), # Non-contiguous layout + ((2, 3, 2, 2), (32, 8, 4, 1), (2, 3, 4, 4), (128, 32, 8, 1)), # Padded strides + # 1D test cases + ((1, 3, 8), (24, 8, 1), (1, 3, 16), (48, 16, 1)), # 1D upscaling 2x + ((2, 5, 10), (50, 10, 1), (2, 5, 5), (25, 5, 1)), # 1D downscaling 2x + ((4, 2, 32), (64, 32, 1), (4, 2, 64), (128, 64, 1)), # 1D larger upscaling + # 3D test cases + ( + (1, 2, 2, 2, 2), + (16, 8, 4, 2, 1), + (1, 2, 4, 4, 4), + (128, 64, 16, 4, 1), + ), # 3D upscaling 2x + ( + (1, 1, 2, 3, 4), + (24, 24, 12, 4, 1), + (1, 1, 4, 6, 8), + (192, 192, 48, 8, 1), + ), # 3D uniform upscaling + ( + (3, 2, 5, 5, 5), + (250, 125, 25, 5, 1), + (3, 2, 3, 3, 3), + (54, 27, 9, 3, 1), + ), # 3D non-uniform scaling +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.I8] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def interpolate_nearest(input_tensor, output_shape, output_tensor): + """ + Perform nearest neighbor interpolation using PyTorch as reference + """ + # Extract spatial dimensions (H, W) + target_size = output_shape[2:] # Skip batch and channel dimensions + + # Use PyTorch's interpolate function with nearest mode + if input_tensor.dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + ]: + # 对于整数类型,先转换为 float32,进行插值,再转换回原类型 + original_dtype = input_tensor.dtype + + # 转换为 float32 进行插值 + float_input = input_tensor.float() + 
result = F.interpolate(float_input, size=target_size, mode="nearest") + + # 转换回原始类型 + result = result.to(original_dtype) + else: + result = F.interpolate(input_tensor, size=target_size, mode="nearest") + + output_tensor.copy_(result) + + +def test( + handle, + device, + input_shape, + input_stride, + output_shape, + output_stride, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input and output tensors + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_tensor = TestTensor( + output_shape, output_stride, dt=tensor_dtype, device=device + ) + + print( + f"Testing InterpolateNearest on {InfiniDeviceNames[device]} with " + f"input_shape: {input_shape}, output_shape: {output_shape}, " + f"input_stride: {input_stride}, output_stride: {output_stride}, " + f"dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + # Compute reference result using PyTorch + interpolate_nearest( + input_tensor.torch_tensor(), output_shape, output_tensor.torch_tensor() + ) + + if sync is not None: + sync() + + # Create descriptor for our interpolate_nearest operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateInterpolateNearestDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetInterpolateNearestWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_interpolate_nearest(): + check_error( + LIBINFINIOP.infiniopInterpolateNearest( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Execute the operation + lib_interpolate_nearest() + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Results don't match for shape {input_shape} -> {output_shape}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: interpolate_nearest( + input_tensor.torch_tensor(), output_shape, output_tensor.torch_tensor() + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", + lambda: lib_interpolate_nearest(), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyInterpolateNearestDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e8963849c..53e7fe485 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -4,7 +4,7 @@ infiniopOperatorDescriptor_t, ) -from ctypes import c_int32, c_void_p, 
c_size_t, POINTER, c_float +from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float, c_bool class OpRegister: @@ -619,3 +619,259 @@ def softplus_(lib): ] lib.infiniopDestroySoftplusDescriptor.restype = c_int32 lib.infiniopDestroySoftplusDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def avg_pool_(lib): + lib.infiniopCreateAvgPoolDescriptor.restype = c_int32 + lib.infiniopCreateAvgPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # output_desc + infiniopTensorDescriptor_t, # input_desc + c_void_p, # kernel_size + c_void_p, # strides + c_void_p, # pads + c_bool, # ceil_mode + ] + + lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAvgPool.restype = c_int32 + lib.infiniopAvgPool.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32 + lib.infiniopDestroyAvgPoolDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def avg_pool_backward_(lib): + lib.infiniopCreateAvgPoolBackwardDescriptor.restype = c_int32 + lib.infiniopCreateAvgPoolBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # grad_input_desc + infiniopTensorDescriptor_t, # grad_output_desc + infiniopTensorDescriptor_t, # input_desc + c_void_p, # kernel_size + c_void_p, # strides + c_void_p, # pads + c_bool, # ceil_mode + ] + + lib.infiniopGetAvgPoolBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetAvgPoolBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAvgPoolBackward.restype = c_int32 + lib.infiniopAvgPoolBackward.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # grad_input + c_void_p, # grad_output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyAvgPoolBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyAvgPoolBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cross_entropy_loss_(lib): + lib.infiniopCreateCrossEntropyLossDescriptor.restype = c_int32 + lib.infiniopCreateCrossEntropyLossDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # loss_desc + infiniopTensorDescriptor_t, # logits_desc + infiniopTensorDescriptor_t, # target_desc + ] + + lib.infiniopGetCrossEntropyLossWorkspaceSize.restype = c_int32 + lib.infiniopGetCrossEntropyLossWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCrossEntropyLoss.restype = c_int32 + lib.infiniopCrossEntropyLoss.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # loss + c_void_p, # logits + c_void_p, # target + c_void_p, # stream + ] + + lib.infiniopDestroyCrossEntropyLossDescriptor.restype = c_int32 + lib.infiniopDestroyCrossEntropyLossDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def interpolate_nearest_(lib): + lib.infiniopCreateInterpolateNearestDescriptor.restype = c_int32 + lib.infiniopCreateInterpolateNearestDescriptor.argtypes = [ + infiniopHandle_t, + 
POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # output_desc + infiniopTensorDescriptor_t, # input_desc + ] + + lib.infiniopGetInterpolateNearestWorkspaceSize.restype = c_int32 + lib.infiniopGetInterpolateNearestWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopInterpolateNearest.restype = c_int32 + lib.infiniopInterpolateNearest.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyInterpolateNearestDescriptor.restype = c_int32 + lib.infiniopDestroyInterpolateNearestDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def max_pool_(lib): + lib.infiniopCreateMaxPoolDescriptor.restype = c_int32 + lib.infiniopCreateMaxPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # output_desc + infiniopTensorDescriptor_t, # input_desc + c_void_p, # kernel_size + c_void_p, # strides + c_void_p, # pads + c_bool, # ceil_mode + ] + + lib.infiniopGetMaxPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxPoolWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMaxPool.restype = c_int32 + lib.infiniopMaxPool.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyMaxPoolDescriptor.restype = c_int32 + lib.infiniopDestroyMaxPoolDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def max_pool_backward_(lib): + lib.infiniopCreateMaxPoolBackwardDescriptor.restype = c_int32 + lib.infiniopCreateMaxPoolBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # grad_input_desc + infiniopTensorDescriptor_t, # grad_output_desc + infiniopTensorDescriptor_t, # input_desc + c_void_p, # kernel_size + c_void_p, # strides + c_void_p, # pads + c_bool, # ceil_mode + ] + + lib.infiniopGetMaxPoolBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxPoolBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMaxPoolBackward.restype = c_int32 + lib.infiniopMaxPoolBackward.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # grad_input + c_void_p, # grad_output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyMaxPoolBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyMaxPoolBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def conv_backward_(lib): + lib.infiniopCreateConvBackwardDescriptor.restype = c_int32 + lib.infiniopCreateConvBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # grad_output_desc + infiniopTensorDescriptor_t, # input_desc + infiniopTensorDescriptor_t, # weight_desc + infiniopTensorDescriptor_t, # bias_desc (can be None) + c_void_p, # pads + c_void_p, # strides + c_void_p, # dilations + c_size_t, # ndim + ] + + lib.infiniopGetConvBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetConvBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopConvBackward.restype = c_int32 + 
lib.infiniopConvBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # grad_input + c_void_p, # grad_weight + c_void_p, # grad_bias + c_void_p, # grad_output + c_void_p, # input + c_void_p, # weight + c_void_p, # stream + ] + + lib.infiniopDestroyConvBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyConvBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index 510e3d2fa..455c753d2 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -66,10 +66,22 @@ def __init__( torch_strides.append(strides[i]) else: torch_shape.append(shape[i]) + torch_dtype = to_torch_dtype(dt) if mode == "random": - self._torch_tensor = torch.rand( - torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] - ) + if dt == InfiniDtype.I8: + self._torch_tensor = torch.randint( + -128, + 128, + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) + else: + self._torch_tensor = torch.rand( + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) elif mode == "zeros": self._torch_tensor = torch.zeros( torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] @@ -79,7 +91,13 @@ def __init__( torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] ) elif mode == "randint": - self._torch_tensor = torch.randint(-2000000000,2000000000, torch_shape,dtype=to_torch_dtype(dt), device=torch_device_map[device]) + self._torch_tensor = torch.randint( + -2000000000, + 2000000000, + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) elif mode == "manual": assert set_tensor is not None assert torch_shape == list(set_tensor.shape) @@ -91,9 +109,32 @@ def __init__( raise ValueError("Unsupported mode") if scale is not None: - self._torch_tensor *= scale + if torch_dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + ]: + # 对于整数类型,先转换为 float,应用 scale,再转换回原类型 + self._torch_tensor = (self._torch_tensor.float() * scale).to( + torch_dtype + ) + else: + self._torch_tensor *= scale + if bias is not None: - self._torch_tensor += bias + if torch_dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + ]: + # 对于整数类型,先转换为 float,应用 bias,再转换回原类型 + self._torch_tensor = (self._torch_tensor.float() + bias).to(torch_dtype) + else: + self._torch_tensor += bias if strides is not None: self._data_tensor = rearrange_tensor(self._torch_tensor, torch_strides) diff --git a/test/infiniop/maxpool.py b/test/infiniop/maxpool.py new file mode 100644 index 000000000..81ddce060 --- /dev/null +++ b/test/infiniop/maxpool.py @@ -0,0 +1,242 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +# Configuration for profiling +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, kernel_size, stride, padding, ceil_mode) +_TEST_CASES = [ + # 1D max pooling cases + ((1, 3, 8), None, (2,), (2,), (0,), False), + ((2, 4, 16), None, 
(3,), (2,), (1,), False), + ((3, 2, 77), None, (6,), (4,), (3,), True), + # 2D max pooling cases + ((1, 1, 4, 4), None, (2, 2), (2, 2), (0, 0), False), + ((2, 3, 8, 8), None, (3, 3), (2, 2), (1, 1), False), + ((1, 64, 32, 32), None, (2, 2), (2, 2), (0, 0), False), + ((4, 128, 16, 16), None, (3, 3), (1, 1), (1, 1), False), + # 3D max pooling cases + ((1, 1, 4, 4, 4), None, (2, 2, 2), (2, 2, 2), (0, 0, 0), False), + ((2, 2, 8, 8, 8), None, (2, 3, 3), (2, 2, 2), (0, 1, 1), False), + # Cases with ceil_mode=True + ((1, 1, 7, 7), None, (3, 3), (2, 2), (1, 1), True), + ((1, 2, 5), None, (3,), (2,), (0,), True), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def max_pool(input_tensor, kernel_size, stride, padding, ceil_mode): + """ + Perform max pooling using PyTorch as reference + """ + ndim = len(input_tensor.shape) - 2 # Spatial dimensions + + if ndim == 1: + result = F.max_pool1d( + input_tensor, + kernel_size=kernel_size[0], + stride=stride[0], + padding=padding[0], + ceil_mode=ceil_mode, + ) + elif ndim == 2: + result = F.max_pool2d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + elif ndim == 3: + result = F.max_pool3d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + + return result + + +def tuple_to_void_p(py_tuple: Tuple): + """Convert a python tuple to a ctype void pointer""" + array = ctypes.c_uint64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input tensor + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + + # Compute reference result using PyTorch + torch_ref_output = max_pool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + ) + + # Use PyTorch输出shape来初始化output_tensor + output_tensor = TestTensor( + torch_ref_output.shape, None, dt=tensor_dtype, device=device + ) + + print( + f"Testing MaxPool on {InfiniDeviceNames[device]} with " + f"input_shape: {input_shape}, kernel_size: {kernel_size}, " + f"stride: {stride}, padding: {padding}, ceil_mode: {ceil_mode}, " + f"dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + if sync is not None: + sync() + + # Create descriptor for our max pool operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMaxPoolDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMaxPoolWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = 
TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_max_pool(): + check_error( + LIBINFINIOP.infiniopMaxPool( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Execute the operation + lib_max_pool() + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + torch_ref_output, + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + torch_ref_output, + atol=atol, + rtol=rtol, + ), f"Results don't match for input_shape {input_shape}, kernel_size {kernel_size}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: max_pool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_max_pool(), device, NUM_PRERUN, NUM_ITERATIONS + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyMaxPoolDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/maxpool_backward.py b/test/infiniop/maxpool_backward.py new file mode 100644 index 000000000..4d8bc073c --- /dev/null +++ b/test/infiniop/maxpool_backward.py @@ -0,0 +1,315 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +# Configuration for profiling +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, kernel_size, stride, padding, ceil_mode) +_TEST_CASES = [ + # 1D MaxPool + ((1, 1, 8), None, (2,), (2,), (1,), False), + ((2, 4, 16), None, (3,), (2,), (1,), False), + ((1, 2, 5), None, (3,), (2,), (0,), True), + ((8, 16, 1024), None, (5,), (3,), (2,), False), + # 2D MaxPool + ((2, 1, 9, 4), None, (2, 2), (2, 2), (0, 0), False), + ((3, 6, 16, 8), None, (3, 3), (2, 2), (1, 1), False), + ((3, 9, 16, 32), None, (4, 3), (2, 2), (1, 1), True), + ((5, 12, 24, 40), None, (4, 5), (2, 3), (1, 1), True), + # 3D MaxPool + ((1, 1, 4, 4, 4), None, (2, 2, 2), (2, 2, 2), (0, 0, 0), False), + ((2, 2, 8, 8, 8), None, (2, 3, 3), (2, 2, 2), (0, 1, 1), False), + ((1, 1, 10, 20, 30), None, (2, 3, 4), (2, 2, 3), (1, 1, 2), False), + # Large batch/channel + ((32, 64, 16, 16), None, (2, 2), (2, 2), (0, 0), False), + ((16, 32, 8, 8, 8), None, (2, 2, 2), (2, 2, 2), (0, 0, 0), False), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def max_pool_output_shape_pt( + input_shape, kernel_size, stride, padding, ceil_mode, dtype=torch.float32 +): + # Create a dummy 
tensor to get PyTorch的output shape + dummy = torch.zeros(input_shape, dtype=dtype) + ndim = len(input_shape) - 2 + if ndim == 1: + out = F.max_pool1d( + dummy, kernel_size[0], stride[0], padding[0], ceil_mode=ceil_mode + ) + elif ndim == 2: + out = F.max_pool2d(dummy, kernel_size, stride, padding, ceil_mode=ceil_mode) + elif ndim == 3: + out = F.max_pool3d(dummy, kernel_size, stride, padding, ceil_mode=ceil_mode) + else: + raise ValueError("Unsupported ndim") + return tuple(out.shape) + + +def max_pool_backward( + input_tensor, + grad_output_tensor, + kernel_size, + stride, + padding, + ceil_mode, + grad_input_tensor, +): + """ + Perform max pooling backward using PyTorch as reference + """ + input_tensor = input_tensor.detach().clone().requires_grad_(True) + ndim = len(input_tensor.shape) - 2 # Spatial dimensions + + # First do forward pass to get indices + if ndim == 1: + output = F.max_pool1d( + input_tensor, + kernel_size=kernel_size[0], + stride=stride[0], + padding=padding[0], + ceil_mode=ceil_mode, + ) + elif ndim == 2: + output = F.max_pool2d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + elif ndim == 3: + output = F.max_pool3d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + output.backward(grad_output_tensor) + grad_input_tensor.copy_(input_tensor.grad) + + +def tuple_to_void_p(py_tuple: Tuple): + """Convert a python tuple to a ctype void pointer""" + array = ctypes.c_uint64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input tensor (original input for forward pass) + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + + # 用PyTorch得出的output shape来初始化grad_output_tensor + torch_dtype = torch.float32 # 只用于推理shape,实际TestTensor用自己的dtype + output_shape = max_pool_output_shape_pt( + input_shape, kernel_size, stride, padding, ceil_mode, dtype=torch_dtype + ) + + # Create grad_output tensor (gradient w.r.t. pooling output) + grad_output_tensor = TestTensor( + output_shape, None, dt=tensor_dtype, device=device, scale=1.0 + ) + + # Create grad_input tensor (gradient w.r.t. pooling input) + grad_input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device + ) + + print( + f"Testing MaxPoolBackward on {InfiniDeviceNames[device]} with " + f"input_shape: {input_shape}, output_shape: {output_shape}, " + f"kernel_size: {kernel_size}, stride: {stride}, padding: {padding}, " + f"ceil_mode: {ceil_mode}, dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + # Compute reference result using PyTorch + try: + max_pool_backward( + input_tensor.torch_tensor(), + grad_output_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + grad_input_tensor.torch_tensor(), + ) + except Exception as e: + print(f"Error during PyTorch reference computation: {e}") + raise + + if sync is not None: + sync() + + # Create descriptor for our max pool backward operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMaxPoolBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_input_tensor.descriptor, # gradient w.r.t. 
input (output of this op) + grad_output_tensor.descriptor, # gradient w.r.t. output (input to this op) + input_tensor.descriptor, # original input (for indices) + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, grad_output_tensor, grad_input_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMaxPoolBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_input_tensor.device) + + def lib_max_pool_backward(): + check_error( + LIBINFINIOP.infiniopMaxPoolBackward( + descriptor, + workspace.data(), + workspace_size.value, + grad_input_tensor.data(), # output: gradient w.r.t. input + grad_output_tensor.data(), # input: gradient w.r.t. output + input_tensor.data(), # input: original input tensor + None, + ) + ) + + # Execute the operation + try: + lib_max_pool_backward() + except Exception as e: + print(f"Error during libinfiniop max pool backward operation: {e}") + raise + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + grad_input_tensor.actual_tensor(), + grad_input_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + actual_result = grad_input_tensor.actual_tensor() + expected_result = grad_input_tensor.torch_tensor() + + # 检查是否有 NaN 或 Inf + if torch.isnan(actual_result).any(): + print("WARNING: Actual result contains NaN values!") + if torch.isinf(actual_result).any(): + print("WARNING: Actual result contains Inf values!") + if torch.isnan(expected_result).any(): + print("WARNING: Expected result contains NaN values!") + if torch.isinf(expected_result).any(): + print("WARNING: Expected result contains Inf values!") + + assert torch.allclose( + grad_input_tensor.actual_tensor(), + grad_input_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Results don't match for input_shape {input_shape}, kernel_size {kernel_size}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: max_pool_backward( + input_tensor.torch_tensor(), + grad_output_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + grad_input_tensor.torch_tensor(), + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", + lambda: lib_max_pool_backward(), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyMaxPoolBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/xmake.lua b/xmake.lua index 67add0d45..fbb5156c3 100644 --- a/xmake.lua +++ b/xmake.lua @@ -110,6 +110,16 @@ if has_config("metax-gpu") then includes("xmake/metax.lua") end +option("hcdnn") + set_default(true) + set_showmenu(true) + set_description("Whether to compile hcdnn for Metax GPU") +option_end() + +if has_config("hcdnn") then + add_defines("ENABLE_HCDNN_API") +end + -- 摩尔线程 option("moore-gpu") set_default(false)
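
Reviewer note: below is a minimal usage sketch of the AvgPool forward API added by this patch, mirroring the create → query-workspace → run → destroy sequence exercised by test/infiniop/averagepool.py. The handle, tensor descriptors, device buffers, stream, and workspace allocation are assumed to exist already and are outside this diff; the kernel/stride/pad arrays are passed as 64-bit unsigned values because the Python tests marshal them with c_uint64. This is a sketch under those assumptions, not part of the patch itself.

#include <infiniop.h>
#include <stdbool.h>
#include <stddef.h>

/* Sketch only: `handle`, `out_desc`, `in_desc`, `d_out`, `d_in`, `stream`
 * and the workspace buffer are assumed to be created/allocated elsewhere. */
infiniStatus_t run_avgpool_2d(infiniopHandle_t handle,
                              infiniopTensorDescriptor_t out_desc,
                              infiniopTensorDescriptor_t in_desc,
                              void *d_out, const void *d_in,
                              void *workspace_buf, size_t workspace_cap,
                              void *stream) {
    /* 3x3 kernel, stride 2, padding 1 — one of the configurations covered by the tests. */
    size_t kernel[2] = {3, 3}, strides[2] = {2, 2}, pads[2] = {1, 1};

    infiniopAvgPoolDescriptor_t desc;
    infiniStatus_t st = infiniopCreateAvgPoolDescriptor(
        handle, &desc, out_desc, in_desc, kernel, strides, pads, /*ceil_mode=*/false);
    if (st != INFINI_STATUS_SUCCESS) {
        return st;
    }

    size_t workspace_size = 0;
    st = infiniopGetAvgPoolWorkspaceSize(desc, &workspace_size);
    if (st != INFINI_STATUS_SUCCESS || workspace_size > workspace_cap) {
        infiniopDestroyAvgPoolDescriptor(desc);
        return (st != INFINI_STATUS_SUCCESS) ? st : INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    /* Launch the pooling kernel on the caller's stream. */
    st = infiniopAvgPool(desc, workspace_buf, workspace_size, d_out, d_in, stream);

    infiniopDestroyAvgPoolDescriptor(desc);
    return st;
}

The same four-step pattern applies to the other operators introduced here (AvgPoolBackward, MaxPool, MaxPoolBackward, ConvBackward, CrossEntropyLoss, InterpolateNearest); their exact argument orders are the ones registered in test/infiniop/libinfiniop/op_register.py.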