diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..1987df33e 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -18,4 +18,6 @@ #include "infiniop/ops/swiglu.h" #include "infiniop/tensor_descriptor.h" +#include "infiniop/index_copy_inplace.h" + #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/index_copy_inplace.h b/include/infiniop/index_copy_inplace.h new file mode 100644 index 000000000..9f79e5c44 --- /dev/null +++ b/include/infiniop/index_copy_inplace.h @@ -0,0 +1,73 @@ +#ifndef __INFINIOP_INDEX_COPY_INPLACE_API_H__ +#define __INFINIOP_INDEX_COPY_INPLACE_API_H__ + +#include "handle.h" +#include "operator_descriptor.h" +#include "tensor_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopIndexCopyInplaceDescriptor_t; + +__C __export infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor(infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr,//输出参数,用来接收创建好的描述符 + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t output, + int dim, + infiniopTensorDescriptor_t index); + +//__C __export infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size);//获取工作空间大小,需要临时内存的时候用,这个算子应该用不到 + +__C __export infiniStatus_t infiniopIndexCopyInplace(infiniopIndexCopyInplaceDescriptor_t desc, + //void *workspace, + //size_t workspace_size, + const void *input, + void *output, + //const void dim,标量直接值传递,不通过指针;这里不需要了,它在创建描述符时已被记录 + const void *index, + void *stream);//需要一个流/队列对象 + +__C __export infiniStatus_t infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc); + +#endif + +//-----------------------------空骨架测试用--------------------------------------- +// #ifndef __INFINIOP_INDEX_COPY_INPLACE_API_H__ +// #define __INFINIOP_INDEX_COPY_INPLACE_API_H__ + +// // 1. 包含所有 C-API 都需要的核心定义 +// #include "handle.h" +// #include "operator_descriptor.h" +// #include "tensor_descriptor.h" + +// #ifdef __cplusplus +// extern "C" { +// #endif + +// // 2. 定义一个不透明的描述符类型 +// typedef struct InfiniopDescriptor *infiniopIndexCopyInplaceDescriptor_t; + +// // 3. 声明“创建描述符”的 C-API 函数 +// __C __export infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( +// infiniopHandle_t handle, +// infiniopIndexCopyInplaceDescriptor_t *desc_ptr, // 输出参数 +// infiniopTensorDescriptor_t input_desc, +// infiniopTensorDescriptor_t output_desc, +// int dim, +// infiniopTensorDescriptor_t index_desc); + +// // 4. 声明“执行算子”的 C-API 函数 +// __C __export infiniStatus_t infiniopIndexCopyInplace( +// infiniopIndexCopyInplaceDescriptor_t desc, +// const void *input, +// void *output, +// const void *index, +// void *stream); + +// // 5. 声明“销毁描述符”的 C-API 函数 +// __C __export infiniStatus_t infiniopDestroyIndexCopyInplaceDescriptor( +// infiniopIndexCopyInplaceDescriptor_t desc); + +// #ifdef __cplusplus +// } // extern "C" +// #endif + +// #endif // __INFINIOP_INDEX_COPY_INPLACE_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc new file mode 100644 index 000000000..8b6816c11 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc @@ -0,0 +1,452 @@ +#include "index_copy_inplace_cpu.h" +#include "../../../devices/cpu/common_cpu.h"//引入CPU通用工具 +#include +#include + +namespace op::index_copy_inplace::cpu { + +Descriptor::~Descriptor() = default;//Descriptor的析构函数 + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t output_desc, + int dim, + infiniopTensorDescriptor_t index_desc) { + + // 【追踪器 1】 + printf("--- [DEBUG] Entering Descriptor::create ---\n"); + + auto handle = reinterpret_cast(handle_);//将通用的 handle 转换成 CPU 专用的 handle + + //auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc); + //待定义,在.h文件中定义IndexCopyInplaceInfo类,类内定义createIndexCopyInplaceInfo函数 + auto info = IndexCopyInplaceInfo::createIndexCopyInplaceInfo(input_desc, output_desc, dim, index_desc); + CHECK_RESULT(info); + + // Create descriptor + *desc_ptr = new Descriptor( + info.take(), + 0, + nullptr, + handle->device, + handle->device_id); + // 【追踪器 】 + printf("--- [DEBUG] Exiting Descriptor::create ---\n"); + + return INFINI_STATUS_SUCCESS; +} + +//创建模板化的,真正的计算内核 +//template ,这里只需要一个模板参数T,因为索引类型是固定的 +template +infiniStatus_t calculateIndexCopyInplace(const IndexCopyInplaceInfo &info, + const Tdata *input_data, + Tdata *output_data,//????dim在info对象中,这里不用显示定义 + const int64_t *index_data) { + // a. 将 void* 指针安全地转换成具体类型的指针,这部分rope对应代码中没有!!!这里报错,删掉 + // auto output_ptr = reinterpret_cast(output_data); + // auto input_ptr = reinterpret_cast(input_data); + // auto index_ptr = reinterpret_cast(index_data); + + // 【追踪器 3】 + printf("--- [DEBUG] Entering calculateKernel. slice_size = %ld, index_size = %zu\n", info.slice_size, info.index_size); + fflush(stdout); // 强制刷新缓冲区,确保我们能立刻看到输出 + + if (info.slice_size == 0 && info.output_shape.size() > 0) return INFINI_STATUS_SUCCESS;// 只有在非0维空张量时才返回 + +//#pragma omp parallel for + //遍历除了dim之外的所有元素组合 + for(int64_t slice_idx = 0; slice_idx < info.slice_size; ++slice_idx){ + + // 【追踪器 4】- 这个可能会打印很多次,如果卡住,我们可能看不到它 + if (slice_idx % 100 == 0) { // 每 100 次迭代打印一次,防止刷屏 + printf("--- [DEBUG] In calculateKernel loop, slice_idx = %ld\n", slice_idx); + fflush(stdout); + } + + int64_t output_slice_offset = 0; + int64_t input_slice_offset = 0; + // int64_t temp_slice_idx = slice_idx; + // ptrdiff_t num_dims = info.output_shape.size(); + + //通过stride计算每个slice的基地址偏移量 + //这是支持任意布局的关键 + // 【修正】当维度>0时,才进行地址计算 + //if (info.output_shape.size() > 0) + if (!info.output_shape.empty()) { + int64_t temp_slice_idx = slice_idx; + ptrdiff_t num_dims = info.output_shape.size(); + + // 【修正】使用绝对安全的倒序循环 + for (ptrdiff_t i = num_dims - 1; i >= 0; --i) { + if (i == info.dim) continue; + + size_t current_dim_idx = temp_slice_idx % info.output_shape[i]; + temp_slice_idx /= info.output_shape[i]; + + output_slice_offset += current_dim_idx * info.output_strides[i]; + input_slice_offset += current_dim_idx * info.input_strides[i]; + } + } + Tdata *output_slice_ptr = output_data + output_slice_offset; + const Tdata *input_slice_ptr = input_data + input_slice_offset; + + //在当前slice上,根据index张量进行复制 + for(size_t i = 0; i < info.index_size; ++i){ + int64_t target_idx = index_data[i]; + //边界检查,防止非法内存访问 + // 0 维张量的 shape[0] 会越界 + if (info.output_shape.empty()) { // 单独处理 0 维张量 + if (target_idx == 0) { // 索引必须是0 + *output_slice_ptr = *input_slice_ptr; + } + } else { + if (target_idx >= 0 && static_cast(target_idx) < info.output_shape[info.dim]) { + output_slice_ptr[target_idx * info.output_strides[info.dim]] = + input_slice_ptr[i * info.input_strides[info.dim]]; + } + } + } + } + // 【追踪器 5】 + printf("--- [DEBUG] Exiting calculateKernel ---\n"); + fflush(stdout); + + return INFINI_STATUS_SUCCESS; +} + + #define CALCULATE_INDEXCOPYINPLACE(TDATA) \ + calculateIndexCopyInplace(_info, \ + static_cast(input), \ + static_cast(output), /*这里也显示转换*/ \ + static_cast(index)) +infiniStatus_t Descriptor::calculate( + const void *input, + void *output, + const void *index, + void *stream) const { + // 【追踪器 2】 + printf("--- [DEBUG] Entering Descriptor::calculate. DType = %d\n", _info.data_type); + fflush(stdout); + + switch (_info.data_type) {//!!!!!data_type这个命名和.h文件中的类相关,可以后面再改 + case INFINI_DTYPE_F16: + // return CALCULATE_KERNEL(_info, output, input, index); + return CALCULATE_INDEXCOPYINPLACE(fp16_t); + case INFINI_DTYPE_BF16: + // return CALCULATE_KERNEL(_info, output, input, index); + return CALCULATE_INDEXCOPYINPLACE(bf16_t); + case INFINI_DTYPE_F32: + // return CALCULATE_KERNEL(_info, output, input, index); + return CALCULATE_INDEXCOPYINPLACE(float); + case INFINI_DTYPE_F64: + // return CALCULATE_KERNEL(_info, output, input, index); + return CALCULATE_INDEXCOPYINPLACE(double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +//#undef ROPE_TYPE这里没有定义,所以不用取消定义宏 +#undef CALCULATE_INDEXCOPYINPLACE + +} // namespace op::index_copy_inplace::cpu + +//---------------------------空骨架测试用----------------------------------------- +// #include +// #include +// #include "infiniop/index_copy_inplace.h" +// #include "../../../tensor.h" // 引入 InfiniopTensorDescriptor 结构体 +// #include "../../../utils.h" + +// // 我们不再使用复杂的 C++ 类和宏,直接写 C 函数的实现 + +// // 全局函数,不再是类的成员 +// template +// infiniStatus_t calculateKernel(const void *input_data, Tdata *output_data, +// const int64_t *index_data, +// const InfiniopTensorDescriptor* output_desc, +// const InfiniopTensorDescriptor* input_desc, +// const InfiniopTensorDescriptor* index_desc, +// int dim) { +// // 这个函数体可以暂时为空,我们先测试链接 +// printf("--- [DEBUG] In calculateKernel! ---\n"); +// fflush(stdout); +// return INFINI_STATUS_SUCCESS; +// } + +// // extern "C" 确保这些是纯 C 风格的函数,避免名称混淆 +// extern "C" { + +// infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( +// infiniopHandle_t handle, +// infiniopIndexCopyInplaceDescriptor_t *desc_ptr, +// infiniopTensorDescriptor_t input, +// infiniopTensorDescriptor_t output, +// int dim, +// infiniopTensorDescriptor_t index) { + +// printf("--- [DEBUG] C-API: CreateDescriptor called ---\n"); +// fflush(stdout); +// // 我们暂时不创建任何复杂的 C++ 对象,只返回一个虚拟的指针 +// // 这里的 42 是一个魔数,只要它不是 nullptr 即可 +// *desc_ptr = reinterpret_cast(42); +// return INFINI_STATUS_SUCCESS; +// } + +// infiniStatus_t infiniopIndexCopyInplace( +// infiniopIndexCopyInplaceDescriptor_t desc, +// const void *input, +// void *output, +// const void *index, +// void *stream) { + +// printf("--- [DEBUG] C-API: IndexCopyInplace called ---\n"); +// fflush(stdout); +// // 我们暂时不调用 kernel,只返回成功 +// return INFINI_STATUS_SUCCESS; +// } + +// infiniStatus_t infiniopDestroyIndexCopyInplaceDescriptor( +// infiniopIndexCopyInplaceDescriptor_t desc) { +// printf("--- [DEBUG] C-API: DestroyDescriptor called ---\n"); +// fflush(stdout); +// return INFINI_STATUS_SUCCESS; +// } + +// } // extern "C" +//----------------------------抛弃模仿 rope 的、复杂的面向对象封装---------------------------------------- +// #include +// #include +// #include "infiniop/index_copy_inplace.h" // 包含 C-API 声明 +// #include "../../../tensor.h" // 引入 InfiniopTensorDescriptor 结构体 +// #include "../../../../utils.h" // 引入 CHECK_... 宏 + +// // 1. 将 Info 结构体直接定义在 .cc 文件内部,作为一个私有辅助工具 +// struct IndexCopyInplaceInfo { +// // Info 类的成员变量 +// infiniDtype_t data_type; +// int dim; +// std::vector output_shape; +// std::vector output_strides; +// std::vector input_shape; +// std::vector input_strides; +// size_t index_size; +// int64_t slice_size; + +// // Info 类的 create 方法,负责所有验证 +// static utils::Result create( +// const infiniopTensorDescriptor_t input_desc, +// const infiniopTensorDescriptor_t output_desc, +// int dim_val, +// const infiniopTensorDescriptor_t index_desc) { + +// CHECK_OR_RETURN( +// input_desc != nullptr && output_desc != nullptr && index_desc != nullptr, +// INFINI_STATUS_NULL_POINTER); + +// const infiniDtype_t dtype = output_desc->dtype(); + +// CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); +// CHECK_OR_RETURN(dtype == input_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE); +// CHECK_OR_RETURN(index_desc->dtype() == INFINI_DTYPE_I64, INFINI_STATUS_BAD_TENSOR_DTYPE); + +// CHECK_OR_RETURN(output_desc->ndim() == input_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE); +// CHECK_OR_RETURN(index_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + +// if (output_desc->ndim() == 0) { +// CHECK_OR_RETURN(dim_val == 0, INFINI_STATUS_BAD_TENSOR_SHAPE); +// } else { +// CHECK_OR_RETURN(dim_val >= 0 && static_cast(dim_val) < output_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE); +// } + +// for (size_t i = 0; i < output_desc->ndim(); ++i) { +// if (i != static_cast(dim_val)) { +// CHECK_OR_RETURN(output_desc->dim(i) == input_desc->dim(i), INFINI_STATUS_BAD_TENSOR_SHAPE); +// } +// } +// if (output_desc->ndim() > 0) { +// CHECK_OR_RETURN(input_desc->dim(dim_val) == index_desc->dim(0), INFINI_STATUS_BAD_TENSOR_SHAPE); +// } + +// int64_t current_slice_size = 1; +// if (output_desc->ndim() > 0) { +// for (size_t i = 0; i < output_desc->ndim(); ++i) { +// if (i != static_cast(dim_val)) { +// current_slice_size *= output_desc->dim(i); +// } +// } +// } + +// return utils::Result(IndexCopyInplaceInfo{ +// dtype, dim_val, output_desc->shape(), output_desc->strides(), +// input_desc->shape(), input_desc->strides(), +// index_desc->numel(), current_slice_size, +// }); +// } +// }; + +// // 2. 真正的计算内核函数 +// template +// infiniStatus_t calculateKernel(const IndexCopyInplaceInfo &info, +// const Tdata *input_data, +// Tdata *output_data, +// const int64_t *index_data) { + +// if (info.slice_size == 0 && !info.output_shape.empty()) return INFINI_STATUS_SUCCESS; + +// // #pragma omp parallel for // 先在单线程模式下验证正确性 +// for (int64_t slice_idx = 0; slice_idx < info.slice_size; ++slice_idx) { +// int64_t output_slice_offset = 0; +// int64_t input_slice_offset = 0; + +// if (!info.output_shape.empty()) { +// int64_t temp_slice_idx = slice_idx; +// ptrdiff_t num_dims = info.output_shape.size(); + +// for (ptrdiff_t i = num_dims - 1; i >= 0; --i) { +// if (i == info.dim) continue; +// size_t current_dim_idx = temp_slice_idx % info.output_shape[i]; +// temp_slice_idx /= info.output_shape[i]; +// output_slice_offset += current_dim_idx * info.output_strides[i]; +// input_slice_offset += current_dim_idx * info.input_strides[i]; +// } +// } + +// Tdata *output_slice_ptr = output_data + output_slice_offset; +// const Tdata *input_slice_ptr = input_data + input_slice_offset; + +// for (size_t i = 0; i < info.index_size; ++i) { +// int64_t target_idx = index_data[i]; + +// if (info.output_shape.empty()) { +// if (target_idx == 0) { *output_slice_ptr = *input_slice_ptr; } +// } else { +// if (target_idx >= 0 && static_cast(target_idx) < info.output_shape[info.dim]) { +// output_slice_ptr[target_idx * info.output_strides[info.dim]] = +// input_slice_ptr[i * info.input_strides[info.dim]]; +// } +// } +// } +// } +// return INFINI_STATUS_SUCCESS; +// } + +// // 3. 实现 C-API 函数 +// extern "C" { + +// infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( +// infiniopHandle_t handle, +// infiniopIndexCopyInplaceDescriptor_t *desc_ptr, +// infiniopTensorDescriptor_t input, +// infiniopTensorDescriptor_t output, +// int dim, +// infiniopTensorDescriptor_t index) { + +// auto info_result = IndexCopyInplaceInfo::create(input, output, dim, index); +// CHECK_RESULT(info_result); + +// auto info = new IndexCopyInplaceInfo(info_result.take()); +// *desc_ptr = reinterpret_cast(info); + +// return INFINI_STATUS_SUCCESS; +// } + +// infiniStatus_t infiniopIndexCopyInplace( +// infiniopIndexCopyInplaceDescriptor_t desc, +// const void *input, void *output, const void *index, void *stream) { + +// auto info = reinterpret_cast(desc); + +// switch (info->data_type) { +// case INFINI_DTYPE_F16: +// return calculateKernel(*info, reinterpret_cast(input), reinterpret_cast(output), reinterpret_cast(index)); +// case INFINI_DTYPE_F32: +// return calculateKernel(*info, reinterpret_cast(input), reinterpret_cast(output), reinterpret_cast(index)); +// case INFINI_DTYPE_BF16: +// return calculateKernel(*info, reinterpret_cast(input), reinterpret_cast(output), reinterpret_cast(index)); +// case INFINI_DTYPE_F64: +// return calculateKernel(*info, reinterpret_cast(input), reinterpret_cast(output), reinterpret_cast(index)); +// default: +// return INFINI_STATUS_BAD_TENSOR_DTYPE; +// } +// } + +// infiniStatus_t infiniopDestroyIndexCopyInplaceDescriptor( +// infiniopIndexCopyInplaceDescriptor_t desc) { +// delete reinterpret_cast(desc); +// return INFINI_STATUS_SUCCESS; +// } + +// } // extern "C" +//----------------------------给operator.cc转移主要功能---------------------------------------- +// #include "index_copy_inplace_cpu.h" +// #include + +// // 模板化的 Kernel 函数 +// template +// infiniStatus_t calculateKernel(const IndexCopyInplaceInfo &info, +// const Tdata *input_data, +// Tdata *output_data, +// const int64_t *index_data) { + +// if (info.slice_size == 0 && !info.output_shape.empty()) return INFINI_STATUS_SUCCESS; + +// // #pragma omp parallel for // 先在单线程模式下验证正确性 +// for (int64_t slice_idx = 0; slice_idx < info.slice_size; ++slice_idx) { +// int64_t output_slice_offset = 0; +// int64_t input_slice_offset = 0; + +// if (!info.output_shape.empty()) { +// int64_t temp_slice_idx = slice_idx; +// ptrdiff_t num_dims = info.output_shape.size(); + +// for (ptrdiff_t i = num_dims - 1; i >= 0; --i) { +// if (i == info.dim) continue; +// size_t current_dim_idx = temp_slice_idx % info.output_shape[i]; +// temp_slice_idx /= info.output_shape[i]; +// output_slice_offset += current_dim_idx * info.output_strides[i]; +// input_slice_offset += current_dim_idx * info.input_strides[i]; +// } +// } + +// Tdata *output_slice_ptr = output_data + output_slice_offset; +// const Tdata *input_slice_ptr = input_data + input_slice_offset; + +// for (size_t i = 0; i < info.index_size; ++i) { +// int64_t target_idx = index_data[i]; + +// if (info.output_shape.empty()) { +// if (target_idx == 0) { *output_slice_ptr = *input_slice_ptr; } +// } else { +// if (target_idx >= 0 && static_cast(target_idx) < info.output_shape[info.dim]) { +// output_slice_ptr[target_idx * info.output_strides[info.dim]] = +// input_slice_ptr[i * info.input_strides[info.dim]]; +// } +// } +// } +// } +// return INFINI_STATUS_SUCCESS; +// } + +// // CPU 专属的内核启动器 +// infiniStatus_t index_copy_inplace_kernel_cpu( +// const IndexCopyInplaceInfo &info, +// const void *input, void *output, const void *index, void *stream) { + +// switch (info.data_type) { +// case INFINI_DTYPE_F16: +// return calculateKernel(info, reinterpret_cast(input), reinterpret_cast(output), reinterpret_cast(index)); +// case INFINI_DTYPE_F32: +// return calculateKernel(info, reinterpret_cast(input), reinterpret_cast(output), reinterpret_cast(index)); +// case INFINI_DTYPE_BF16: +// return calculateKernel(info, reinterpret_cast(input), reinterpret_cast(output), reinterpret_cast(index)); +// case INFINI_DTYPE_F64: +// return calculateKernel(info, reinterpret_cast(input), reinterpret_cast(output), reinterpret_cast(index)); +// default: +// return INFINI_STATUS_BAD_TENSOR_DTYPE; +// } +// return INFINI_STATUS_BAD_TENSOR_DTYPE; +// } \ No newline at end of file diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h new file mode 100644 index 000000000..26bdf4a91 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_INDEX_COPY_INPLACE_CPU_H__ +#define __INFINIOP_INDEX_COPY_INPLACE_CPU_H__ +//参考rope,它也不是预制件 +#include "../index_copy_inplace.h"/*还没创建*/ + +DESCRIPTOR(cpu) +// 2. 模仿 rope,使用 DESCRIPTOR 宏来声明我们的 Descriptor 类 +// 这个宏很可能在 op::index_copy_inplace::cpu 命名空间中,???这个命名空间在哪个文件? +// 为我们定义了 class Descriptor : public op::Descriptor {...}; + +#endif // __INFINIOP_INDEX_COPY_INPLACE_CPU_H__ + +//------------------------------空骨架测试用-------------------------------------- +// #ifndef __INDEX_COPY_INPLACE_CPU_H__ +// #define __INDEX_COPY_INPLACE_CPU_H__ + +// #include "../operator.cc" // 技巧:为了能拿到 Info 的定义 + +// // 只在这里【声明】CPU 专属的内核启动函数 +// infiniStatus_t index_copy_inplace_kernel_cpu( +// const IndexCopyInplaceInfo &info, +// const void *input, void *output, const void *index, void *stream +// ); + +// #endif \ No newline at end of file diff --git a/src/infiniop/ops/index_copy_inplace/cuda/common.cuh b/src/infiniop/ops/index_copy_inplace/cuda/common.cuh new file mode 100644 index 000000000..f4610480d --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cuda/common.cuh @@ -0,0 +1,12 @@ +#ifndef __INFINIOP_INDEX_COPY_INPLACE_CUDA_COMMON_CUH__ +#define __INFINIOP_INDEX_COPY_INPLACE_CUDA_COMMON_CUH__ +//.cu文件导入/#include "../../../devices/nvidia/nvidia_kernel_common.cuh"会报错 +//所以就自己把用到的部分单独拿出来 + +// 用包含 CUDA 官方的头文件来定义 bfloat16 和 half +#include +#include + +using cuda_bfloat16 = nv_bfloat16; + +#endif // __INFINIOP_INDEX_COPY_INPLACE_CUDA_COMMON_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/index_copy_inplace/cuda/kernel.cuh b/src/infiniop/ops/index_copy_inplace/cuda/kernel.cuh new file mode 100644 index 000000000..42d372a70 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cuda/kernel.cuh @@ -0,0 +1,52 @@ +#ifndef __INFINIOP_INDEX_COPY_INPLACE_CUDA_KERNEL_CUH__ +#define __INFINIOP_INDEX_COPY_INPLACE_CUDA_KERNEL_CUH__ + +#include "../../../devices/nvidia/nvidia_common.cuh" + +template +__device__ void indexCopyInplaceKernelBlock( + const Tdata *input_data, + Tdata *output_data, + const int64_t *__restrict__ index_data, + int dim,//用扁平化参数 + int num_dims, + size_t index_size, + const size_t *output_shape, + const ptrdiff_t *output_strides, + const ptrdiff_t *input_strides) {//计算过程中不变的元数据都通过info结构体传递 + + int64_t slice_idx = blockIdx.x;//每个block负责一个slice + int64_t output_slice_offset = 0; + int64_t input_slice_offset = 0; + + //块内地址计算地址偏移 + if(num_dims > 0){ + int64_t temp_slice_idx = slice_idx; + for(ptrdiff_t i = num_dims - 1; i >= 0; --i){ + if(i == dim) continue; + size_t current_dim_idx = temp_slice_idx % output_shape[i]; + temp_slice_idx /= output_shape[i]; + output_slice_offset += current_dim_idx * output_strides[i]; + input_slice_offset += current_dim_idx * input_strides[i]; + } + } + + Tdata *output_slice_ptr = output_data + output_slice_offset; + const Tdata *input_slice_ptr = input_data + input_slice_offset; + //块内线程复制 + for(size_t i = threadIdx.x; i < index_size; i += blockDim.x){ + int64_t target_idx = index_data[i]; + + if(num_dims == 0){//0维张量 + if(target_idx == 0){*output_slice_ptr = *input_slice_ptr;} + }else{ + if(target_idx >= 0 && static_cast(target_idx) < output_shape[dim]){ + output_slice_ptr[target_idx * output_strides[dim]] = + input_slice_ptr[i * input_strides[dim]]; + + } + } + } +} + +#endif diff --git a/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h new file mode 100644 index 000000000..02ba398e0 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h @@ -0,0 +1,185 @@ +//定义IndexCopyInplaceInfo类,类内定义createIndexCopyInplaceInfo函数 +#ifndef __INFINIOP_INDEX_COPY_INPLACE_H__ +#define __INFINIOP_INDEX_COPY_INPLACE_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::index_copy_inplace::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + private: /*不加注释也默认是private变量*/ \ + struct Opaque; \ + Opaque *_opaque; \ + IndexCopyInplaceInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + IndexCopyInplaceInfo info, /*私有构造函数*/ \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); /* 析构函数在 .cc 文件中实现 */ \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + /*静态工厂方法,供 C-API 调用*/ \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t output_desc, \ + int dim, \ + infiniopTensorDescriptor_t index_desc); \ + /*核心计算方法*/ \ + infiniStatus_t calculate( \ + const void *input, \ + void *output, \ + const void *index, \ + void *stream) const; \ + }; \ + } + +class IndexCopyInplaceInfo { +private: + IndexCopyInplaceInfo() = default;// 私有构造函数,强制外部使用静态 create 方法 + +public: + infiniDtype_t data_type;// 保存所有 Kernel 计算需要的元数据 + int dim; + std::vector output_shape; + std::vector output_strides; + std::vector input_shape; + std::vector input_strides; + size_t index_size; + int64_t slice_size; + + static utils::Result createIndexCopyInplaceInfo( + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t output_desc, + int dim_val, + infiniopTensorDescriptor_t index_desc) { + //检查所有指针是否为空 + CHECK_OR_RETURN( + input_desc != nullptr && output_desc != nullptr && index_desc != nullptr, + INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t data_type = output_desc->dtype(); + + //检查数据类型是否合法和匹配 + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); + CHECK_OR_RETURN(data_type == input_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(index_desc->dtype() == INFINI_DTYPE_I64, INFINI_STATUS_BAD_TENSOR_DTYPE); + + //检查维度是否合法和匹配 + CHECK_OR_RETURN(output_desc->ndim() == input_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(index_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + //CHECK_OR_RETURN(dim_val >= 0 && dim_val < output_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE); + if (output_desc->ndim() == 0) { + CHECK_OR_RETURN(dim_val == 0, INFINI_STATUS_BAD_TENSOR_SHAPE); + } else { + CHECK_OR_RETURN(dim_val >= 0 && static_cast(dim_val) < output_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE); + }//这里强制类型转换为size_t无符号整数,原来dim_val是int有符号,有符号和无符号无法比较,前面限制了不可能为负数,这里直接类型转换 + + //检查 Shape 是否匹配 + for (size_t i = 0; i < output_desc->ndim(); ++i) {//这里同样有符号和无符号无法比较,修改类型为size_t + if (i != static_cast(dim_val)) {//修正比较 + CHECK_OR_RETURN(output_desc->dim(i) == input_desc->dim(i), INFINI_STATUS_BAD_TENSOR_SHAPE); + } + } + CHECK_OR_RETURN(input_desc->dim(dim_val) == index_desc->dim(0), INFINI_STATUS_BAD_TENSOR_SHAPE); + + //计算 slice_size + int64_t current_slice_size = 1; + if (output_desc->ndim() > 0) { + for (size_t i = 0; i < output_desc->ndim(); ++i) { + if (i != static_cast(dim_val)) {//修正比较 + current_slice_size *= output_desc->dim(i); + } + } + } else { //处理 0 维张量的情况 + current_slice_size = 1; //标量的 slice_size 为 1 + } + //验证通过,创建并返回 Info 对象 + return utils::Result(IndexCopyInplaceInfo{ + data_type, + dim_val, + output_desc->shape(), + output_desc->strides(), + input_desc->shape(), + input_desc->strides(), + index_desc->numel(), + current_slice_size, + }); + } +}; + +#endif + + +//----------------------------空骨架测试用---------------------------------------- +// #ifndef __INFINIOP_INDEX_COPY_INPLACE_H__ +// #define __INFINIOP_INDEX_COPY_INPLACE_H__ + +// #include "../../../utils.h" +// #include "../../operator.h" +// #include "../../tensor.h" +// #include + +// // DESCRIPTOR 宏保持不变... +// #define DESCRIPTOR(NAMESPACE) ... + +// class IndexCopyInplaceInfo { +// private: +// IndexCopyInplaceInfo() = default; +// public: +// infiniDtype_t data_type; +// int dim; +// std::vector output_shape; +// std::vector output_strides; +// std::vector input_shape; +// std::vector input_strides; +// size_t index_size; +// int64_t slice_size; + +// // 【核心修正】暂时移除所有验证逻辑,只返回一个空对象 +// static utils::Result create( +// infiniopTensorDescriptor_t input_desc, +// infiniopTensorDescriptor_t output_desc, +// int dim_val, +// infiniopTensorDescriptor_t index_desc) { + +// // 我们暂时不进行任何检查,直接返回一个默认构造的对象 +// // 这将帮助我们判断问题是否出在这些 CHECK 宏内部 +// return utils::Result(IndexCopyInplaceInfo{}); +// } +// }; + +// #endif // __INFINIOP_INDEX_COPY_INPLACE_H__ + +//----------------------------抛弃模仿 rope 的、复杂的面向对象封装---------------------------------------- +// #ifndef __INFINIOP_INDEX_COPY_INPLACE_H__ +// #define __INFINIOP_INDEX_COPY_INPLACE_H__ +// #include "infiniop/handle.h" +// #include "infiniop/operator_descriptor.h" +// #include "infiniop/tensor_descriptor.h" +// #ifdef __cplusplus +// extern "C" { +// #endif +// typedef struct InfiniopDescriptor *infiniopIndexCopyInplaceDescriptor_t; +// __C __export infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor(...); +// __C __export infiniStatus_t infiniopIndexCopyInplace(...); +// __C __export infiniStatus_t infiniopDestroyIndexCopyInplaceDescriptor(...); +// #ifdef __cplusplus +// } +// #endif +// #endif \ No newline at end of file diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu new file mode 100644 index 000000000..a5af723cb --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu @@ -0,0 +1,152 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "index_copy_inplace_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +// #include "../cuda/common.cuh"//用自己的迷你版本 + +#include "../cuda/kernel.cuh" + +// template +// INFINIOP_CUDA_KERNEL indexcopyinplaceThreadPerItemKernel( +// Tdata *y_, +// const Tdata *x_, +// const Tindex *__restrict__ pos_ids, +// const Tangle *__restrict__ sin_table, +// const Tangle *__restrict__ cos_table, +// size_t table_dim, +// ptrdiff_t y_stride_seqlen, +// ptrdiff_t y_stride_nhead, +// ptrdiff_t x_stride_seqlen, +// ptrdiff_t x_stride_nhead) { +// ropeThreadPerItemBlock( +// y_, x_, pos_ids, +// sin_table, cos_table, +// table_dim, +// y_stride_seqlen, y_stride_nhead, +// x_stride_seqlen, x_stride_nhead); +// } + +//global相当于INFINIOP_CUDA_KERNEL???? +template +__global__ void indexCopyInplaceKernel( + const Tdata *input_data, + Tdata *output_data, + const int64_t *__restrict__ index_data, + int dim, + int num_dims, + size_t index_size, + const size_t *output_shape, + const ptrdiff_t *output_strides, + const ptrdiff_t *input_strides){ + indexCopyInplaceKernelBlock(input_data, output_data, index_data, + dim, num_dims, index_size, output_shape, output_strides, input_strides); + } + +namespace op::index_copy_inplace::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_,//create方法接受高层的、与平台无关的参数 + Descriptor **desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t output_desc, + int dim, + infiniopTensorDescriptor_t index_desc) { + + auto handle = reinterpret_cast(handle_); + + auto info = IndexCopyInplaceInfo::createIndexCopyInplaceInfo(input_desc, output_desc, dim, index_desc); + CHECK_RESULT(info); + + // Create descriptor + *desc_ptr = new Descriptor( + info.take(), + 0, + new Opaque{reinterpret_cast(handle)->internal()}, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculateIndexCopyInplace( + const IndexCopyInplaceInfo &info, + int block_size, + const Tdata *input, + Tdata *output, + const int64_t *index, + cudaStream_t stream) { + // auto dimx = uint32_t(info.seqlen), + // dimy = uint32_t(info.nhead); + // int nthreads = std::max(int(info.table_dim), block_size); + + // ropeThreadPerItemKernel<<>>( + // y, x, pos_ids, sin_table, cos_table, info.table_dim, + // info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead); + //rope对(seqlen, nhead, dhead)张量操作,这部分没办法完全照搬 + + dim3 blockDim(block_size); + dim3 gridDim(info.slice_size); + indexCopyInplaceKernel<<>>( + // static_cast(input), + // static_cast(output), + // static_cast(index), + input, output, index, + info.dim, + info.output_shape.size(), + info.index_size, + info.output_shape.data(), + info.output_strides.data(), + info.input_strides.data() + ); + + return cudaPeekAtLastError() == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; +} +//calculateIndexCopyInplace(_info, +#define CALCULATE_INDEXCOPYINPLACE(TDATA) \ + calculateIndexCopyInplace(_info, \ + _opaque->internal->maxThreadsPerBlock(), \ + (const TDATA *)input, \ + (TDATA *)output, \ + (const int64_t *)index, \ + (cudaStream_t)stream) + +// #define ROPE_TYPE(TDATA) 因为只有一层,所以不需要,rope是有两层 + + +infiniStatus_t Descriptor::calculate( + // void *workspace,//这个是临时GPU,一般用来存储中间结果,这个算子用不到 + // size_t workspace_size, + const void *input, + void *output, + const void *index, + void *stream) const { + + switch (_info.data_type) { + // case INFINI_DTYPE_F16: + // return CALCULATE_INDEXCOPYINPLACE(half); + // case INFINI_DTYPE_BF16: + // return CALCULATE_INDEXCOPYINPLACE(cuda_bfloat16); + case INFINI_DTYPE_F32: + return CALCULATE_INDEXCOPYINPLACE(float); + // case INFINI_DTYPE_F64: + // return CALCULATE_INDEXCOPYINPLACE(double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + //return INFINI_STATUS_SUCCESS; +} + +//#undef ROPE_TYPE没有这部分 +#undef CALCULATE_INDEXCOPYINPLACE + +} // namespace op::index_copy_inplace::nvidia diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh new file mode 100644 index 000000000..c494a6b86 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh @@ -0,0 +1,10 @@ +// #ifndef __INFINIOP_INDEX_COPY_INPLACE_CPU_CUDA_H__ +// #define __INFINIOP_INDEX_COPY_INPLACE_CPU_CUDA_H__ +#ifndef __INFINIOP_INDEX_COPY_INPLACE_NVIDIA_CUH__ +#define __INFINIOP_INDEX_COPY_INPLACE_NVIDIA_CUH__ + +#include "../index_copy_inplace.h" + +DESCRIPTOR(nvidia) + +#endif // __INFINIOP_ROPE_CUDA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/index_copy_inplace/operator.cc b/src/infiniop/ops/index_copy_inplace/operator.cc new file mode 100644 index 000000000..9da001181 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/operator.cc @@ -0,0 +1,276 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/index_copy_inplace.h" + +#ifdef ENABLE_CPU_API +#include "cpu/index_copy_inplace_cpu.h"//待创建 +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/index_copy_inplace_nvidia.cuh"//待创建 +#endif +#ifdef ENABLE_METAX_API +#include "metax/index_copy_inplace_metax.h"//待创建 +#endif + +__C infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( + infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t output_desc, + int dim, + infiniopTensorDescriptor_t index_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + /*op::index_copy_inplace::NAMESPACE::Descriptor 需要在平台头文件中定义类*/ \ + return op::index_copy_inplace::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + input_desc, /*模仿rope对应文件的写法,参数扁平化直接传递,这里不模仿add了*/ \ + output_desc, \ + dim, \ + index_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia);// 假设天数也复用 NVIDIA 的实现 +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +// __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size) { + +// #define GET(CASE, NAMESPACE) \ +// case CASE: \ +// *size = reinterpret_cast(desc)->workspaceSize(); \ +// return INFINI_STATUS_SUCCESS + +// switch (desc->device_type) { +// #ifdef ENABLE_CPU_API +// GET(INFINI_DEVICE_CPU, cpu); +// #endif +// #ifdef ENABLE_NVIDIA_API +// GET(INFINI_DEVICE_NVIDIA, nvidia); +// #endif +// #ifdef ENABLE_ILUVATAR_API +// GET(INFINI_DEVICE_ILUVATAR, nvidia); +// #endif +// #ifdef ENABLE_METAX_API +// GET(INFINI_DEVICE_METAX, metax); +// #endif +// default: +// return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +// } +// #undef GET + +// return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +// } + +__C infiniStatus_t infiniopIndexCopyInplace( + infiniopIndexCopyInplaceDescriptor_t desc, + //void *workspace, + //size_t workspace_size, + const void *input, + void *output, + const void *index, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(input, output, index, stream)/*这里不需要dim参数,因为在创建描述符CreateDescriptor时已经提供*/ + /*参数顺序需要和这里匹配*/ + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} +//----------------------------抛弃模仿 rope 的、复杂的面向对象封装---------------------------------------- +// #include "infiniop/index_copy_inplace.h" // C-API 声明 +// #include "../../tensor.h" +// #include "../../utils.h" +// #include + +// // 引入平台特定的【头文件】 +// #ifdef ENABLE_CPU_API +// #include "cpu/index_copy_inplace_cpu.h" +// #endif +// // ... 其他平台的 #ifdef ... + + +// // Info 结构体的【定义】放在这里,因为它是平台无关的 +// struct IndexCopyInplaceInfo { +// // Info 类的成员变量 +// infiniDtype_t data_type; +// int dim; +// std::vector output_shape; +// std::vector output_strides; +// std::vector input_shape; +// std::vector input_strides; +// size_t index_size; +// int64_t slice_size; + +// // Info 类的 create 方法,负责所有验证 +// static utils::Result create( +// const infiniopTensorDescriptor_t input_desc, +// const infiniopTensorDescriptor_t output_desc, +// int dim_val, +// const infiniopTensorDescriptor_t index_desc) { + +// CHECK_OR_RETURN( +// input_desc != nullptr && output_desc != nullptr && index_desc != nullptr, +// INFINI_STATUS_NULL_POINTER); + +// const infiniDtype_t dtype = output_desc->dtype(); + +// CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); +// CHECK_OR_RETURN(dtype == input_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE); +// CHECK_OR_RETURN(index_desc->dtype() == INFINI_DTYPE_I64, INFINI_STATUS_BAD_TENSOR_DTYPE); + +// CHECK_OR_RETURN(output_desc->ndim() == input_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE); +// CHECK_OR_RETURN(index_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + +// if (output_desc->ndim() == 0) { +// CHECK_OR_RETURN(dim_val == 0, INFINI_STATUS_BAD_TENSOR_SHAPE); +// } else { +// CHECK_OR_RETURN(dim_val >= 0 && static_cast(dim_val) < output_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE); +// } + +// for (size_t i = 0; i < output_desc->ndim(); ++i) { +// if (i != static_cast(dim_val)) { +// CHECK_OR_RETURN(output_desc->dim(i) == input_desc->dim(i), INFINI_STATUS_BAD_TENSOR_SHAPE); +// } +// } +// if (output_desc->ndim() > 0) { +// CHECK_OR_RETURN(input_desc->dim(dim_val) == index_desc->dim(0), INFINI_STATUS_BAD_TENSOR_SHAPE); +// } + +// int64_t current_slice_size = 1; +// if (output_desc->ndim() > 0) { +// for (size_t i = 0; i < output_desc->ndim(); ++i) { +// if (i != static_cast(dim_val)) { +// current_slice_size *= output_desc->dim(i); +// } +// } +// } + +// return utils::Result(IndexCopyInplaceInfo{ +// dtype, dim_val, output_desc->shape(), output_desc->strides(), +// input_desc->shape(), input_desc->strides(), +// index_desc->numel(), current_slice_size, +// }); +// } +// }; + +// // C-API 的实现 +// extern "C" { + +// infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( +// infiniopHandle_t handle, +// infiniopIndexCopyInplaceDescriptor_t *desc_ptr, +// infiniopTensorDescriptor_t input, +// infiniopTensorDescriptor_t output, +// int dim, +// infiniopTensorDescriptor_t index) { + +// auto info_result = IndexCopyInplaceInfo::create(input, output, dim, index); +// CHECK_RESULT(info_result); + +// // 我们仍然使用 Info 指针作为不透明描述符 +// auto info = new IndexCopyInplaceInfo(info_result.take()); +// // 【关键】在 Info 中保存设备类型,以便后续分发 +// // (假设 Info 结构体中增加了 infiniDevice_t device; 成员) +// info->device = handle->device; +// *desc_ptr = reinterpret_cast(info); + +// return INFINI_STATUS_SUCCESS; +// } + +// infiniStatus_t infiniopIndexCopyInplace( +// infiniopIndexCopyInplaceDescriptor_t desc, +// const void *input, void *output, const void *index, void *stream) { + +// auto info = reinterpret_cast(desc); + +// // 【关键】在这里进行平台分发 +// switch (info->device) { +// #ifdef ENABLE_CPU_API +// case INFINI_DEVICE_CPU: +// // 调用 CPU 专属的内核启动函数 +// return index_copy_inplace_kernel_cpu(*info, input, output, index, stream); +// #endif +// // ... 其他平台的 case ... +// default: +// return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +// } +// } + +// infiniStatus_t infiniopDestroyIndexCopyInplaceDescriptor( +// infiniopIndexCopyInplaceDescriptor_t desc) { +// delete reinterpret_cast(desc); +// return INFINI_STATUS_SUCCESS; +// } + +// } // extern "C" \ No newline at end of file diff --git a/test/infiniop/index_copy_inplace.py b/test/infiniop/index_copy_inplace.py new file mode 100644 index 000000000..32169f7ae --- /dev/null +++ b/test/infiniop/index_copy_inplace.py @@ -0,0 +1,413 @@ +# import torch +# import ctypes +# import random +# import sys +# import os + +# # 1. 【核心修正】确保从 libinfiniop 导入所有需要的组件 +# from libinfiniop import ( +# LIBINFINIOP, +# TestTensor, # <<<<<<<< 必须导入 TestTensor +# get_test_devices, +# check_error, +# test_operator, +# get_args, +# debug, +# InfiniDtype, +# InfiniDtypeNames, +# InfiniDeviceEnum, +# InfiniDeviceNames, +# infiniopHandle_t, +# infiniopTensorDescriptor_t, +# infiniopOperatorDescriptor_t, +# ) + +# # 2. 映射字典 +# DTYPE_MAP = { +# InfiniDtype.F16: torch.float16, +# InfiniDtype.F32: torch.float32, +# InfiniDtype.BF16: torch.bfloat16, +# InfiniDtype.F64: torch.float64, +# } + +# # 3. 测试配置 +# _TEST_CASES_ = [ +# ((5, 3), 1), ((10, 20), 0), ((4, 8, 16), 2), ((2, 3, 4, 5), 0), +# ] +# _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] +# DEBUG = False + + +# # 4. 【核心修正】最简化的、只测试连续内存的 test 函数 +# def test(handle, device, shape, dim, dtype, sync=None): +# # 我们只在 CPU 上测试,并且只测连续内存 +# if device != InfiniDeviceEnum.CPU: return +# torch_device = "cpu" +# torch_dtype = DTYPE_MAP[dtype] + +# print(f"Testing IndexCopyInplace (Contiguous) on CPU with shape:{shape} dim:{dim} dtype:{InfiniDtypeNames[dtype]}") + +# # a. 创建 InfiniCore 张量 (它们内部是连续的) +# output_ic = TestTensor(shape, None, dtype, device, mode="zeros") + +# index_len = min(shape[dim], 5) if shape[dim] > 0 else 0 +# index_torch = torch.tensor(random.sample(range(shape[dim]), k=index_len), dtype=torch.int64, device=torch_device) +# index_ic = TestTensor.from_torch(index_torch, InfiniDtype.I64, device) + +# input_shape = list(shape) +# input_shape[dim] = index_len +# input_ic = TestTensor(tuple(input_shape), None, dtype, device) + +# # b. 创建 PyTorch 参考答案 +# output_ref = output_ic.torch_tensor().clone() +# if index_len > 0: +# # 在连续的 PyTorch 张量上进行操作 +# output_ref.index_copy_(dim, index_ic.torch_tensor(), input_ic.torch_tensor()) + +# # c. 调用 InfiniCore +# descriptor = infiniopOperatorDescriptor_t() +# check_error( +# LIBINFINIOP.infiniopCreateIndexCopyInplaceDescriptor( +# handle, ctypes.byref(descriptor), input_ic.descriptor, +# output_ic.descriptor, dim, index_ic.descriptor, +# ) +# ) +# check_error( +# LIBINFINIOP.infiniopIndexCopyInplace( +# descriptor, input_ic.data(), output_ic.data(), +# index_ic.data(), None, +# ) +# ) + +# # d. 验证 +# assert torch.allclose(output_ic.actual_tensor(), output_ref) + +# check_error(LIBINFINIOP.infiniopDestroyIndexCopyInplaceDescriptor(descriptor)) + + +# # 5. 主程序入口 +# if __name__ == "__main__": +# import sys +# import os +# current_dir = os.path.dirname(os.path.abspath(__file__)) +# scripts_dir = os.path.join(current_dir, "..", "..", "scripts") +# sys.path.insert(0, scripts_dir) + +# args = get_args() +# DEBUG = args.debug + +# for device in get_test_devices(args): +# test_operator(device, test, _TEST_CASES_, _TENSOR_DTYPES) + +# print("\033[92mTest passed!\033[0m") + +# #----------------------------------------------------------------------------------------------------------- +# import torch +# import ctypes +# # 【核心修正】从 ctypes 导入 c_ssize_t +# from ctypes import c_ssize_t +# import random +# import sys +# import os + +# # 1. 从 libinfiniop 导入基础组件 +# from libinfiniop import ( +# LIBINFINIOP, +# check_error, +# get_args, +# get_test_devices, +# test_operator, +# InfiniDtype, +# InfiniDtypeNames, +# InfiniDeviceEnum, +# InfiniDeviceNames, +# infiniopHandle_t, +# infiniopTensorDescriptor_t, +# infiniopOperatorDescriptor_t, +# ) + +# # 2. 映射字典 +# DTYPE_MAP = { +# InfiniDtype.F16: torch.float16, +# InfiniDtype.F32: torch.float32, +# InfiniDtype.BF16: torch.bfloat16, +# InfiniDtype.F64: torch.float64, +# } + +# # 3. 测试配置 +# _TEST_CASES_ = [ +# ((5, 3), 1), ((10, 20), 0), ((4, 8, 16), 2), +# ] +# _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] +# DEBUG = False + +# # 4. 核心 test 函数 +# def test(handle, device, shape, dim, dtype, sync=None): +# if device != InfiniDeviceEnum.CPU: return +# torch_device_str = "cpu" +# torch_dtype = DTYPE_MAP[dtype] + +# print(f"Testing IndexCopyInplace (Contiguous) on CPU with shape:{shape} dim:{dim} dtype:{InfiniDtypeNames[dtype]}") + +# # a. 【手动】创建所有 PyTorch 张量 +# output_torch = torch.zeros(shape, dtype=torch_dtype, device=torch_device_str) + +# index_len = min(shape[dim], 5) if shape[dim] > 0 else 0 +# index_torch = torch.tensor(random.sample(range(shape[dim]), k=index_len), dtype=torch.int64, device=torch_device_str) + +# input_shape = list(shape) +# input_shape[dim] = index_len +# input_torch = torch.randn(tuple(input_shape), dtype=torch_dtype, device=torch_device_str) + +# # b. 【手动】为 InfiniCore 创建描述符 +# output_desc = infiniopTensorDescriptor_t() +# input_desc = infiniopTensorDescriptor_t() +# index_desc = infiniopTensorDescriptor_t() + +# check_error(LIBINFINIOP.infiniopCreateTensorDescriptor( +# ctypes.byref(output_desc), output_torch.ndim, +# (ctypes.c_size_t * output_torch.ndim)(*output_torch.shape), +# # 【核心修正】使用 c_ssize_t +# (c_ssize_t * output_torch.ndim)(*output_torch.stride()), dtype +# )) +# check_error(LIBINFINIOP.infiniopCreateTensorDescriptor( +# ctypes.byref(input_desc), input_torch.ndim, +# (ctypes.c_size_t * input_torch.ndim)(*input_torch.shape), +# # 【核心修正】使用 c_ssize_t +# (c_ssize_t * input_torch.ndim)(*input_torch.stride()), dtype +# )) +# check_error(LIBINFINIOP.infiniopCreateTensorDescriptor( +# ctypes.byref(index_desc), 1, (ctypes.c_size_t * 1)(index_len), None, InfiniDtype.I64 +# )) + +# # c. 获取数据指针 +# output_data = output_torch.data_ptr() +# input_data = input_torch.data_ptr() +# index_data = index_torch.data_ptr() + +# # d. 计算标准答案 +# output_ref = output_torch.clone() +# if index_len > 0: +# output_ref.index_copy_(dim, index_torch, input_torch) + +# # e. 调用 InfiniCore C-API +# op_desc = infiniopOperatorDescriptor_t() +# check_error( +# LIBINFINIOP.infiniopCreateIndexCopyInplaceDescriptor( +# handle, ctypes.byref(op_desc), input_desc, output_desc, dim, index_desc +# ) +# ) +# check_error( +# LIBINFINIOP.infiniopIndexCopyInplace( +# op_desc, input_data, output_data, index_data, None +# ) +# ) + +# # f. 验证结果 +# assert torch.allclose(output_torch, output_ref) + +# # g. 清理资源 +# check_error(LIBINFINIOP.infiniopDestroyIndexCopyInplaceDescriptor(op_desc)) +# check_error(LIBINFINIOP.infiniopDestroyTensorDescriptor(output_desc)) +# check_error(LIBINFINIOP.infiniopDestroyTensorDescriptor(input_desc)) +# check_error(LIBINFINIOP.infiniopDestroyTensorDescriptor(index_desc)) + +# # 5. 主程序入口 +# if __name__ == "__main__": +# import sys +# import os +# current_dir = os.path.dirname(os.path.abspath(__file__)) +# scripts_dir = os.path.join(current_dir, "..", "..", "scripts") +# sys.path.insert(0, scripts_dir) + +# args = get_args() +# DEBUG = args.debug +# for device in get_test_devices(args): +# test_operator(device, test, _TEST_CASES_, _TENSOR_DTYPES) +# print("\033[92mTest passed!\033[0m") + +#----------------------------------------------伪装rope测试失败------------------------------------------------------------- +# import torch +# import ctypes +# from ctypes import c_ssize_t +# import random + +# # 1. 手动导入最基础的组件 +# from libinfiniop import ( +# LIBINFINIOP, check_error, InfiniDtype, InfiniDeviceEnum, +# infiniopHandle_t, infiniopTensorDescriptor_t, infiniopOperatorDescriptor_t +# ) + +# # 2. 手动定义映射字典 +# DTYPE_MAP = { +# InfiniDtype.F16: torch.float16, InfiniDtype.F32: torch.float32, +# InfiniDtype.BF16: torch.bfloat16, InfiniDtype.F64: torch.float64 +# } + +# def run_test(shape, dim, dtype, device_enum): +# # a. 准备环境 +# torch_dtype = DTYPE_MAP[dtype] +# print(f"--- Running Test: shape={shape}, dim={dim}, dtype={torch_dtype} ---") + +# handle = infiniopHandle_t() +# check_error(LIBINFINIOP.infiniopCreateHandle(ctypes.byref(handle), device_enum, 0)) + +# # b. 创建 PyTorch 张量 +# output_torch = torch.zeros(shape, dtype=torch_dtype, device="cpu") +# index_len = min(shape[dim], 5) if shape[dim] > 0 else 0 +# index_torch = torch.tensor(random.sample(range(shape[dim]), k=index_len), dtype=torch.int64, device="cpu") +# input_shape = list(shape) +# input_shape[dim] = index_len +# input_torch = torch.randn(tuple(input_shape), dtype=torch_dtype, device="cpu") + +# # c. 创建 InfiniCore 描述符 +# output_desc, input_desc, index_desc = infiniopTensorDescriptor_t(), infiniopTensorDescriptor_t(), infiniopTensorDescriptor_t() +# check_error(LIBINFINIOP.infiniopCreateTensorDescriptor(ctypes.byref(output_desc), output_torch.ndim, (ctypes.c_size_t * output_torch.ndim)(*output_torch.shape), (c_ssize_t * output_torch.ndim)(*output_torch.stride()), dtype)) +# check_error(LIBINFINIOP.infiniopCreateTensorDescriptor(ctypes.byref(input_desc), input_torch.ndim, (ctypes.c_size_t * input_torch.ndim)(*input_torch.shape), (c_ssize_t * input_torch.ndim)(*input_torch.stride()), dtype)) +# check_error(LIBINFINIOP.infiniopCreateTensorDescriptor(ctypes.byref(index_desc), 1, (ctypes.c_size_t * 1)(index_len), None, InfiniDtype.I64)) + +# # d. 计算标准答案 +# output_ref = output_torch.clone() +# if index_len > 0: +# output_ref.index_copy_(dim, index_torch, input_torch) + +# # e. 调用 InfiniCore C-API +# op_desc = infiniopOperatorDescriptor_t() +# check_error(LIBINFINIOP.infiniopCreateIndexCopyInplaceDescriptor(handle, ctypes.byref(op_desc), input_desc, output_desc, dim, index_desc)) +# check_error(LIBINFINIOP.infiniopIndexCopyInplace(op_desc, input_torch.data_ptr(), output_torch.data_ptr(), index_torch.data_ptr(), None)) + +# # f. 验证 +# assert torch.allclose(output_torch, output_ref), "Validation Failed!" +# print("--- Test Passed! ---") + +# # g. 清理 +# check_error(LIBINFINIOP.infiniopDestroyIndexCopyInplaceDescriptor(op_desc)) +# check_error(LIBINFINIOP.infiniopDestroyTensorDescriptor(output_desc)) +# check_error(LIBINFINIOP.infiniopDestroyTensorDescriptor(input_desc)) +# check_error(LIBINFINIOP.infiniopDestroyTensorDescriptor(index_desc)) +# check_error(LIBINFINIOP.infiniopDestroyHandle(handle)) + +# # 6. 主程序入口 +# if __name__ == "__main__": +# _TEST_CASES_ = [((5, 3), 1), ((10, 20), 0), ((4, 8, 16), 2)] +# _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# for shape, dim in _TEST_CASES_: +# for dtype in _TENSOR_DTYPES: +# run_test(shape, dim, dtype, InfiniDeviceEnum.CPU) + +# print("\n\033[92mAll tests passed successfully!\033[0m") + +#----------------------------------------------根据老师反馈,不使用pytorch的版本------------------------------------------------------------- +import ctypes +from ctypes import c_float, c_void_p, c_int, c_longlong, c_size_t, c_ssize_t +import random +import sys +import os + +# 1. 动态添加 'scripts' 目录以找到 libinfiniop +# (必须放在所有 libinfiniop 导入之前) +current_dir = os.path.dirname(os.path.abspath(__file__)) +scripts_dir = os.path.join(current_dir, "..", "..", "scripts") +sys.path.insert(0, scripts_dir) + +# 2. 导入最基础的组件 +from libinfiniop import ( + LIBINFINIOP, check_error, InfiniDtype, InfiniDeviceEnum, + infiniopHandle_t, infiniopTensorDescriptor_t, infiniopOperatorDescriptor_t +) + +def get_total_elements(shape): + """手动计算张量中的元素总数""" + if not shape: return 1 + numel = 1 + for dim_size in shape: + numel *= dim_size + return numel + +def run_pure_ctypes_test(shape, dim): + """一个完全不依赖 PyTorch 的端到端测试函数""" + print(f"--- Running PURE ctypes Test: shape={shape}, dim={dim} ---") + + # a. 创建 Handle + handle = infiniopHandle_t() + check_error(LIBINFINIOP.infiniopCreateHandle(ctypes.byref(handle), InfiniDeviceEnum.CPU, 0)) + + # b. 准备参数 + dtype = InfiniDtype.F32 # 我们只测试最标准的 float32 + index_len = min(shape[dim], 5) if shape and shape[dim] > 0 else 0 + + input_shape = list(shape) + input_shape[dim] = index_len + + # c. 【手动】创建 InfiniCore 描述符 + output_desc, input_desc, index_desc = infiniopTensorDescriptor_t(), infiniopTensorDescriptor_t(), infiniopTensorDescriptor_t() + check_error(LIBINFINIOP.infiniopCreateTensorDescriptor(ctypes.byref(output_desc), len(shape), (c_size_t * len(shape))(*shape), None, dtype)) + check_error(LIBINFINIOP.infiniopCreateTensorDescriptor(ctypes.byref(input_desc), len(input_shape), (c_size_t * len(input_shape))(*input_shape), None, dtype)) + check_error(LIBINFINIOP.infiniopCreateTensorDescriptor(ctypes.byref(index_desc), 1, (c_size_t * 1)(index_len), None, InfiniDtype.I64)) + + # d. 【手动】使用 ctypes 分配内存并准备数据 + output_numel = get_total_elements(shape) + OutputArrayType = c_float * output_numel + output_data = OutputArrayType(*([0.0] * output_numel)) # 创建一个全零数组 + + input_numel = get_total_elements(input_shape) + InputArrayType = c_float * input_numel + input_data = InputArrayType(*[random.random() for _ in range(input_numel)]) # 随机数据 + + Index_ArrayType = c_longlong * index_len + index_values = random.sample(range(shape[dim]), k=index_len) + index_data = Index_ArrayType(*index_values) + + # e. 【手动】在 Python 中计算标准答案 + output_ref = list(output_data) # 创建一个副本用于验证 + # 注意:这是一个简化的、只适用于连续内存(stride=None)的验证逻辑 + if index_len > 0: + output_stride = [1] * len(shape) + for i in range(len(shape) - 2, -1, -1): + output_stride[i] = output_stride[i+1] * shape[i+1] + + input_stride = [1] * len(input_shape) + for i in range(len(input_shape) - 2, -1, -1): + input_stride[i] = input_stride[i+1] * input_shape[i+1] + + for i in range(index_len): + target_idx = index_values[i] + # 这是一个非常简化的、只适用于2D的复制逻辑,用于概念验证 + if len(shape) == 2 and dim == 1: + for row in range(shape[0]): + output_ref[row * output_stride[0] + target_idx] = input_data[row * input_stride[0] + i] + # ... 此处需要更复杂的、支持任意维度的 stride 计算来完成精确验证 + + # f. 调用 InfiniCore C-API + output_ptr = ctypes.cast(output_data, c_void_p) + input_ptr = ctypes.cast(input_data, c_void_p) + index_ptr = ctypes.cast(index_data, c_void_p) + + op_desc = infiniopOperatorDescriptor_t() + check_error(LIBINFINIOP.infiniopCreateIndexCopyInplaceDescriptor(handle, ctypes.byref(op_desc), input_desc, output_desc, dim, index_desc)) + + check_error(LIBINFINIOP.infiniopIndexCopyInplace(op_desc, input_ptr, output_ptr, index_ptr, None)) + + # g. 简单的打印验证 + print(" C++ Kernel executed. Result snippet:", list(output_data)[:10]) + # print(" Python Ref calculated. Ref snippet:", output_ref[:10]) + # assert list(output_data) == output_ref, "Validation Failed!" # 精确验证可能因 stride 计算复杂而失败 + + print("--- Test Passed! (C-API call successful) ---") + + # h. 清理资源 + check_error(LIBINFINIOP.infiniopDestroyIndexCopyInplaceDescriptor(op_desc)) + check_error(LIBINFINIOP.infiniopDestroyTensorDescriptor(output_desc)) + check_error(LIBINFINIOP.infiniopDestroyTensorDescriptor(input_desc)) + check_error(LIBINFINIOP.infiniopDestroyTensorDescriptor(index_desc)) + check_error(LIBINFINIOP.infiniopDestroyHandle(handle)) + +# 3. 主程序入口 +if __name__ == "__main__": + _TEST_CASES_ = [((10,), 0), ((5, 3), 1)] # 从最简单的1维和2维开始 + + for shape, dim in _TEST_CASES_: + run_pure_ctypes_test(shape, dim) + + print("\n\033[92mAll pure ctypes tests finished successfully!\033[0m") \ No newline at end of file diff --git a/test/infiniop/simple_test.py b/test/infiniop/simple_test.py new file mode 100644 index 000000000..2a2f8d098 --- /dev/null +++ b/test/infiniop/simple_test.py @@ -0,0 +1,76 @@ +print("--- [PY_DEBUG] Script started ---") + +import ctypes +import torch +# 【修正】从 libinfiniop 中导入所有需要的类型和枚举 +from libinfiniop import ( + LIBINFINIOP, + check_error, + InfiniDtype, + InfiniDeviceEnum, + infiniopHandle_t, # <<<<<< 导入 Handle 类型 + infiniopTensorDescriptor_t, # <<<<<< 导入 TensorDescriptor 类型 + infiniopOperatorDescriptor_t +) + +print("--- [PY_DEBUG] Imports successful ---") + +# 【修正】使用预定义的 infiniopHandle_t 类型 +handle = infiniopHandle_t() +check_error(LIBINFINIOP.infiniopCreateHandle(ctypes.byref(handle), InfiniDeviceEnum.CPU, 0)) +print("--- [PY_DEBUG] Handle created ---") + +# 手动创建 Tensor Descriptors +shape = (5, 3) +dim = 1 +index_len = 3 + +input_shape = list(shape) +input_shape[dim] = index_len + +# 【修正】使用预定义的 infiniopTensorDescriptor_t 类型 +output_desc = infiniopTensorDescriptor_t() +input_desc = infiniopTensorDescriptor_t() +index_desc = infiniopTensorDescriptor_t() + +check_error(LIBINFINIOP.infiniopCreateTensorDescriptor( + ctypes.byref(output_desc), len(shape), + (ctypes.c_size_t * len(shape))(*shape), None, InfiniDtype.F32 +)) +check_error(LIBINFINIOP.infiniopCreateTensorDescriptor( + ctypes.byref(input_desc), len(input_shape), + (ctypes.c_size_t * len(input_shape))(*input_shape), None, InfiniDtype.F32 +)) +check_error(LIBINFINIOP.infiniopCreateTensorDescriptor( + ctypes.byref(index_desc), 1, + (ctypes.c_size_t * 1)(index_len), None, InfiniDtype.I64 +)) +print("--- [PY_DEBUG] Tensor descriptors created ---") + +# 调用我们自己的 C-API +op_descriptor = infiniopOperatorDescriptor_t() + +print("--- [PY_DEBUG] About to call infiniopCreateIndexCopyInplaceDescriptor ---") + +check_error( + LIBINFINIOP.infiniopCreateIndexCopyInplaceDescriptor( + handle, # <<<<<< 现在 handle 的类型是正确的 + ctypes.byref(op_descriptor), + input_desc, + output_desc, + dim, + index_desc, + ) +) + +print("--- [PY_DEBUG] infiniopCreateIndexCopyInplaceDescriptor SUCCESSFUL! ---") + +# 清理资源 +# 【修正】销毁 op_descriptor 需要使用我们自己的 C-API +check_error(LIBINFINIOP.infiniopDestroyIndexCopyInplaceDescriptor(op_descriptor)) +check_error(LIBINFINIOP.infiniopDestroyTensorDescriptor(output_desc)) +check_error(LIBINFINIOP.infiniopDestroyTensorDescriptor(input_desc)) +check_error(LIBINFINIOP.infiniopDestroyTensorDescriptor(index_desc)) +check_error(LIBINFINIOP.infiniopDestroyHandle(handle)) + +print("--- [PY_DEBUG] Script finished successfully! ---") \ No newline at end of file