Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,812 changes: 1,812 additions & 0 deletions core_2026-01-08_17:47:07.860_mccx_2790503.mudmp

Large diffs are not rendered by default.

1,812 changes: 1,812 additions & 0 deletions core_2026-01-08_19:12:43.733_mccx_2855686.mudmp

Large diffs are not rendered by default.

1,812 changes: 1,812 additions & 0 deletions core_2026-01-08_19:50:35.933_mccx_2881178.mudmp

Large diffs are not rendered by default.

1,820 changes: 1,820 additions & 0 deletions core_2026-01-08_20:07:44.657_mccx_2918926.mudmp

Large diffs are not rendered by default.

1,820 changes: 1,820 additions & 0 deletions core_2026-01-08_20:32:23.317_mccx_2931498.mudmp

Large diffs are not rendered by default.

918 changes: 918 additions & 0 deletions core_2026-01-08_20:54:38.545_mccx_2954869.mudmp

Large diffs are not rendered by default.

911 changes: 911 additions & 0 deletions core_2026-01-08_21:04:28.932_mccx_2967468.mudmp

Large diffs are not rendered by default.

918 changes: 918 additions & 0 deletions core_2026-01-08_21:15:41.497_mccx_2995510.mudmp

Large diffs are not rendered by default.

918 changes: 918 additions & 0 deletions core_2026-01-08_21:18:18.666_mccx_2996759.mudmp

Large diffs are not rendered by default.

918 changes: 918 additions & 0 deletions core_2026-01-08_21:48:52.578_mccx_3036206.mudmp

Large diffs are not rendered by default.

Binary file added hardtanh_cuda_test
Binary file not shown.
101 changes: 101 additions & 0 deletions hardtanh_cuda_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#include <cuda_runtime.h>

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

#include "infiniop/handle.h"
#include "infiniop/ops/hardtanh.h"
#include "infiniop/tensor_descriptor.h"

#define CHECK_INFINI(op) \
do { \
infiniStatus_t status = (op); \
if (status != INFINI_STATUS_SUCCESS) { \
std::cerr << "Infiniop error at " << __FILE__ << ":" << __LINE__ << " -> " << status \
<< std::endl; \
return 1; \
} \
} while (0)

#define CHECK_CUDA(op) \
do { \
cudaError_t err = (op); \
if (err != cudaSuccess) { \
std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " -> " \
<< cudaGetErrorString(err) << std::endl; \
return 1; \
} \
} while (0)

int main() {
CHECK_CUDA(cudaSetDevice(0));

infiniopHandle_t handle;
CHECK_INFINI(infiniopCreateHandle(&handle));

const size_t ndim = 2;
size_t shape[ndim] = {13, 4};
infiniopTensorDescriptor_t input_desc;
infiniopTensorDescriptor_t output_desc;
CHECK_INFINI(infiniopCreateTensorDescriptor(&input_desc, ndim, shape, nullptr, INFINI_DTYPE_F32));
CHECK_INFINI(infiniopCreateTensorDescriptor(&output_desc, ndim, shape, nullptr, INFINI_DTYPE_F32));

size_t numel = shape[0] * shape[1];
std::vector<float> host_input(numel);
for (size_t i = 0; i < numel; ++i) {
host_input[i] = static_cast<float>(i) / 10.f - 2.f;
}

float *d_input = nullptr;
float *d_output = nullptr;
CHECK_CUDA(cudaMalloc(&d_input, numel * sizeof(float)));
CHECK_CUDA(cudaMalloc(&d_output, numel * sizeof(float)));
CHECK_CUDA(cudaMemcpy(d_input, host_input.data(), numel * sizeof(float), cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemset(d_output, 0, numel * sizeof(float)));

infiniopHardTanhDescriptor_t desc = nullptr;
CHECK_INFINI(infiniopCreateHardTanhDescriptor(
handle, &desc, output_desc, input_desc, -1.0f, 1.0f));

size_t workspace_size = 0;
CHECK_INFINI(infiniopGetHardTanhWorkspaceSize(desc, &workspace_size));

void *workspace = nullptr;
if (workspace_size > 0) {
CHECK_CUDA(cudaMalloc(&workspace, workspace_size));
}

std::cout << "Workspace bytes: " << workspace_size << std::endl;

infiniStatus_t status = infiniopHardTanh(
desc, workspace, workspace_size, d_output, d_input, nullptr);
if (status != INFINI_STATUS_SUCCESS) {
std::cerr << "infiniopHardTanh failed with status " << status << std::endl;
return 1;
}

CHECK_CUDA(cudaDeviceSynchronize());

std::vector<float> host_output(numel);
CHECK_CUDA(cudaMemcpy(host_output.data(), d_output, numel * sizeof(float), cudaMemcpyDeviceToHost));

float max_err = 0.f;
for (size_t i = 0; i < numel; ++i) {
float ref = std::max(-1.0f, std::min(1.0f, host_input[i]));
max_err = std::max(max_err, std::abs(ref - host_output[i]));
}

std::cout << "Max abs error: " << max_err << std::endl;

if (workspace) {
cudaFree(workspace);
}
CHECK_INFINI(infiniopDestroyHardTanhDescriptor(desc));
CHECK_INFINI(infiniopDestroyTensorDescriptor(input_desc));
CHECK_INFINI(infiniopDestroyTensorDescriptor(output_desc));
CHECK_INFINI(infiniopDestroyHandle(handle));
cudaFree(d_input);
cudaFree(d_output);

std::cout << "Done" << std::endl;
return 0;
}
4 changes: 4 additions & 0 deletions include/infinicore/ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "ops/add.hpp"
#include "ops/attention.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/cross_entropy.hpp"
#include "ops/matmul.hpp"
#include "ops/ones.hpp"
#include "ops/paged_attention.hpp"
Expand All @@ -12,5 +13,8 @@
#include "ops/rearrange.hpp"
#include "ops/rms_norm.hpp"
#include "ops/rope.hpp"
#include "ops/hardswish.hpp"
#include "ops/hardtanh.hpp"
#include "ops/avg_pool1d.hpp"
#include "ops/silu.hpp"
#include "ops/swiglu.hpp"
18 changes: 18 additions & 0 deletions include/infinicore/ops/avg_pool1d.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

class AvgPool1d {
public:
using schema = void (*)(Tensor, Tensor, size_t, size_t, size_t);
static void execute(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride = 0, size_t padding = 0);
void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride = 0, size_t padding = 0);

} // namespace infinicore::op
35 changes: 35 additions & 0 deletions include/infinicore/ops/cross_entropy.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

class CrossEntropy {
public:
// Schema 定义:函数指针类型
// CrossEntropy 需要接收三个 Tensor: Output (Loss), Input (Logits), Target (Labels)
using schema = void (*)(Tensor, Tensor, Tensor);

// 执行入口
static void execute(Tensor output, Tensor input, Tensor target);

// 分发器访问接口
static common::OpDispatcher<schema> &dispatcher();
};

// ==================================================================
// 对外 Functional API
// ==================================================================

// 1. Out-of-place 接口:
// 输入 Logits 和 Target,内部自动创建 Output Tensor 并返回
Tensor cross_entropy(Tensor input, Tensor target);

// 2. Explicit Output 接口 (类似于 In-place 风格):
// 用户显式提供 Output Tensor 用于存储结果
// 注意:虽然命名带有下划线 _,但通常 CrossEntropy 无法真正原地修改 input,
// 所以这里只是表示“写入指定的 output 内存”
void cross_entropy_(Tensor output, Tensor input, Tensor target);

} // namespace infinicore::op
19 changes: 19 additions & 0 deletions include/infinicore/ops/equal.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

class Equal {
public:
using schema = void (*)(Tensor, Tensor, Tensor);

static void execute(Tensor out, Tensor a, Tensor b);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor equal(Tensor a, Tensor b);
void equal_(Tensor out, Tensor a, Tensor b);

} // namespace infinicore::op
18 changes: 18 additions & 0 deletions include/infinicore/ops/hardswish.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

class Hardswish {
public:
using schema = void (*)(Tensor, Tensor);
static void execute(Tensor output, Tensor input);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor hardswish(Tensor input);
void hardswish_(Tensor output, Tensor input);

} // namespace infinicore::op
18 changes: 18 additions & 0 deletions include/infinicore/ops/hardtanh.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {

class HardTanh {
public:
using schema = void (*)(Tensor, Tensor, float, float);
static void execute(Tensor output, Tensor input, float min_val, float max_val);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor hardtanh(Tensor input, float min_val = -1.0f, float max_val = 1.0f);
void hardtanh_(Tensor output, Tensor input, float min_val = -1.0f, float max_val = 1.0f);

} // namespace infinicore::op
6 changes: 6 additions & 0 deletions include/infiniop.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,10 @@
#include "infiniop/ops/zeros.h"
#include "infiniop/tensor_descriptor.h"

#include "infiniop/ops/cross_entropy.h"
#include "infiniop/ops/hardswish.h"
#include "infiniop/ops/avg_pool1d.h"
#include "infiniop/ops/equal.h"
#include "infiniop/ops/hardtanh.h"

#endif // __INFINIOP_API_H__
38 changes: 38 additions & 0 deletions include/infiniop/ops/avg_pool1d.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#ifndef __INFINIOP_AVG_POOL1D_API_H__
#define __INFINIOP_AVG_POOL1D_API_H__

#include "../operator_descriptor.h"


typedef struct InfiniopDescriptor *infiniopAvgPool1dDescriptor_t;


__C __export infiniStatus_t infiniopCreateAvgPool1dDescriptor(
infiniopHandle_t handle,
infiniopAvgPool1dDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output,
infiniopTensorDescriptor_t input,
size_t kernel_size,
size_t stride,
size_t padding
);


__C __export infiniStatus_t infiniopGetAvgPool1dWorkspaceSize(
infiniopAvgPool1dDescriptor_t desc,
size_t *size);


__C __export infiniStatus_t infiniopAvgPool1d(
infiniopAvgPool1dDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *output,
const void *input,
void *stream);


__C __export infiniStatus_t infiniopDestroyAvgPool1dDescriptor(
infiniopAvgPool1dDescriptor_t desc);

#endif
42 changes: 42 additions & 0 deletions include/infiniop/ops/cross_entropy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#ifndef __INFINIOP_CROSS_ENTROPY_API_H__
#define __INFINIOP_CROSS_ENTROPY_API_H__

#include "../operator_descriptor.h"


typedef struct InfiniopDescriptor *infiniopCrossEntropyDescriptor_t;


__C __export infiniStatus_t infiniopCreateCrossEntropyDescriptor(
infiniopHandle_t handle,
infiniopCrossEntropyDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t target_desc
);



__C __export infiniStatus_t infiniopGetCrossEntropyWorkspaceSize(
infiniopCrossEntropyDescriptor_t desc,
size_t *size
);



__C __export infiniStatus_t infiniopCrossEntropy(
infiniopCrossEntropyDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *target,
void *stream
);


__C __export infiniStatus_t infiniopDestroyCrossEntropyDescriptor(
infiniopCrossEntropyDescriptor_t desc
);

#endif
37 changes: 37 additions & 0 deletions include/infiniop/ops/equal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#ifndef __INFINIOP_EQUAL_API_H__
#define __INFINIOP_EQUAL_API_H__

#include "../operator_descriptor.h"


typedef struct InfiniopDescriptor *infiniopEqualDescriptor_t;



__C __export infiniStatus_t infiniopCreateEqualDescriptor(
infiniopHandle_t handle,
infiniopEqualDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c,
infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b);


__C __export infiniStatus_t infiniopGetEqualWorkspaceSize(
infiniopEqualDescriptor_t desc,
size_t *size);


__C __export infiniStatus_t infiniopEqual(
infiniopEqualDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream);


__C __export infiniStatus_t infiniopDestroyEqualDescriptor(
infiniopEqualDescriptor_t desc);

#endif
Loading