Skip to content

Commit 7c84868

Browse files
committed
[T1-1-1]: Cast operator with cpu nvidia metax iluvatar and test
1 parent f38ea0d commit 7c84868

File tree

13 files changed

+1278
-0
lines changed

13 files changed

+1278
-0
lines changed

include/infiniop/ops/cast.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/*
 * Public C API of the Cast operator: converts the elements of an input tensor
 * to the element type of an output tensor.
 *
 * Usage, as exercised by the accompanying test: create a descriptor, query the
 * workspace size, allocate the workspace, call infiniopCast, then destroy the
 * descriptor.
 */
#ifndef __INFINIOP_CAST_API_H__
2+
#define __INFINIOP_CAST_API_H__
3+
4+
#include "../operator_descriptor.h"
5+
6+
/* Handle to a Cast operator descriptor (a pointer to InfiniopDescriptor). */
typedef struct InfiniopDescriptor *infiniopCastDescriptor_t;
7+
8+
/*
 * Creates a Cast descriptor for the given output/input tensor descriptors and
 * stores it in *desc_ptr. Note the argument order: output first, then input.
 */
__C __export infiniStatus_t infiniopCreateCastDescriptor(infiniopHandle_t handle,
9+
infiniopCastDescriptor_t *desc_ptr,
10+
infiniopTensorDescriptor_t output,
11+
infiniopTensorDescriptor_t input);
12+
13+
/* Writes the workspace size (in bytes, presumably -- TODO confirm) needed by infiniopCast to *size. */
__C __export infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size);
14+
15+
/*
 * Executes the cast described by desc, reading from input and writing to output.
 * NOTE(review): the CPU backend in this commit returns INFINI_STATUS_BAD_PARAM
 * when output and input are the same pointer; other backends are not visible
 * here. The test passes a null stream.
 */
__C __export infiniStatus_t infiniopCast(infiniopCastDescriptor_t desc,
16+
void *workspace,
17+
size_t workspace_size,
18+
void *output,
19+
const void *input,
20+
void *stream);
21+
22+
/* Presumably releases a descriptor obtained from infiniopCreateCastDescriptor -- confirm against the implementation. */
__C __export infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc);
23+
24+
#endif

src/infiniop-test/src/ops/cast.cpp

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
#include "ops.hpp"
2+
#include "utils.hpp"
3+
#include <infinirt.h>
4+
#include <iomanip>
5+
#include <iostream>
6+
7+
namespace infiniop_test::cast {
8+
// Tensors that make up one cast test case; filled in by Test::build.
struct Test::Attributes {
9+
// Source tensor handed to infiniopCast as the input.
std::shared_ptr<Tensor> input;
10+
// Destination tensor that infiniopCast writes into.
std::shared_ptr<Tensor> output;
11+
// Expected result; the operator output is compared against it with allClose.
std::shared_ptr<Tensor> ans;
12+
};
13+
14+
std::shared_ptr<Test> Test::build(
15+
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
16+
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
17+
double rtol, double atol) {
18+
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
19+
test->_attributes = new Attributes();
20+
if (tensors.find("input") == tensors.end()
21+
|| tensors.find("output") == tensors.end()
22+
|| tensors.find("ans") == tensors.end()) {
23+
throw std::runtime_error("Invalid Test");
24+
}
25+
26+
test->_attributes->input = tensors["input"];
27+
test->_attributes->output = tensors["output"];
28+
test->_attributes->ans = tensors["ans"];
29+
30+
auto elemType = test->_attributes->input->ggml_type();
31+
if (elemType == GGML_TYPE_I32) {
32+
test->_rtol = 1e-5;
33+
test->_atol = 1e-5;
34+
}
35+
if (elemType == GGML_TYPE_I64) {
36+
test->_rtol = 1e-5;
37+
test->_atol = 1e-5;
38+
}
39+
if (elemType == GGML_TYPE_F16) {
40+
test->_rtol = 1e-3;
41+
test->_atol = 1e-3;
42+
}
43+
if (elemType == GGML_TYPE_F32) {
44+
test->_rtol = 1e-7;
45+
test->_atol = 1e-7;
46+
}
47+
if (elemType == GGML_TYPE_F64) {
48+
test->_rtol = 1e-7;
49+
test->_atol = 1e-7;
50+
}
51+
return test;
52+
}
53+
54+
/// Runs the cast operator once on the target device, checks the result against
/// the expected tensor, and then benchmarks the operator.
///
/// @param handle     infiniop handle used to create the operator descriptor.
/// @param device     device the input/output tensors are moved to.
/// @param device_id  index of that device.
/// @param warm_ups   warm-up count forwarded to benchmark().
/// @param iterations iteration count forwarded to benchmark().
/// @return TEST_PASSED with the value returned by benchmark(), or TEST_FAILED
///         describing which stage (creation, execution, comparison) failed.
///
/// The operator descriptor and the workspace are released on every exit path.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopCastDescriptor_t op_desc = nullptr;
    void *workspace = nullptr;
    size_t workspace_size = 0;

    auto input = _attributes->input->to(device, device_id);
    auto output = _attributes->output->to(device, device_id);

    // Scope guard: declared after the tensors so that the descriptor and the
    // workspace are released before the device tensors go away. It refers to
    // the locals by reference, so it sees whatever they hold at scope exit and
    // is not captured by the benchmark lambda below.
    struct Cleanup {
        infiniopCastDescriptor_t &desc;
        void *&buffer;
        ~Cleanup() {
            if (desc != nullptr) {
                infiniopDestroyCastDescriptor(desc);
            }
            if (buffer != nullptr) {
                infinirtFree(buffer);
            }
        }
    } cleanup{op_desc, workspace};

    CHECK_OR(infiniopCreateCastDescriptor(handle, &op_desc,
                                          output->desc(),
                                          input->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    CHECK_OR(infiniopGetCastWorkspaceSize(op_desc, &workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
    CHECK_OR(infiniopCast(op_desc, workspace, workspace_size,
                          output->data(),
                          input->data(),
                          nullptr),
             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));

    try {
        allClose(output, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }

    const double elapsed_time = benchmark(
        [=]() {
            infiniopCast(
                op_desc, workspace, workspace_size,
                output->data(),
                input->data(),
                nullptr);
        },
        warm_ups, iterations);

    return TEST_PASSED(elapsed_time);
}
96+
97+
/// Names of the attributes a cast test case expects: none.
std::vector<std::string> Test::attribute_names() {
    return std::vector<std::string>{};
}
100+
101+
/// Names of the tensors a cast test case expects: the source, the destination
/// and the expected answer (the same keys Test::build looks up).
std::vector<std::string> Test::tensor_names() {
    std::vector<std::string> names{"input", "output", "ans"};
    return names;
}
104+
105+
/// Names of the tensors the operator writes to: only "output".
std::vector<std::string> Test::output_names() {
    std::vector<std::string> names{"output"};
    return names;
}
108+
109+
/// Renders a human-readable summary of the test case: the operator name, the
/// info() of the input and output tensors, and the tolerances in scientific
/// notation with two digits of precision. Every line ends with a newline.
std::string Test::toString() const {
    std::ostringstream text;
    text << op_name() << '\n'
         << "- input: " << _attributes->input->info() << '\n'
         << "- output: " << _attributes->output->info() << '\n'
         << std::scientific << std::setprecision(2)
         << "- rtol=" << _rtol << ", atol=" << _atol << '\n';
    return text.str();
}
118+
119+
// Frees the Attributes object that Test::build allocated with `new`.
Test::~Test() {
120+
delete _attributes;
121+
}
122+
} // namespace infiniop_test::cast

src/infiniop/ops/cast/cast.h

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#ifndef __CAST_H__
2+
#define __CAST_H__
3+
4+
#include "../../operator.h"
5+
#include "info.h"
6+
7+
/*
 * DESCRIPTOR(NAMESPACE) declares the class op::cast::NAMESPACE::Descriptor,
 * a final subclass of InfiniopDescriptor for one backend of the Cast operator.
 *
 * The class stores:
 *   - _opaque:             pointer to a backend-defined `Opaque` struct
 *                          (only forward-declared here),
 *   - _info:               the CastInfo describing the cast,
 *   - _min_workspace_size: the workspace size given at construction.
 *
 * The constructor is private; the public interface is the static factory
 * create(), workspaceSize(), calculate() and the destructor. The macro only
 * declares these members -- each backend provides the definitions.
 *
 * Comments are kept outside the macro body on purpose: a `//` comment on a
 * line ending in a backslash would swallow the following line.
 */
#define DESCRIPTOR(NAMESPACE) \
8+
\
9+
namespace op::cast::NAMESPACE { \
10+
class Descriptor final : public InfiniopDescriptor { \
11+
struct Opaque; \
12+
Opaque *_opaque; \
13+
\
14+
CastInfo _info; \
15+
size_t _min_workspace_size; \
16+
\
17+
Descriptor( \
18+
CastInfo info, \
19+
size_t min_workspace_size, \
20+
Opaque *opaque, \
21+
infiniDevice_t device_type, \
22+
int device_id) \
23+
: InfiniopDescriptor{device_type, device_id}, \
24+
_opaque(opaque), \
25+
_info(info), \
26+
_min_workspace_size(min_workspace_size) {} \
27+
\
28+
public: \
29+
~Descriptor(); \
30+
\
31+
static infiniStatus_t create( \
32+
infiniopHandle_t handle, \
33+
Descriptor **desc_ptr, \
34+
infiniopTensorDescriptor_t out_desc, \
35+
infiniopTensorDescriptor_t in_desc); \
36+
\
37+
size_t workspaceSize() const; \
38+
\
39+
infiniStatus_t calculate( \
40+
void *workspace, \
41+
size_t workspace_size, \
42+
void *output, \
43+
const void *input, \
44+
void *stream) const; \
45+
}; \
46+
}
47+
48+
#endif // __CAST_H__
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#include "cast_cpu.h"
2+
#include "../../../devices/cpu/common_cpu.h"
3+
#include "../info.h"
4+
#include "infinicore.h"
5+
#include <algorithm>
6+
7+
namespace op::cast::cpu {
8+
9+
// Defaulted destructor: the CPU create() passes nullptr as the opaque pointer,
// so there is no backend state to release here.
Descriptor::~Descriptor() = default;
10+
11+
/// Creates the CPU cast descriptor.
///
/// Builds a CastInfo from the output/input tensor descriptors; if that fails,
/// CHECK_RESULT handles the failure before anything is written to `*desc_ptr`
/// (presumably by returning the error status -- the macro is defined elsewhere).
/// On success `*desc_ptr` receives a newly allocated Descriptor that needs no
/// workspace and has no opaque backend state, and INFINI_STATUS_SUCCESS is
/// returned.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t in_desc) {

    auto cast_info = CastInfo::create(out_desc, in_desc);
    CHECK_RESULT(cast_info);

    // The CPU implementation works directly on the tensors: no scratch memory
    // and no backend-specific state.
    constexpr size_t kWorkspaceBytes = 0;
    Opaque *const no_opaque = nullptr;

    auto *cpu_handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    *desc_ptr = new Descriptor(
        cast_info.take(),
        kWorkspaceBytes,
        no_opaque,
        cpu_handle->device, cpu_handle->device_id);

    return INFINI_STATUS_SUCCESS;
}
30+
31+
// Returns the workspace size stored at construction; the CPU create() passes 0.
size_t Descriptor::workspaceSize() const {
32+
return _min_workspace_size;
33+
}
34+
35+
template <typename Tout, typename Tin>
36+
static inline void cpu_cast_impl_incremental(
37+
void *output, const void *input, const op::cast::CastInfo &info) {
38+
39+
const size_t ndim = info.shape.size();
40+
const size_t n = info.n;
41+
42+
auto out_base = reinterpret_cast<Tout *>(output);
43+
auto in_base = reinterpret_cast<const Tin *>(input);
44+
45+
const std::vector<size_t> &shape = info.shape;
46+
const std::vector<ptrdiff_t> &in_stride = info.in_stride;
47+
const std::vector<ptrdiff_t> &out_stride = info.out_stride;
48+
49+
if (n == 0) return;
50+
51+
std::vector<size_t> idx(ndim, 0);
52+
ptrdiff_t in_off = 0;
53+
ptrdiff_t out_off = 0;
54+
55+
for (size_t it = 0; it < n; ++it) {
56+
const Tin *in_elem = in_base + in_off;
57+
Tout *out_elem = out_base + out_off;
58+
*out_elem = utils::cast<Tout, Tin>(*in_elem);
59+
60+
for (int d = static_cast<int>(ndim) - 1; d >= 0; --d) {
61+
idx[d] += 1;
62+
if (in_stride[d] != 0) in_off += in_stride[d];
63+
if (out_stride[d] != 0) out_off += out_stride[d];
64+
65+
if (idx[d] < shape[d]) {
66+
break;
67+
} else {
68+
idx[d] = 0;
69+
if (in_stride[d] != 0) in_off -= static_cast<ptrdiff_t>(shape[d]) * in_stride[d];
70+
if (out_stride[d] != 0) out_off -= static_cast<ptrdiff_t>(shape[d]) * out_stride[d];
71+
}
72+
}
73+
}
74+
}
75+
76+
/// Selects the input element type for a fixed output element type `Tout` and
/// runs the element-wise cast.
///
/// @return INFINI_STATUS_SUCCESS, or INFINI_STATUS_BAD_TENSOR_DTYPE when
///         `info.dt_in` is not one of the supported input types.
template <typename Tout>
static infiniStatus_t cpu_cast_dispatch_input(
    void *output, const void *input, const op::cast::CastInfo &info) {
    switch (info.dt_in) {
    case INFINI_DTYPE_I32:
        cpu_cast_impl_incremental<Tout, int32_t>(output, input, info);
        return INFINI_STATUS_SUCCESS;
    case INFINI_DTYPE_I64:
        cpu_cast_impl_incremental<Tout, int64_t>(output, input, info);
        return INFINI_STATUS_SUCCESS;
    case INFINI_DTYPE_U32:
        cpu_cast_impl_incremental<Tout, uint32_t>(output, input, info);
        return INFINI_STATUS_SUCCESS;
    case INFINI_DTYPE_U64:
        cpu_cast_impl_incremental<Tout, uint64_t>(output, input, info);
        return INFINI_STATUS_SUCCESS;
    case INFINI_DTYPE_F16:
        cpu_cast_impl_incremental<Tout, fp16_t>(output, input, info);
        return INFINI_STATUS_SUCCESS;
    case INFINI_DTYPE_F32:
        cpu_cast_impl_incremental<Tout, float>(output, input, info);
        return INFINI_STATUS_SUCCESS;
    case INFINI_DTYPE_F64:
        cpu_cast_impl_incremental<Tout, double>(output, input, info);
        return INFINI_STATUS_SUCCESS;
    default:
        // The element type is unsupported, not the device.
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

/// Executes the cast on the CPU.
///
/// Supported element types, for both input and output: I32, I64, U32, U64,
/// F16, F32 and F64.
///
/// @param workspace      unused by the CPU backend.
/// @param workspace_size unused by the CPU backend.
/// @param output         destination buffer; must not be the same pointer as `input`.
/// @param input          source buffer.
/// @param stream         unused by the CPU backend.
/// @return INFINI_STATUS_SUCCESS on success;
///         INFINI_STATUS_BAD_PARAM when `output` and `input` are the same
///         pointer (in-place casting is not supported);
///         INFINI_STATUS_BAD_TENSOR_DTYPE when the input or output element
///         type is not supported.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    void *stream) const {

    (void)workspace;
    (void)workspace_size;
    (void)stream;

    // Only exact aliasing is rejected; partially overlapping buffers are not detected.
    if (output == input) {
        return INFINI_STATUS_BAD_PARAM;
    }

    switch (_info.dt_out) {
    case INFINI_DTYPE_I32:
        return cpu_cast_dispatch_input<int32_t>(output, input, _info);
    case INFINI_DTYPE_I64:
        return cpu_cast_dispatch_input<int64_t>(output, input, _info);
    case INFINI_DTYPE_U32:
        return cpu_cast_dispatch_input<uint32_t>(output, input, _info);
    case INFINI_DTYPE_U64:
        return cpu_cast_dispatch_input<uint64_t>(output, input, _info);
    case INFINI_DTYPE_F16:
        return cpu_cast_dispatch_input<fp16_t>(output, input, _info);
    case INFINI_DTYPE_F32:
        return cpu_cast_dispatch_input<float>(output, input, _info);
    case INFINI_DTYPE_F64:
        return cpu_cast_dispatch_input<double>(output, input, _info);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
133+
134+
135+
} // namespace op::cast::cpu
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
// Declares the CPU backend of the Cast operator.
#ifndef __CAST_CPU_H__
2+
#define __CAST_CPU_H__
3+
4+
#include "../cast.h"
5+
6+
// Expands to the declaration of op::cast::cpu::Descriptor (see DESCRIPTOR in ../cast.h).
DESCRIPTOR(cpu)
7+
8+
#endif // __CAST_CPU_H__

0 commit comments

Comments
 (0)