Skip to content

Commit 1fba424

Browse files
[INTEL_HPU] enable set_value_with_tensor kernel (#1492)
1 parent 17304d4 commit 1fba424

File tree

3 files changed

+387
-12
lines changed

3 files changed

+387
-12
lines changed

backends/intel_hpu/kernels/hpu_operator.h

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,22 @@ class HpuOperator {
9393

9494
virtual ~HpuOperator() {}
9595

96+
// Creates a new Synapse memory section attached to graphHandle_ and records
// it in sectons_ (so the operator owns/tracks every section it makes).
// Sections allow several tensors to be placed in the same device memory
// region, e.g. an in-place input/output pair.
// Returns the new section handle; aborts via PD_CHECK if creation fails.
synSectionHandle createSection() {
  synStatus status;
  synSectionHandle sectionHandle = nullptr;
  status = synSectionCreate(&sectionHandle, 0, graphHandle_);

  PD_CHECK(status == synSuccess, "synSectionCreate() failed = ", status);
  sectons_.push_back(sectionHandle);
  return sectionHandle;
}
105+
96106
synTensor createTensor(unsigned dims,
97107
synDataType data_type,
98108
DIMS tensor_size,
99109
bool is_presist,
100-
std::string name) {
110+
std::string name,
111+
synSectionHandle section = nullptr) {
101112
synStatus status;
102113
synTensorDescriptor desc{};
103114
// input
@@ -111,17 +122,17 @@ class HpuOperator {
111122
VLOG(6) << "name = " << name << ", " << tensor_size[dims - 1 - i];
112123
}
113124

114-
synSectionHandle sectionHandle = nullptr;
115-
if (is_presist) {
125+
synSectionHandle sectionHandle = section;
126+
if (is_presist && sectionHandle == nullptr) {
116127
status = synSectionCreate(&sectionHandle, 0, graphHandle_);
117128

118-
PD_CHECK(status == synSuccess, "synSectionCreate() failed = %d", status);
129+
PD_CHECK(status == synSuccess, "synSectionCreate() failed = ", status);
119130
sectons_.push_back(sectionHandle);
120131
}
121132

122133
synTensor tensor = nullptr;
123134
status = synTensorCreate(&tensor, &desc, sectionHandle, 0);
124-
PD_CHECK(status == synSuccess, "synTensorCreate() failed = %d", status);
135+
PD_CHECK(status == synSuccess, "synTensorCreate() failed = ", status);
125136
tensors_.insert({name, tensor});
126137
return tensor;
127138
}
@@ -156,8 +167,7 @@ class RecipeRunner {
156167
}
157168
synStatus status =
158169
synTensorRetrieveIds(recipe, tensorNames, tensorIds, totalNumOfTensors);
159-
PD_CHECK(
160-
status == synSuccess, "synTensorRetrieveIds() failed = %d", status);
170+
PD_CHECK(status == synSuccess, "synTensorRetrieveIds() failed = ", status);
161171
for (i = 0; i < totalNumOfTensors; i++) {
162172
tensorInfo[i].tensorId = tensorIds[i];
163173
}
@@ -167,7 +177,7 @@ class RecipeRunner {
167177
uint64_t request_workspace_size = 0;
168178
synStatus status =
169179
synWorkspaceGetSize(&request_workspace_size, recipeHandle_);
170-
PD_CHECK(status == synSuccess, "synWorkspaceGetSize() failed = %d", status);
180+
PD_CHECK(status == synSuccess, "synWorkspaceGetSize() failed = ", status);
171181

172182
if (request_workspace_size > cached_workspaceSize) {
173183
if (cached_workspaceSize != 0) {
@@ -176,17 +186,17 @@ class RecipeRunner {
176186
status =
177187
synStreamSynchronize(reinterpret_cast<synStreamHandle>(stream));
178188
PD_CHECK(
179-
status == synSuccess, "synStreamSynchronize() failed = %d", status);
189+
status == synSuccess, "synStreamSynchronize() failed = ", status);
180190

181191
status = synDeviceFree(0, cached_workspaceAddress, 0);
182-
PD_CHECK(status == synSuccess, "synDeviceFree() failed = %d", status);
192+
PD_CHECK(status == synSuccess, "synDeviceFree() failed = ", status);
183193
}
184194

185195
cached_workspaceSize = request_workspace_size;
186196
VLOG(6) << "malloc device workspace " << cached_workspaceSize;
187197
status = synDeviceMalloc(
188198
0, cached_workspaceSize, 0, 0, &cached_workspaceAddress);
189-
PD_CHECK(status == synSuccess, "synDeviceMalloc() failed = %d", status);
199+
PD_CHECK(status == synSuccess, "synDeviceMalloc() failed = ", status);
190200
}
191201

192202
VLOG(6) << "workspace size = " << cached_workspaceSize
@@ -204,7 +214,8 @@ class RecipeRunner {
204214
recipeHandle_,
205215
0);
206216

207-
PD_CHECK(status == synSuccess, "synLaunch() failed = %d", status);
217+
PD_CHECK(status == synSuccess, "synLaunch() failed = ", status);
218+
VLOG(6) << "synLaunch called ";
208219
}
209220

210221
protected:
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "habanalabs/perf_lib_layer_params.h"
16+
#include "habanalabs/synapse_api.h"
17+
#include "habanalabs/synapse_common_types.h"
18+
#include "kernels/funcs.h"
19+
#include "kernels/hpu_operator.h"
20+
#include "utils/utils.h"
21+
22+
namespace custom_kernel {
23+
class SetTensorValue : public HpuOperator {
24+
public:
25+
SetTensorValue(std::string guid_prefix, std::string node_name)
26+
: HpuOperator(guid_prefix), pName_(node_name) {}
27+
void AddNode(const std::vector<DIMS>& ins,
28+
const std::vector<DIMS>& outs,
29+
synDataType datatype,
30+
synSliceParams params) {
31+
assert(ins.size() == 2 && "input size should be 2");
32+
assert(outs.size() == 1 && "output size should be 1");
33+
34+
synSectionHandle section = createSection();
35+
synTensor inputs[ins.size()] = {
36+
createTensor(ins[0].size(), datatype, ins[0], true, "input", section),
37+
createTensor(ins[1].size(), datatype, ins[1], true, "value")};
38+
synTensor outputs[outs.size()] = {createTensor(
39+
outs[0].size(), datatype, outs[0], true, "output", section)};
40+
41+
synStatus status = synNodeCreate(graphHandle_,
42+
inputs,
43+
outputs,
44+
ins.size(),
45+
outs.size(),
46+
&params,
47+
sizeof(params),
48+
guid_.c_str(),
49+
pName_.c_str(),
50+
nullptr,
51+
nullptr);
52+
PD_CHECK(
53+
status == synSuccess, "[RUNTIME] synNodeCreate () failed = ", status);
54+
}
55+
std::string pName_;
56+
};
57+
58+
template <typename T, typename Context>
59+
void SetTensorValueKernel(const Context& dev_ctx,
60+
const phi::DenseTensor& x,
61+
const phi::DenseTensor& value,
62+
const phi::IntArray& starts,
63+
const phi::IntArray& ends,
64+
const phi::IntArray& steps,
65+
const std::vector<int64_t>& axes,
66+
const std::vector<int64_t>& decrease_axes,
67+
const std::vector<int64_t>& none_axes,
68+
phi::DenseTensor* out) {
69+
auto starts_v = starts.GetData();
70+
auto ends_v = ends.GetData();
71+
72+
PADDLE_ENFORCE_EQ(
73+
starts_v.size(),
74+
axes.size(),
75+
phi::errors::InvalidArgument(
76+
"The size of starts must be equal to the size of axes."));
77+
PADDLE_ENFORCE_EQ(ends_v.size(),
78+
axes.size(),
79+
phi::errors::InvalidArgument(
80+
"The size of ends must be equal to the size of axes."));
81+
82+
// allocate memory on device.
83+
dev_ctx.template Alloc<T>(out);
84+
const auto& in_dims = x.dims();
85+
86+
PADDLE_ENFORCE_EQ(x.data<T>(),
87+
out->data<T>(),
88+
phi::errors::InvalidArgument(
89+
"The input ptr must be equal to output ptr."));
90+
// ToDo: handle decrease_axes and none_axes in future
91+
92+
synSliceParams params = {{0}};
93+
for (int i = 0; i < in_dims.size(); i++) {
94+
params.axes[i] = i;
95+
params.steps[i] = 1;
96+
params.starts[i] = 0;
97+
params.ends[i] = in_dims[in_dims.size() - 1 - i];
98+
}
99+
for (int i = 0; i < static_cast<int>(axes.size()); i++) {
100+
params.starts[in_dims.size() - 1 - axes[i]] = starts[i];
101+
params.ends[in_dims.size() - 1 - axes[i]] = ends[i];
102+
}
103+
104+
std::vector<int64_t> input_dim = phi::vectorize<int64_t>(x.dims());
105+
std::vector<int64_t> value_dim = phi::vectorize<int64_t>(value.dims());
106+
std::vector<int64_t> outputs_dim = phi::vectorize<int64_t>(out->dims());
107+
108+
OpCacheOperator op_info;
109+
op_info.prepareOpInfo<T, synSliceParams>(
110+
"slice_insert", {input_dim, value_dim}, &params);
111+
112+
auto recipe = op_info.GetRecipe();
113+
if (recipe == nullptr) {
114+
// compile
115+
SetTensorValue op("slice_insert", "SliceInsert");
116+
op.AddNode(
117+
{input_dim, value_dim}, {outputs_dim}, op_info.datatype_, params);
118+
op.Compile();
119+
op_info.setOp(op);
120+
recipe = op_info.GetRecipe();
121+
}
122+
123+
// runtime
124+
std::map<std::string, uint64_t> tensors;
125+
tensors["input"] = reinterpret_cast<uint64_t>(x.data<T>());
126+
tensors["value"] = reinterpret_cast<uint64_t>(value.data<T>());
127+
tensors["output"] = reinterpret_cast<uint64_t>(out->data<T>());
128+
129+
RecipeRunner runner(recipe);
130+
runner.Run(reinterpret_cast<C_Stream>(dev_ctx.stream()), tensors);
131+
}
132+
133+
// template <typename T, typename Context>
134+
// void SetValueKernel(const Context& dev_ctx,
135+
// const phi::DenseTensor& x,
136+
// const phi::IntArray& starts,
137+
// const phi::IntArray& ends,
138+
// const phi::IntArray& steps,
139+
// const std::vector<int64_t>& axes,
140+
// const std::vector<int64_t>& decrease_axes,
141+
// const std::vector<int64_t>& none_axes,
142+
// const std::vector<int64_t>& shape,
143+
// const std::vector<phi::Scalar>& values,
144+
// phi::DenseTensor* out) {
145+
// std::vector<T> assign_values;
146+
// assign_values.reserve(values.size());
147+
// for (const auto& val : values) {
148+
// assign_values.push_back(val.to<T>());
149+
// }
150+
// phi::DenseTensor value_tensor;
151+
// value_tensor.Resize(phi::make_ddim(shape));
152+
// custom_kernel::TensorFromVector(
153+
// dev_ctx, assgin_values, dev_ctx, &value_tensor);
154+
// value_tensor.Resize(phi::make_ddim(shape));
155+
156+
// custom_kernel::SetTensorValueKernel<T, Context>(dev_ctx,
157+
// x,
158+
// value_tensor,
159+
// starts,
160+
// ends,
161+
// steps,
162+
// axes,
163+
// decrease_axes,
164+
// none_axes,
165+
// out);
166+
// }
167+
168+
//
169+
170+
} // namespace custom_kernel
171+
172+
// PD_REGISTER_PLUGIN_KERNEL(set_value,
173+
// intel_hpu,
174+
// ALL_LAYOUT,
175+
// custom_kernel::SetValueKernel,
176+
// float,
177+
// phi::dtype::float16,
178+
// phi::dtype::bfloat16) {
179+
// }
180+
181+
// Register set_value_with_tensor for the intel_hpu backend; the supported
// dtypes match what SetTensorValueKernel is instantiated for
// (fp32 / fp16 / bf16).
PD_REGISTER_PLUGIN_KERNEL(set_value_with_tensor,
                          intel_hpu,
                          ALL_LAYOUT,
                          custom_kernel::SetTensorValueKernel,
                          float,
                          phi::dtype::float16,
                          phi::dtype::bfloat16) {}

0 commit comments

Comments
 (0)