
Commit 83578cf

[npu] add box coder (#36171)
* [npu] add box coder
* [npu] add box coder
1 parent 2b8fd70 commit 83578cf

File tree

3 files changed: +631, -1 lines changed

paddle/fluid/operators/detection/CMakeLists.txt

Lines changed: 6 additions & 1 deletion
@@ -15,8 +15,13 @@ function(detection_library TARGET_NAME)
       PARENT_SCOPE)
 endfunction()
 
+if (WITH_ASCEND_CL)
+  detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu box_coder_op_npu.cc)
+else()
+  detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
+endif()
+
 detection_library(bipartite_match_op SRCS bipartite_match_op.cc)
-detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
 detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
 detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
 detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu)
paddle/fluid/operators/detection/box_coder_op_npu.cc

Lines changed: 373 additions & 0 deletions
@@ -0,0 +1,373 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/detection/box_coder_op.h"
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

template <typename T>
struct BoxCoderFunction {
 public:
  explicit BoxCoderFunction(const framework::ExecutionContext& ctx) : ctx(ctx) {
    place = ctx.GetPlace();
    stream = ctx.template device_context<paddle::platform::NPUDeviceContext>()
                 .stream();
  }
  // Each helper wraps a single Ascend op: it allocates its result on the NPU
  // place and launches the op on the context's stream.
  Tensor Adds(const Tensor& x, float scalar) {
    Tensor y;
    y.mutable_data<T>(x.dims(), place);
    const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}});
    runner.Run(stream);
    return y;
  }
  Tensor Muls(const Tensor& x, float scalar) {
    Tensor y;
    y.mutable_data<T>(x.dims(), place);
    const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}});
    runner.Run(stream);
    return y;
  }
  Tensor Mul(const Tensor& x, const Tensor& y) {
    Tensor z;
    z.mutable_data<T>(x.dims(), place);
    const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {});
    runner.Run(stream);
    return z;
  }
  Tensor SubWithBroadCast(const Tensor& x, const Tensor& y,
                          const framework::DDim& shape) {
    Tensor z;
    z.mutable_data<T>(shape, place);
    const auto& runner = NpuOpRunner("Sub", {x, y}, {z}, {});
    runner.Run(stream);
    return z;
  }
  void DivWithBroadCastVoid(const Tensor& x, const Tensor& y,
                            const framework::DDim& shape, Tensor* z) {
    z->mutable_data<T>(shape, place);
    const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {});
    runner.Run(stream);
  }
  Tensor DivWithBroadCast(const Tensor& x, const Tensor& y,
                          const framework::DDim& shape) {
    Tensor z;
    DivWithBroadCastVoid(x, y, shape, &z);
    return z;
  }
  void MulWithBroadCastVoid(const Tensor& x, const Tensor& y,
                            const framework::DDim& shape, Tensor* z) {
    z->mutable_data<T>(shape, place);
    const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {});
    runner.Run(stream);
  }
  Tensor MulWithBroadCast(const Tensor& x, const Tensor& y,
                          const framework::DDim& shape) {
    Tensor z;
    MulWithBroadCastVoid(x, y, shape, &z);
    return z;
  }
  void AddWithBroadCastVoid(const Tensor& x, const Tensor& y,
                            const framework::DDim& shape, Tensor* z) {
    z->mutable_data<T>(shape, place);
    const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {});
    runner.Run(stream);
  }
  Tensor AddWithBroadCast(const Tensor& x, const Tensor& y,
                          const framework::DDim& shape) {
    Tensor z;
    AddWithBroadCastVoid(x, y, shape, &z);
    return z;
  }
  Tensor Abs(const Tensor& x) {
    Tensor y;
    y.mutable_data<T>(x.dims(), place);
    const auto& runner = NpuOpRunner("Abs", {x}, {y}, {});
    runner.Run(stream);
    return y;
  }
  Tensor Log(const Tensor& x) {
    // log(x) computed as log1p(x - 1) through the Log1p op.
    Tensor t_x_m1 = Adds(x, -1);
    Tensor y;
    y.mutable_data<T>(x.dims(), place);
    const auto& runner = NpuOpRunner("Log1p", {t_x_m1}, {y}, {});
    runner.Run(stream);
    return y;
  }
  Tensor Exp(const Tensor& x) {
    Tensor y;
    y.mutable_data<T>(x.dims(), place);
    const auto& runner = NpuOpRunner("Exp", {x}, {y}, {});
    runner.Run(stream);
    return y;
  }
  // 2-D matrix multiply with shape checks.
  Tensor Dot(const Tensor& x, const Tensor& y) {
    auto dim_x = x.dims();
    auto dim_y = y.dims();
    PADDLE_ENFORCE_EQ(
        dim_x.size(), 2,
        platform::errors::InvalidArgument(
            "x should be a 2-dim tensor, but got %d-dim.", dim_x.size()));
    PADDLE_ENFORCE_EQ(
        dim_y.size(), 2,
        platform::errors::InvalidArgument(
            "y should be a 2-dim tensor, but got %d-dim.", dim_y.size()));
    PADDLE_ENFORCE_EQ(
        dim_x[1], dim_y[0],
        platform::errors::InvalidArgument("Expect dim_x[1] == dim_y[0], but "
                                          "got dim_x[1] = %d, dim_y[0] = %d.",
                                          dim_x[1], dim_y[0]));
    Tensor z;
    z.mutable_data<T>({dim_x[0], dim_y[1]}, place);
    const auto& runner =
        NpuOpRunner("MatMul", {x, y}, {z},
                    {{"transpose_x1", false}, {"transpose_x2", false}});
    runner.Run(stream);
    return z;
  }
  void ConcatVoid(const std::vector<Tensor>& inputs,
                  const framework::DDim& shape_out, int axis, Tensor* output) {
    output->mutable_data<T>(shape_out, place);
    std::vector<std::string> names;
    for (size_t i = 0; i < inputs.size(); i++) {
      names.push_back("x" + std::to_string(i));
    }
    NpuOpRunner runner{
        "ConcatD",
        {inputs},
        {*output},
        {{"concat_dim", axis}, {"N", static_cast<int>(inputs.size())}}};
    runner.AddInputNames(names);
    runner.Run(stream);
  }
  Tensor Concat(const std::vector<Tensor>& inputs,
                const framework::DDim& shape_out, int axis) {
    Tensor output;
    ConcatVoid(inputs, shape_out, axis, &output);
    return output;
  }
  Tensor Slice(const Tensor& x, const std::vector<int>& offsets,
               const std::vector<int>& size, const framework::DDim& shape) {
    Tensor y;
    y.mutable_data<T>(shape, place);
    const auto& runner =
        NpuOpRunner("SliceD", {x}, {y}, {{"offsets", offsets}, {"size", size}});
    runner.Run(stream);
    return y;
  }

 private:
  platform::Place place;
  aclrtStream stream;
  const framework::ExecutionContext& ctx;
};
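(Aside, not part of the diff.) The Log helper above evaluates log(x) by shifting the input with Adds and calling the Log1p op, using the identity

\[ \log x = \operatorname{log1p}(x - 1). \]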
template <typename T>
void Vector2Tensor(const framework::ExecutionContext& ctx,
                   const std::vector<T>& vec, const framework::DDim& ddim,
                   Tensor* tsr) {
  // Copy a host vector into a device tensor and wait so the data is ready
  // before the next op consumes it.
  framework::TensorFromVector<T>(vec, ctx.device_context(), tsr);
  ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
  tsr->Resize(ddim);
}

template <typename T>
void BoxCoderEnc(const framework::ExecutionContext& ctx, const Tensor* tb,
                 const Tensor* pb, const Tensor* pbv, const bool norm,
                 const std::vector<float>& variance, Tensor* out) {
  auto M = pb->dims()[0];
  auto N = tb->dims()[0];
  auto shape_0 = framework::make_ddim({4, 2});
  Tensor m_diff;
  Tensor m_aver;
  // Matmul with m_diff maps corner boxes [x1, y1, x2, y2] to sizes [w, h];
  // m_aver maps them to centers [cx, cy].
  std::vector<T> vec_diff = {static_cast<T>(-1), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(-1),
                             static_cast<T>(1), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(1)};
  std::vector<T> vec_aver = {static_cast<T>(0.5), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(0.5),
                             static_cast<T>(0.5), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(0.5)};
  Vector2Tensor<T>(ctx, vec_diff, shape_0, &m_diff);
  Vector2Tensor<T>(ctx, vec_aver, shape_0, &m_aver);

  BoxCoderFunction<T> F(ctx);
  Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5));
  Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1));
  Tensor tb_xy = F.Dot(*tb, m_aver);
  Tensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 0 : 1));

  pb_xy.Resize({1, M, 2});
  pb_wh.Resize({1, M, 2});
  tb_xy.Resize({N, 1, 2});
  tb_wh.Resize({N, 1, 2});

  auto shape_half = framework::make_ddim({N, M, 2});
  auto shape_full = framework::make_ddim({N, M, 4});

  Tensor out_xy_0 = F.DivWithBroadCast(
      F.SubWithBroadCast(tb_xy, pb_xy, shape_half), pb_wh, shape_half);
  Tensor out_wh_0 = F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half)));
  Tensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2);

  // Normalize by the per-prior variance tensor when given, otherwise by the
  // four-element 'variance' attribute broadcast over all boxes.
  if (pbv) {
    F.DivWithBroadCastVoid(out_0, *pbv, shape_full, out);
  } else {
    Tensor t_var;
    std::vector<T> vec_var(4);
    for (auto i = 0; i < 4; i++) {
      vec_var[i] = static_cast<T>(variance[i]);
    }
    Vector2Tensor(ctx, vec_var, framework::make_ddim({1, 1, 4}), &t_var);
    F.DivWithBroadCastVoid(out_0, t_var, shape_full, out);
  }
}

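(Aside, not part of the diff.) In center-size form, BoxCoderEnc computes, for a target box $(t_x, t_y, t_w, t_h)$, prior box $(p_x, p_y, p_w, p_h)$, and variances $v$:

\[
o_x = \frac{t_x - p_x}{p_w\, v_x}, \qquad
o_y = \frac{t_y - p_y}{p_h\, v_y}, \qquad
o_w = \frac{\log\lvert t_w / p_w \rvert}{v_w}, \qquad
o_h = \frac{\log\lvert t_h / p_h \rvert}{v_h},
\]

where the $+0.5$ center and $+1$ size offsets are applied first when box_normalized is false, and $v$ comes elementwise from PriorBoxVar when that input is present.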
template <typename T>
void BoxCoderDec(const framework::ExecutionContext& ctx, const Tensor* tb,
                 const Tensor* pb, const Tensor* pbv, const bool norm,
                 const std::vector<float>& variance, int axis, Tensor* out) {
  auto shape_0 = framework::make_ddim({4, 2});
  Tensor m_diff;
  Tensor m_aver;
  std::vector<T> vec_diff = {static_cast<T>(-1), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(-1),
                             static_cast<T>(1), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(1)};
  std::vector<T> vec_aver = {static_cast<T>(0.5), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(0.5),
                             static_cast<T>(0.5), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(0.5)};
  Vector2Tensor<T>(ctx, vec_diff, shape_0, &m_diff);
  Vector2Tensor<T>(ctx, vec_aver, shape_0, &m_aver);

  BoxCoderFunction<T> F(ctx);
  Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5));
  Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1));
  // 'axis' selects whether priors broadcast across the first or the second
  // dimension of the target boxes.
  auto pb_resize_shape = axis == 0
                             ? framework::make_ddim({1, pb->dims()[0], 2})
                             : framework::make_ddim({pb->dims()[0], 1, 2});
  pb_xy.Resize(pb_resize_shape);
  pb_wh.Resize(pb_resize_shape);

  auto tbox_slice_shape =
      framework::make_ddim({tb->dims()[0], tb->dims()[1], 2});
  std::vector<int> tbox_slice_size = {static_cast<int>(tb->dims()[0]),
                                      static_cast<int>(tb->dims()[1]), 2};
  Tensor tbox01 = F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape);
  Tensor tbox23 = F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape);

  Tensor tb_xy;
  Tensor tb_wh;
  if (pbv) {
    // Per-prior variances: slice [v0, v1] and [v2, v3] and broadcast them.
    auto pbvt_slice_shape = framework::make_ddim({pbv->dims()[0], 2});
    auto pbvt_resize_shape = axis == 0
                                 ? framework::make_ddim({1, pbv->dims()[0], 2})
                                 : framework::make_ddim({pbv->dims()[0], 1, 2});
    std::vector<int> pbvt_slice_size = {static_cast<int>(pbv->dims()[0]), 2};
    Tensor pbv_t01 = F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape);
    Tensor pbv_t23 = F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape);
    pbv_t01.Resize(pbvt_resize_shape);
    pbv_t23.Resize(pbvt_resize_shape);

    F.AddWithBroadCastVoid(
        F.MulWithBroadCast(tbox01, F.Mul(pb_wh, pbv_t01), tbox_slice_shape),
        pb_xy, tbox_slice_shape, &tb_xy);
    F.MulWithBroadCastVoid(
        F.Exp(F.MulWithBroadCast(pbv_t23, tbox23, tbox_slice_shape)), pb_wh,
        tbox_slice_shape, &tb_wh);
  } else if (variance.empty()) {
    F.AddWithBroadCastVoid(F.MulWithBroadCast(tbox01, pb_wh, tbox_slice_shape),
                           pb_xy, tbox_slice_shape, &tb_xy);
    F.MulWithBroadCastVoid(F.Exp(tbox23), pb_wh, tbox_slice_shape, &tb_wh);
  } else {
    Tensor t_var01, t_var23;
    auto t_var_shape = framework::make_ddim({1, 1, 2});
    std::vector<T> vec_var01 = {static_cast<T>(variance[0]),
                                static_cast<T>(variance[1])};
    std::vector<T> vec_var23 = {static_cast<T>(variance[2]),
                                static_cast<T>(variance[3])};
    Vector2Tensor(ctx, vec_var01, t_var_shape, &t_var01);
    Vector2Tensor(ctx, vec_var23, t_var_shape, &t_var23);
    F.AddWithBroadCastVoid(
        F.MulWithBroadCast(tbox01,
                           F.MulWithBroadCast(pb_wh, t_var01, pb_resize_shape),
                           tbox_slice_shape),
        pb_xy, tbox_slice_shape, &tb_xy);
    F.MulWithBroadCastVoid(
        F.Exp(F.MulWithBroadCast(t_var23, tbox23, tbox_slice_shape)), pb_wh,
        tbox_slice_shape, &tb_wh);
  }
  // Convert decoded centers/sizes back to corner form.
  Tensor obox01 =
      F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, -0.5), tbox_slice_shape);
  Tensor obox23 =
      F.Adds(F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, 0.5), tbox_slice_shape),
             (norm ? 0 : -1));
  F.ConcatVoid({obox01, obox23}, out->dims(), 2, out);
}

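(Aside, not part of the diff.) BoxCoderDec inverts the encoding: given deltas $(d_x, d_y, d_w, d_h)$,

\[
b_x = v_x d_x\, p_w + p_x, \qquad
b_y = v_y d_y\, p_h + p_y, \qquad
b_w = p_w\, e^{v_w d_w}, \qquad
b_h = p_h\, e^{v_h d_h},
\]

and the output is the corner form $(b_x - b_w/2,\; b_y - b_h/2,\; b_x + b_w/2 - c,\; b_y + b_h/2 - c)$ with $c = 0$ when box_normalized is true and $c = 1$ otherwise. When neither PriorBoxVar nor the variance attribute is set, $v$ is effectively all ones.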
template <typename T>
class BoxCoderNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* prior_box = ctx.Input<Tensor>("PriorBox");
    auto* prior_box_var = ctx.Input<Tensor>("PriorBoxVar");
    auto* target_box = ctx.Input<framework::LoDTensor>("TargetBox");
    auto* output_box = ctx.Output<Tensor>("OutputBox");
    std::vector<float> variance = ctx.Attr<std::vector<float>>("variance");
    const int axis = ctx.Attr<int>("axis");

    if (prior_box_var) {
      PADDLE_ENFORCE_EQ(variance.empty(), true,
                        platform::errors::InvalidArgument(
                            "Input 'PriorBoxVar' and attribute 'variance'"
                            " of BoxCoder operator should not be used at the "
                            "same time."));
    }
    if (!(variance.empty())) {
      PADDLE_ENFORCE_EQ(static_cast<int>(variance.size()), 4,
                        platform::errors::InvalidArgument(
                            "Size of attribute 'variance' in BoxCoder operator"
                            " should be 4. But received size is %d",
                            variance.size()));
    }

    if (target_box->lod().size()) {
      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
                        platform::errors::InvalidArgument(
                            "Input 'TargetBox' of BoxCoder operator only"
                            " supports LoD with one level."));
    }

    auto code_type = GetBoxCodeType(ctx.Attr<std::string>("code_type"));
    bool normalized = ctx.Attr<bool>("box_normalized");

    if (code_type == BoxCodeType::kEncodeCenterSize) {
      BoxCoderEnc<T>(ctx, target_box, prior_box, prior_box_var, normalized,
                     variance, output_box);
    } else {
      BoxCoderDec<T>(ctx, target_box, prior_box, prior_box_var, normalized,
                     variance, axis, output_box);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(box_coder, ops::BoxCoderNPUKernel<float>,
                       ops::BoxCoderNPUKernel<plat::float16>);
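As a quick sanity check (not part of the commit), here is a minimal host-side C++ sketch of the same center-size arithmetic on made-up boxes, assuming box_normalized = true and the common variance {0.1, 0.1, 0.2, 0.2}; decoding the encoded deltas recovers the target box:

#include <cmath>
#include <cstdio>

int main() {
  // Corner-form boxes [x1, y1, x2, y2]; box_normalized = true, so none of
  // the +0.5 / +1 / -1 offsets apply.
  const float tb[4] = {2.f, 2.f, 6.f, 8.f};   // target box
  const float pb[4] = {1.f, 1.f, 5.f, 5.f};   // prior box
  const float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};

  // Center/size form (what the m_aver / m_diff matmuls compute).
  const float pcx = (pb[0] + pb[2]) / 2, pcy = (pb[1] + pb[3]) / 2;
  const float pw = pb[2] - pb[0], ph = pb[3] - pb[1];
  const float tcx = (tb[0] + tb[2]) / 2, tcy = (tb[1] + tb[3]) / 2;
  const float tw = tb[2] - tb[0], th = tb[3] - tb[1];

  // Encode (as in BoxCoderEnc): offsets scaled by prior size, then variance.
  const float e[4] = {(tcx - pcx) / pw / var[0], (tcy - pcy) / ph / var[1],
                      std::log(std::fabs(tw / pw)) / var[2],
                      std::log(std::fabs(th / ph)) / var[3]};

  // Decode (as in BoxCoderDec) applied to the deltas recovers the target.
  const float dcx = e[0] * var[0] * pw + pcx, dcy = e[1] * var[1] * ph + pcy;
  const float dw = std::exp(e[2] * var[2]) * pw;
  const float dh = std::exp(e[3] * var[3]) * ph;
  std::printf("decoded: [%g, %g, %g, %g]\n",  // expect [2, 2, 6, 8]
              dcx - dw / 2, dcy - dh / 2, dcx + dw / 2, dcy + dh / 2);
  return 0;
}

Compiling and running this prints decoded: [2, 2, 6, 8], the original target box.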
