
Commit 6bfc572

vslyu, tangzhiyi11, QingshuChen, taixiurong, and root
authored
[2.0 rc1/cherrypick] cherry-pick kunlun PR:29234/29229/29293/29367/29280/29448 (#29466)
* add deformable_conv op on xpu (#29234)
* rebase develop
* update deformable_conv op on xpu
* update deformable_conv op on xpu
* update kunlun conv2d/softmax/elementwise implementation (#29229)
* update conv2d & softmax to new xpu api
* test=kunlun
* remove useless comments
* test=kunlun
* remove softmax xpu op
* test=kunlun
* update kunlun softmax
* test=kunlun
* update xpu unittest
* test=kunlun
* fix elementwise_grad bug for kunlun
* test=kunlun
* support global pooling for kunlun (#29293)
* test=kunlun
* update reduce_sum op on xpu (#29367)
* update reduce_sum op on xpu
* update reduce_sum op on xpu
* support running on xpu
* fix expand/uniform_random && concat/transpose to new api on xpu (#29280)
* fix expand && concat/transpose to new api
* update uniform_random_op
* update xpu_header
* 1. fix elementwise ops' bug  2. fix softmax_with_cross_entropy_op  3. add bilinear_interp_op (#29448)

Co-authored-by: root <[email protected]>
Co-authored-by: 卖鱼的哲学 <[email protected]>
Co-authored-by: QingshuChen <[email protected]>
Co-authored-by: taixiurong <[email protected]>
Co-authored-by: root <[email protected]>
1 parent 6b9302a commit 6bfc572

26 files changed: +1793, −739 lines

cmake/external/xpu.cmake

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@ endif()
 
 INCLUDE(ExternalProject)
 SET(XPU_PROJECT "extern_xpu")
-SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2020_11_10.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2020_12_04.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
 SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
 SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")

paddle/fluid/operators/concat_op_xpu.cc

Lines changed: 72 additions & 55 deletions

@@ -11,18 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
+#ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/operators/concat_op.h"
-
 #include <memory>
 #include <string>
 #include <vector>
-
-#ifdef PADDLE_WITH_MKLDNN
-#include <paddle/fluid/platform/mkldnn_helper.h>
-#endif
-
-#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu_header.h"
 
 namespace paddle {
 namespace operators {
@@ -32,8 +26,8 @@ template <typename DeviceContext, typename T>
 class ConcatXPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
+    auto ins = ctx.MultiInput<framework::LoDTensor>("X");
+    framework::LoDTensor* out = ctx.Output<framework::LoDTensor>("Out");
     int axis = ctx.Attr<int>("axis");
     PADDLE_ENFORCE_NE(ins[0], nullptr, platform::errors::InvalidArgument(
                                            "The input should not be null."));
@@ -47,6 +41,7 @@ class ConcatXPUKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_LT(axis, ins[0]->dims().size(),
                       platform::errors::InvalidArgument(
                           "concat: axis shoud < ins[0]->dims()!"));
+
     auto place = ctx.GetPlace();
     out->mutable_data<T>(place);
     std::vector<int> choose_idx;
@@ -57,43 +52,54 @@ class ConcatXPUKernel : public framework::OpKernel<T> {
         n++;
       }
     }
-    PADDLE_ENFORCE_LE(n, 8, platform::errors::InvalidArgument(
-                                "XPU only surpport at most 8 tensors for now"));
     PADDLE_ENFORCE_GT(
         n, 0, platform::errors::InvalidArgument("No tensor need concat?"));
-    int h = 1;
-    int w_except_axis = 1;
-    for (int i = 0; i < axis; ++i) {
-      h *= (ins[choose_idx[0]]->dims())[i];
-    }
-    for (int i = axis + 1; i < ins[0]->dims().size(); ++i) {
-      w_except_axis *= (ins[choose_idx[0]]->dims())[i];
-    }
-    for (int i = 1; i < n; ++i) {
-      int hh = 1;
-      int ww = 1;
-      for (int j = 0; j < axis; ++j) {
-        hh *= (ins[choose_idx[i]]->dims())[j];
+
+    // If axis is 0, the lod of the output is not the same as inputs.
+    if (axis == 0 && ins[0]->lod().size() > 0) {
+      size_t lod_size_0 = ins[0]->lod().size();
+      size_t lod_size = lod_size_0;
+      for (size_t i = 1; i < ins.size(); ++i) {
+        if (ins[i]->lod().size() > 0) {
+          PADDLE_ENFORCE_EQ(
+              ins[i]->lod().size(), lod_size_0,
+              platform::errors::Unimplemented(
+                  "The lod level of all input LoDTensors should be same. "
+                  "Maybe different lod level of input LoDTensors can concat,"
+                  "it is not supported currently. The lod level of %dth input "
+                  "is %d and first input is %d.",
+                  i, ins[i]->lod().size(), lod_size_0));
+        } else {
+          lod_size = 0;
+          break;
+        }
       }
-      for (int j = axis + 1; j < ins[i]->dims().size(); ++j) {
-        ww *= (ins[choose_idx[i]]->dims())[j];
+      if (lod_size) {
+        auto* out_lod = out->mutable_lod();
+        for (size_t i = 1; i < ins.size(); ++i) {
+          auto in_lod = ConvertToLengthBasedLoD(ins[i]->lod());
+          AppendLoD(out_lod, in_lod);
+        }
       }
-      PADDLE_ENFORCE_EQ(hh, h, platform::errors::InvalidArgument(
-                                   "concat: h should be eual!"));
-      PADDLE_ENFORCE_EQ(ww, w_except_axis,
-                        platform::errors::InvalidArgument(
-                            "concat: w should be eual except for axis!"));
     }
+
+    auto input_dims = ins[0]->dims();
+    std::vector<std::vector<int>> xdims_list(n);
+    for (int i = 0; i < n; ++i) {
+      std::vector<int> tmp_dims(input_dims.size());
+      for (int j = 0; j < input_dims.size(); ++j) {
+        tmp_dims[j] = ins[i]->dims()[j];
+      }
+      xdims_list[i] = tmp_dims;
+    }
+
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    std::unique_ptr<int[]> in_w_host(new int[n]);
-    std::unique_ptr<const float* []> ptrs(new const float*[n]);
+    std::vector<const T*> ptrs;
     for (int i = 0; i < n; ++i) {
-      ptrs[i] = ins[choose_idx[i]]->data<T>();
-      in_w_host[i] = w_except_axis * (ins[choose_idx[i]]->dims())[axis];
+      ptrs.push_back(ins[choose_idx[i]]->data<T>());
     }
-    int r =
-        xpu::concat<float>(dev_ctx.x_context(), h, (const int*)in_w_host.get(),
-                           n, (const float**)ptrs.get(), out->data<T>());
+    int r = xpu::concat<T>(dev_ctx.x_context(), ptrs, out->data<T>(),
                           xdims_list, axis);
     PADDLE_ENFORCE_EQ(
         r, XPU_SUCCESS,
         platform::errors::External(
@@ -102,6 +108,7 @@ class ConcatXPUKernel : public framework::OpKernel<T> {
             r));
   }
 };
+
 template <typename DeviceContext, typename T>
 class ConcatGradXPUKernel : public framework::OpKernel<T> {
  public:
@@ -132,37 +139,47 @@ class ConcatGradXPUKernel : public framework::OpKernel<T> {
         static_cast<int64_t>(ins[0]->dims().size()));
     // get output tensor that the name is not kEmptyVarName
     std::vector<framework::Tensor*> outputs;
+    std::vector<int> choose_idx;
+    int n = 0;
     for (size_t j = 0; j < outs.size(); ++j) {
       if (out_var_names[j] != framework::kEmptyVarName &&
           outs[j]->numel() != 0UL) {
         outs[j]->mutable_data<T>(ctx.GetPlace());
         outputs.push_back(outs[j]);
-      } else {
-        outputs.push_back(nullptr);
+        choose_idx.push_back(j);
+        n++;
       }
     }
     PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument(
                                    "concat_grad: axis shoud >= 0!"));
     PADDLE_ENFORCE_LT(axis, out_grad->dims().size(),
                       platform::errors::InvalidArgument(
                           "concat_grad: axis shoud < ins[0]->dims()!"));
-    auto out_grad_stride = framework::stride_numel(out_grad->dims());
-    int n = outputs.size();
-    PADDLE_ENFORCE_LE(n, 16,
-                      platform::errors::InvalidArgument(
-                          "XPU only surpport at most 16 tensors for now"));
-    int h = out_grad_stride[0] / out_grad_stride[axis];
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    std::unique_ptr<int[]> in_w_host(new int[n]);
-    std::unique_ptr<float* []> ptrs(new float*[n]);
+
+    auto input_dims = ins[0]->dims();
+    std::vector<int> split_list(n);
+    std::vector<int> xdims_list(input_dims.size());
+    int total_length = 0;
+    for (int i = 0; i < n; ++i) {
+      split_list[i] = ins[i]->dims()[axis];
+      total_length += ins[i]->dims()[axis];
+    }
+    for (int i = 0; i < input_dims.size(); ++i) {
+      if (i == axis) {
+        continue;
+      }
+      xdims_list[i] = input_dims[i];
+    }
+    xdims_list[axis] = total_length;
+
+    std::vector<T*> ptrs(n);
     for (int i = 0; i < n; ++i) {
-      auto out_stride = framework::stride_numel(outputs[i]->dims());
       ptrs[i] = outputs[i]->data<T>();
-      in_w_host[i] = out_stride[axis];
     }
-    int r = xpu::concat_grad(dev_ctx.x_context(), h, in_w_host.get(), n,
-                             reinterpret_cast<float**>(ptrs.get()),
-                             out_grad->data<T>());
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    int r = xpu::split<T>(dev_ctx.x_context(), out_grad->data<T>(), ptrs,
+                          xdims_list, split_list, axis);
     PADDLE_ENFORCE_EQ(
         r, XPU_SUCCESS,
         platform::errors::External(
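
Note: the rewritten forward kernel feeds xpu::concat<T> a per-input shape list (xdims_list) plus the concat axis instead of the old flattened (h, w) layout, and the gradient kernel now maps onto xpu::split<T> with a split_list along the axis. Below is a minimal host-side sketch of that shape bookkeeping, detached from Paddle and the Kunlun runtime: the input shapes are made-up examples and the xpu:: device calls themselves are omitted, so this only mirrors how xdims_list, split_list, and the concatenated extent are derived in the kernels above.

// Host-side sketch of the argument bookkeeping done by ConcatXPUKernel and
// ConcatGradXPUKernel above. The shapes below are made-up examples and the
// xpu::concat<T>/xpu::split<T> device calls are omitted, since they require
// a Kunlun device context.
#include <cassert>
#include <iostream>
#include <vector>

int main() {
  // Two hypothetical inputs concatenated along axis 1:
  // [2, 3, 4] and [2, 5, 4] -> output [2, 8, 4].
  std::vector<std::vector<int>> xdims_list = {{2, 3, 4}, {2, 5, 4}};
  const int axis = 1;
  const int n = static_cast<int>(xdims_list.size());

  // Forward pass: xpu::concat<T>(ctx, ptrs, out, xdims_list, axis) receives
  // one shape vector per input, exactly as built in the kernel.

  // Grad pass: split_list holds each input's extent along `axis`, and a
  // single shape vector describes the concatenated gradient, matching the
  // xpu::split<T>(ctx, out_grad, ptrs, xdims, split_list, axis) call.
  std::vector<int> split_list(n);
  std::vector<int> out_dims = xdims_list[0];
  int total_length = 0;
  for (int i = 0; i < n; ++i) {
    split_list[i] = xdims_list[i][axis];
    total_length += xdims_list[i][axis];
  }
  out_dims[axis] = total_length;

  assert(out_dims[axis] == 8);
  std::cout << "concatenated extent along axis " << axis << ": "
            << out_dims[axis] << "\n";
  return 0;
}

With these assumed shapes the concatenated extent along axis 1 is 3 + 5 = 8, which is the value the grad kernel writes into xdims_list[axis] before calling xpu::split<T>.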

paddle/fluid/operators/conv_op_xpu.cc

Lines changed: 27 additions & 98 deletions

@@ -27,10 +27,6 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
     // that avoids modifying the variable in the Scope.
     Tensor filter = *context.Input<Tensor>("Filter");
     Tensor* output = context.Output<Tensor>("Output");
-    // Tensor* max_input = context.Output<Tensor>("MaxInput");
-    // Tensor* max_filter = context.Output<Tensor>("MaxFilter");
-    // max_input->mutable_data<T>(context.GetPlace());
-    // max_filter->mutable_data<T>(context.GetPlace());
     output->mutable_data<T>(context.GetPlace());
     int groups = context.Attr<int>("groups");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
@@ -43,62 +39,25 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
     const int f = static_cast<int>(filter.dims()[0]);
     const int win_h = static_cast<int>(filter.dims()[2]);
     const int win_w = static_cast<int>(filter.dims()[3]);
-    PADDLE_ENFORCE_EQ(
-        dilations[0] == 1 && dilations[1] == 1, true,
-        platform::errors::InvalidArgument("XPU only support dilation == 1."));
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    // PADDLE_ENFORCE_EQ(
-    //     xpu::findmax(dev_ctx.x_context(), input->data<T>(), input->numel(),
-    //                  max_input->data<T>()) == xpu::Error_t::SUCCESS,
-    //     true, platform::errors::InvalidArgument(
-    //               "XPU conv kernel error,can not finde max_input,please "
-    //               "check whether Baidu Kunlun "
-    //               "Card is properly installed."));
-    // PADDLE_ENFORCE_EQ(
-    //     xpu::findmax(dev_ctx.x_context(), filter.data<T>(), filter.numel(),
-    //                  max_filter->data<T>()) == xpu::Error_t::SUCCESS,
-    //     true, platform::errors::InvalidArgument(
-    //               "XPU conv kernel error,can not find max_filter,please "
-    //               "check whether Baidu Kunlun "
-    //               "Card is properly installed."));
-    if (groups == 1) {
-      int r = xpu::conv2d_forward_int16<float, float, float, float>(
-          dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
-          strides[0], strides[1], paddings[0], paddings[1], dilations[0],
-          dilations[1], groups, input->data<float>(), filter.data<float>(),
-          output->data<float>(), nullptr, nullptr, xpu::Activation_t::LINEAR,
-          nullptr, nullptr);
-      // max_input->data<float>(), max_filter->data<float>());
-      PADDLE_ENFORCE_EQ(
-          r, XPU_SUCCESS,
-          platform::errors::External("XPU conv kernel return wrong value[%d], "
-                                     "please check whether Baidu Kunlun Card "
-                                     "is properly installed.",
-                                     r));
-    } else {
-      int r = xpu::conv2d_int16_with_group<float, float, float>(
-          dev_ctx.x_context(), input->data<float>(), filter.data<float>(),
-          output->data<float>(), batch_size, img_c, img_h, img_w, f, win_h,
-          win_w, groups, strides[0], strides[1], paddings[0], paddings[1],
-          nullptr, nullptr);
-      // max_input->data<float>(), max_filter->data<float>());
-      PADDLE_ENFORCE_EQ(
-          r, XPU_SUCCESS,
-          platform::errors::External("XPU conv kernel return wrong value[%d], "
-                                     "please check whether Baidu Kunlun Card "
-                                     "is properly installed.",
-                                     r));
-    }
+    std::vector<int> k_size;
+    k_size.push_back(win_h);
+    k_size.push_back(win_w);
+    int r = xpu::conv2d<float, float, float, int16_t>(
+        dev_ctx.x_context(), input->data<float>(), filter.data<float>(),
+        output->data<float>(), batch_size, img_c, img_h, img_w, f, k_size,
+        strides, paddings, dilations, groups, nullptr, nullptr, nullptr, true);
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU conv kernel return wrong value[%d %s]",
+                                   r, XPUAPIErrorMsg[r]));
   }
 };
 template <typename DeviceContext, typename T>
 class GemmConvGradXPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* input = context.Input<Tensor>("Input");
-    // const Tensor* max_input = context.Input<Tensor>("MaxInput");
-    // const Tensor* max_filter = context.Input<Tensor>("MaxFilter");
-    // Tensor* max_output_grad = context.Output<Tensor>("MaxOutputGrad");
     const Tensor* output_grad =
         context.Input<Tensor>(framework::GradVarName("Output"));
     Tensor* input_grad =
@@ -115,11 +74,6 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
     const int batch_size = static_cast<int>(input->dims()[0]);
-    PADDLE_ENFORCE_EQ(groups == 1, true, platform::errors::InvalidArgument(
-                                             "XPU only support groups == 1."));
-    PADDLE_ENFORCE_EQ(
-        dilations[0] == 1 && dilations[1] == 1, true,
-        platform::errors::InvalidArgument("XPU only support dilation == 1."));
     const int img_c = static_cast<int>(input->dims()[1]);
     const int img_h = static_cast<int>(input->dims()[2]);
     const int img_w = static_cast<int>(input->dims()[3]);
@@ -133,52 +87,24 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
       filter_grad->mutable_data<T>(context.GetPlace());
     }
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    // max_output_grad->Resize({4});
-    // max_output_grad->mutable_data<T>(context.GetPlace());
-    // PADDLE_ENFORCE_EQ(
-    //     xpu::findmax(dev_ctx.x_context(), output_grad->data<T>(),
-    //                  output_grad->numel(),
-    //                  max_output_grad->data<T>()) == xpu::Error_t::SUCCESS,
-    //     true,
-    //     platform::errors::External(
-    //         "XPU conv kernel error, can not find max_output_grad, please
-    //         check "
-    //         "whether Baidu Kunlun Card is "
-    //         "properly installed."));
-    if (input_grad) {
-      int r = xpu::conv2d_backward_int16(
-          dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
-          strides[0], strides[1], paddings[0], paddings[1], dilations[0],
-          dilations[1], groups, output_grad->data<float>(),
-          filter.data<float>(), input_grad->data<float>(), nullptr, nullptr);
-      // max_output_grad->data<float>(), max_filter->data<float>());
-      PADDLE_ENFORCE_EQ(
-          r, XPU_SUCCESS,
-          platform::errors::External("XPU conv kernel return wrong value[%d], "
-                                     "please check whether Baidu Kunlun Card "
-                                     "is properly installed.",
-                                     r));
-    }
-    if (filter_grad) {
-      int r = xpu::conv2d_backward_weight_int16(
-          dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
-          strides[0], strides[1], paddings[0], paddings[1], dilations[0],
-          dilations[1], groups, output_grad->data<float>(),
-          input->data<float>(), filter_grad->data<float>(), nullptr, nullptr);
-      // max_output_grad->data<float>(), max_input->data<float>());
-      PADDLE_ENFORCE_EQ(
-          r, XPU_SUCCESS,
-          platform::errors::External("XPU conv kernel return wrong value[%d], "
-                                     "please check whether Baidu Kunlun Card "
-                                     "is properly installed.",
-                                     r));
-    }
+    std::vector<int> k_size;
+    k_size.push_back(win_h);
+    k_size.push_back(win_w);
+    int r = xpu::conv2d_grad<float, float, float, int16_t>(
+        dev_ctx.x_context(), input->data<T>(), filter.data<T>(),
+        output_grad->data<T>(), input_grad ? input_grad->data<T>() : nullptr,
+        filter_grad ? filter_grad->data<T>() : nullptr, batch_size, img_c,
+        img_h, img_w, f, k_size, strides, paddings, dilations, groups, nullptr,
+        nullptr, nullptr, nullptr, nullptr, true);
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU conv kernel return wrong value[%d %s]",
+                                   r, XPUAPIErrorMsg[r]));
   }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
-// TODO(xingzhaolong): neon kernel for mobile
 REGISTER_OP_XPU_KERNEL(
     depthwise_conv2d,
     ops::GemmConvXPUKernel<paddle::platform::XPUDeviceContext, float>);
@@ -187,4 +113,7 @@ REGISTER_OP_XPU_KERNEL(
 REGISTER_OP_XPU_KERNEL(
     conv2d_grad,
     ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    depthwise_conv2d_grad,
+    ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
 #endif
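
Note: both conv kernels now collapse to a single xpu::conv2d / xpu::conv2d_grad call parameterized by k_size, strides, paddings, dilations, and groups (the old dilation == 1 and groups == 1 restrictions are dropped). Below is a standalone sketch of the NCHW parameter extraction and the standard output-size arithmetic those arguments imply; the tensor shapes, strides, and paddings are hypothetical and no device call is made, so it only illustrates the argument plumbing, not the kernel itself.

// Standalone sketch of the NCHW parameter extraction behind the new
// xpu::conv2d / xpu::conv2d_grad calls above. Tensor shapes are hypothetical
// and no device call is made; this only shows how batch_size, img_c, img_h,
// img_w, f, and k_size are derived and what output size they imply.
#include <iostream>
#include <vector>

int main() {
  std::vector<int> input_dims = {8, 3, 224, 224};  // N, C, H, W (assumed)
  std::vector<int> filter_dims = {16, 3, 3, 3};    // F, C/groups, kH, kW (assumed)

  const int batch_size = input_dims[0];
  const int img_c = input_dims[1];
  const int img_h = input_dims[2];
  const int img_w = input_dims[3];
  const int f = filter_dims[0];
  std::vector<int> k_size = {filter_dims[2], filter_dims[3]};
  std::vector<int> strides = {1, 1};    // assumed
  std::vector<int> paddings = {1, 1};   // assumed
  std::vector<int> dilations = {1, 1};  // assumed
  const int groups = 1;                 // assumed

  // Standard convolution output extent:
  // out = (in + 2*pad - dilation*(k - 1) - 1) / stride + 1
  auto out_extent = [](int in, int pad, int dil, int k, int stride) {
    return (in + 2 * pad - dil * (k - 1) - 1) / stride + 1;
  };
  const int out_h =
      out_extent(img_h, paddings[0], dilations[0], k_size[0], strides[0]);
  const int out_w =
      out_extent(img_w, paddings[1], dilations[1], k_size[1], strides[1]);

  std::cout << "conv2d args: N=" << batch_size << " C=" << img_c
            << " H=" << img_h << " W=" << img_w << " F=" << f << " k="
            << k_size[0] << "x" << k_size[1] << " groups=" << groups << "\n";
  std::cout << "output shape: [" << batch_size << ", " << f << ", " << out_h
            << ", " << out_w << "]\n";  // [8, 16, 224, 224]
  return 0;
}

For the assumed 3x3, stride-1, pad-1 configuration this prints an output shape of [8, 16, 224, 224]; the same shape arithmetic applies to both the forward and the gradient call.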
