
Commit 6e154fc

ShawnNew, qipengh, cifar10, Ashelly1, PeiyuLau authored
[MLU] cherry-pick from develop to release/2.4 (#48313)
* [MLU] fix compute error of dropout op (#45923)
* [MLU] add mergedAdam kernel. (#45965)
* [MLU] add int64 support for mlu one_hot_v2 (#46313)
* [MLU] fix profiler compile failure (#46208)
* [MLU] add barrier_op kernel. (#46417)
* [MLU] fluid: add mluop (#46429)
* [MLU] add huber_loss kernel. (#46455)
* [MLU] add mlu kernel for add_reduce_max_grad (#45651) Co-authored-by: liupeiyu <[email protected]>
* [MLU] add_fluid_mluop_yolo_box (#46573)
* [MLU] fix phi::Tensor compile error of mlu. (#46649)
* [MLU] add fluid MLUOps prior_box (#46585)
* [MLU] fix cmake error (#46772)
* [MLU] fix unittest of sync_bn (#46797)
* [MLU] add masterparam support for mlu adamw. (#46804)
* [MLU] add int64 support for allgather. (#46830)
* [MLU] fix compile error & add mlu blacklist function. (#47439)
* [MLU] fix softmax_with_cross_entropy failed in 370-X8.
* [MLU] fix cncl stuck caused by multiple initializations.
* [MLU] fix code style check.

Co-authored-by: qipengh <[email protected]>
Co-authored-by: cifar10 <[email protected]>
Co-authored-by: Lux et Veritas <[email protected]>
Co-authored-by: liupeiyu <[email protected]>
Co-authored-by: ronnywang <[email protected]>
1 parent 96e974a commit 6e154fc

37 files changed, +3247 −702 lines

cmake/neuware.cmake

Lines changed: 3 additions & 1 deletion
@@ -15,12 +15,14 @@ set(NEUWARE_LIB_DIR ${NEUWARE_HOME}/lib64)
 include_directories(${NEUWARE_INCLUDE_DIR})

 set(CNNL_LIB ${NEUWARE_LIB_DIR}/libcnnl.so)
+set(MLUOP_LIB ${NEUWARE_LIB_DIR}/libmluops.so)
 set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so)
 set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so)
 set(CNPAPI_LIB ${NEUWARE_LIB_DIR}/libcnpapi.so)

 generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake")
-set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB} ${CNPAPI_LIB})
+set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${MLUOP_LIB} ${CNRT_LIB} ${CNDRV_LIB}
+    ${CNPAPI_LIB})

 if(WITH_CNCL)
   message(STATUS "Compile with CNCL!")
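Note: the hunk above adds the MLU operator library (libmluops.so) to the dummy neuware_lib dependency set so the MLUOP calls introduced elsewhere in this commit link. A minimal sanity-check sketch, not part of the commit, assuming a POSIX system with ${NEUWARE_HOME}/lib64 on the loader path:

// check_mluops.cc -- build with: g++ check_mluops.cc -ldl
#include <dlfcn.h>
#include <cstdio>

int main() {
  // RTLD_NOW forces immediate symbol resolution, surfacing link problems.
  void* handle = dlopen("libmluops.so", RTLD_NOW | RTLD_GLOBAL);
  if (handle == nullptr) {
    std::fprintf(stderr, "failed to load libmluops.so: %s\n", dlerror());
    return 1;
  }
  std::puts("libmluops.so resolved");
  dlclose(handle);
  return 0;
}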

paddle/fluid/imperative/prepared_operator.cc

Lines changed: 48 additions & 0 deletions
@@ -146,6 +146,48 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
       kernel_signature_(std::move(kernel_signature)),
       phi_kernel_(phi_kernel) {}

+#ifdef PADDLE_WITH_MLU
+
+static void tokenize(const std::string& ops,
+                     char delim,
+                     std::unordered_set<std::string>* op_set) {
+  std::string::size_type beg = 0;
+  for (uint64_t end = 0; (end = ops.find(delim, end)) != std::string::npos;
+       ++end) {
+    op_set->insert(ops.substr(beg, end - beg));
+    beg = end + 1;
+  }
+
+  op_set->insert(ops.substr(beg));
+}
+
+static bool is_in_mlu_black_list(const std::string& op_name) {
+  static bool inited = false;
+  static std::unordered_set<std::string> mlu_black_list;
+  static std::mutex s_mtx;
+  if (!inited) {
+    std::lock_guard<std::mutex> guard(s_mtx);
+    if (!inited) {
+      if (std::getenv("MLU_BLACK_LIST") != nullptr) {
+        std::string ops(std::getenv("MLU_BLACK_LIST"));
+        tokenize(ops, ',', &mlu_black_list);
+      }
+      inited = true;
+      VLOG(3) << "MLU Black List: ";
+      for (auto iter = mlu_black_list.begin(); iter != mlu_black_list.end();
+           ++iter) {
+        VLOG(3) << *iter << " ";
+      }
+    }
+  }
+  if (mlu_black_list.find(op_name) != mlu_black_list.end()) {
+    return true;
+  }
+  return false;
+}
+
+#endif
+
 template <typename VarType>
 PreparedOp PrepareImpl(
     const NameVarMap<VarType>& ins,
@@ -194,6 +236,12 @@ PreparedOp PrepareImpl(

 #endif

+#ifdef PADDLE_WITH_MLU
+  if (is_in_mlu_black_list(op.Type())) {
+    expected_kernel_key.place_ = platform::CPUPlace();
+  }
+#endif
+
   bool has_phi_kernel = false;

   const auto* arg_map_fn = phi_op_utils_map.GetArgumentMappingFn(op.Type());
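Note: is_in_mlu_black_list() parses the MLU_BLACK_LIST environment variable (a comma-separated list of op names) exactly once under double-checked locking, and PrepareImpl() then pins any listed op to CPUPlace so it falls back to its CPU kernel. A self-contained sketch of the same parsing, with "relu,conv2d" standing in for a hypothetical blacklist value:

#include <iostream>
#include <string>
#include <unordered_set>

// Same comma-separated split that the tokenize() helper above performs.
static void tokenize(const std::string& ops, char delim,
                     std::unordered_set<std::string>* op_set) {
  std::string::size_type beg = 0;
  for (std::string::size_type end = ops.find(delim); end != std::string::npos;
       end = ops.find(delim, beg)) {
    op_set->insert(ops.substr(beg, end - beg));
    beg = end + 1;
  }
  op_set->insert(ops.substr(beg));
}

int main() {
  std::unordered_set<std::string> black_list;
  tokenize("relu,conv2d", ',', &black_list);  // hypothetical op names
  // An op found in the set is routed to CPUPlace by PrepareImpl().
  std::cout << "conv2d blacklisted: " << black_list.count("conv2d") << "\n";
  return 0;
}

In practice this corresponds to exporting something like MLU_BLACK_LIST=relu,conv2d before launching the program.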
paddle/fluid/operators/collective/barrier_op_mlu.cc

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/barrier_op.h"
+#if defined(PADDLE_WITH_CNCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class BarrierOpMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_CNCL)
+    auto in = ctx.Input<phi::DenseTensor>("X");
+    auto out = ctx.Output<phi::DenseTensor>("Out");
+
+    auto place = ctx.GetPlace();
+    cnclDataType_t dtype =
+        platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
+    int64_t numel = in->numel();
+    const void* sendbuff = in->data();
+    void* recvbuff = out->mutable_data<T>(place);
+
+    int rid = ctx.Attr<int>("ring_id");
+    auto cncl_comm = platform::CNCLCommContext::Instance().Get(rid, place);
+    auto* comm = cncl_comm->comm();
+    auto comm_stream = cncl_comm->stream();
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MLUDeviceContext>();
+    cnclReduceOp_t cncl_red_type = cnclSum;
+    dev_ctx.Wait();
+    PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(
+        sendbuff, recvbuff, numel, dtype, cncl_red_type, comm, comm_stream));
+    PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with CNCL."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_MLU_KERNEL(barrier, ops::BarrierOpMLUKernel<int>);
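Note: CNCL exposes no dedicated barrier primitive, so the kernel emulates one: dev_ctx.Wait() drains the compute stream, a one-element cnclAllReduce runs on the communicator stream, and cnrtQueueSync blocks the host until it completes. Since no rank can finish an all-reduce before every rank has entered it, this gives barrier semantics. The same trick expressed with MPI, purely as an illustrative analogue (the kernel itself uses CNCL on the MLU device):

#include <mpi.h>

// A barrier emulated by an all-reduce of a single element: no rank can
// leave the all-reduce before every rank has entered it.
void barrier_via_allreduce(MPI_Comm comm) {
  int send = 0, recv = 0;
  MPI_Allreduce(&send, &recv, 1, MPI_INT, MPI_SUM, comm);
}

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  barrier_via_allreduce(MPI_COMM_WORLD);  // all ranks rendezvous here
  MPI_Finalize();
  return 0;
}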

paddle/fluid/operators/collective/c_allgather_op_mlu.cc

Lines changed: 46 additions & 8 deletions
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/collective/c_allgather_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"

 #if defined(PADDLE_WITH_CNCL)
 #include "paddle/fluid/platform/collective_helper.h"
@@ -27,15 +28,14 @@ template <typename T>
 class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    auto place = ctx.GetPlace();
+    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
 #if defined(PADDLE_WITH_CNCL)
-    auto x = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-    cnclDataType_t dtype =
-        platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
+    auto x = ctx.Input<phi::DenseTensor>("X");
+    auto out = ctx.Output<phi::DenseTensor>("Out");

     int nranks = ctx.Attr<int>("nranks");
     int rid = ctx.Attr<int>("ring_id");
-    auto place = ctx.GetPlace();
     auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
     PADDLE_ENFORCE_EQ(
         nranks,
@@ -48,19 +48,56 @@ class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(out_dims, place);

     uint32_t send_numel = x->numel();
-    void* send_buff = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
-    void* recv_buff = reinterpret_cast<void*>(out->data<T>());
+    void* send_buff;
+    void* recv_buff;
+    phi::DenseTensor in_tensor, out_tensor;
+    if (framework::TransToProtoVarType(x->dtype()) ==
+        framework::proto::VarType::INT64) {
+      // cast from int64 to int32 since cncl do not support int64
+      in_tensor.mutable_data<int32_t>(x->dims(), place);
+      out_tensor.mutable_data<int32_t>(out->dims(), place);
+      MLUCnnlTensorDesc x_int64_desc(*x);
+      MLUCnnlTensorDesc x_int32_desc(in_tensor);
+      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
+      MLUCnnl::Cast(ctx,
+                    cast_type,
+                    x_int64_desc.get(),
+                    GetBasePtr(x),
+                    x_int32_desc.get(),
+                    GetBasePtr(&in_tensor));
+      send_buff = reinterpret_cast<void*>(in_tensor.data<int32_t>());
+      recv_buff = reinterpret_cast<void*>(out_tensor.data<int32_t>());
+    } else {
+      in_tensor.ShareDataWith(*x);
+      out_tensor.ShareDataWith(*out);
+      send_buff = reinterpret_cast<void*>(in_tensor.data<T>());
+      recv_buff = reinterpret_cast<void*>(out_tensor.data<T>());
+    }

     mluStream stream = nullptr;
     if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
       stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
     } else {
       stream = comm->stream();
     }
+    cnclDataType_t dtype = platform::ToCNCLDataType(
+        framework::TransToProtoVarType(in_tensor.dtype()));

     PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(
         send_buff, recv_buff, send_numel, dtype, comm->comm(), stream));
+    if (framework::TransToProtoVarType(x->dtype()) ==
+        framework::proto::VarType::INT64) {
+      // cast back from int64 out_tensor to out
+      MLUCnnlTensorDesc out_int64_desc(*out);
+      MLUCnnlTensorDesc out_int32_desc(out_tensor);
+      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
+      MLUCnnl::Cast(ctx,
+                    cast_type,
+                    out_int32_desc.get(),
+                    GetBasePtr(&out_tensor),
+                    out_int64_desc.get(),
+                    GetBasePtr(out));
+    }
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with MLU."));
@@ -80,4 +117,5 @@ REGISTER_OP_MLU_KERNEL(c_allgather,
                        ops::CAllGatherOpMLUKernel<int>,
                        ops::CAllGatherOpMLUKernel<int8_t>,
                        ops::CAllGatherOpMLUKernel<int16_t>,
+                       ops::CAllGatherOpMLUKernel<int64_t>,
                        ops::CAllGatherOpMLUKernel<plat::float16>);
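Note: the int64 path above exists because CNCL has no int64 data type, so the kernel casts to int32, gathers, and casts the result back; presumably values outside the int32 range would not survive the narrowing cast. A plain-C++ illustration of that round trip (not part of the commit):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Values that fit in 32 bits survive the narrow-then-widen round trip
  // the kernel performs around cnclAllGather.
  std::vector<int64_t> src = {42, -7, (1LL << 31) - 1};
  std::vector<int32_t> narrowed;
  for (int64_t v : src) narrowed.push_back(static_cast<int32_t>(v));
  std::vector<int64_t> restored(narrowed.begin(), narrowed.end());
  std::cout << "round trip ok: " << (restored == src ? "yes" : "no") << "\n";
  return 0;
}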

paddle/fluid/operators/detection/CMakeLists.txt

Lines changed: 5 additions & 2 deletions
@@ -42,19 +42,23 @@ if(WITH_XPU)
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op_xpu.cc)
   detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
   detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
 elseif(WITH_MLU)
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op_mlu.cc)
-  detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
 elseif(WITH_ASCEND_CL)
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op_npu.cc)
   detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_npu.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
 else()
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op.cu)
   detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
   # detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
 endif()

@@ -73,7 +77,6 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc)
 detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
-detection_library(yolo_box_op SRCS yolo_box_op.cc)
 detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc
                   box_decoder_and_assign_op.cu)
 detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc
paddle/fluid/operators/detection/prior_box_op_mlu.cc

Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/detection/prior_box_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class PriorBoxMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<phi::DenseTensor>("Input");
+    auto* image = ctx.Input<phi::DenseTensor>("Image");
+    auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
+    auto* variances = ctx.Output<phi::DenseTensor>("Variances");
+    float step_w = ctx.Attr<float>("step_w");
+    float step_h = ctx.Attr<float>("step_h");
+    float offset = ctx.Attr<float>("offset");
+    bool clip = ctx.Attr<bool>("clip");
+    bool min_max_aspect_ratios_order =
+        ctx.Attr<bool>("min_max_aspect_ratios_order");
+
+    int im_width = image->dims()[3];
+    int im_height = image->dims()[2];
+    int width = input->dims()[3];
+    int height = input->dims()[2];
+
+    auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
+    bool flip = ctx.Attr<bool>("flip");
+    std::vector<float> new_aspect_ratios;
+    ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios);
+    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
+    phi::DenseTensor ratios;
+    paddle::framework::TensorFromVector(new_aspect_ratios, dev_ctx, &ratios);
+    MLUOpTensorDesc new_aspect_ratios_desc(ratios);
+
+    auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
+    phi::DenseTensor min;
+    paddle::framework::TensorFromVector(min_sizes, dev_ctx, &min);
+    MLUOpTensorDesc min_sizes_desc(min);
+
+    auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
+    phi::DenseTensor max;
+    paddle::framework::TensorFromVector(max_sizes, dev_ctx, &max);
+    MLUOpTensorDesc max_sizes_desc(max);
+
+    auto variances_attr = ctx.Attr<std::vector<float>>("variances");
+    phi::DenseTensor var_tensor;
+    paddle::framework::TensorFromVector(variances_attr, dev_ctx, &var_tensor);
+    MLUOpTensorDesc variances_attr_desc(var_tensor);
+
+    auto place = ctx.GetPlace();
+
+    boxes->mutable_data<T>(place);
+    variances->mutable_data<T>(place);
+
+    MLUOpTensorDesc var_desc(*variances);
+    MLUOpTensorDesc output_desc(*boxes);
+    MLUOP::OpPriorBox(ctx,
+                      min_sizes_desc.get(),
+                      GetBasePtr(&min),
+                      new_aspect_ratios_desc.get(),
+                      GetBasePtr(&ratios),
+                      variances_attr_desc.get(),
+                      GetBasePtr(&var_tensor),
+                      max_sizes_desc.get(),
+                      GetBasePtr(&max),
+                      height,
+                      width,
+                      im_height,
+                      im_width,
+                      step_h,
+                      step_w,
+                      offset,
+                      clip,
+                      min_max_aspect_ratios_order,
+                      output_desc.get(),
+                      GetBasePtr(boxes),
+                      var_desc.get(),
+                      GetBasePtr(variances));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_MLU_KERNEL(prior_box, ops::PriorBoxMLUKernel<float>);
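Note: the kernel lifts the prior-box attributes (min_sizes, max_sizes, the expanded aspect_ratios, variances) into device tensors via TensorFromVector, wraps each in an MLUOpTensorDesc, and issues a single MLUOP::OpPriorBox call. A sketch of what ExpandAspectRatios is assumed to do here, following the usual SSD prior-box convention (an assumption about the helper, not its verbatim source):

#include <cmath>
#include <vector>

// Assumed behavior: start from ratio 1.0, append each input ratio and,
// when flip is set, its reciprocal, skipping near-duplicates.
void ExpandAspectRatiosSketch(const std::vector<float>& ratios, bool flip,
                              std::vector<float>* out) {
  constexpr float kEps = 1e-6f;
  out->assign({1.0f});
  for (float r : ratios) {
    bool seen = false;
    for (float v : *out) seen = seen || std::fabs(r - v) < kEps;
    if (seen) continue;
    out->push_back(r);
    if (flip) out->push_back(1.0f / r);
  }
}

// e.g. ratios = {2.0f, 3.0f} with flip = true expands to
// {1.0f, 2.0f, 0.5f, 3.0f, 1.0f/3.0f}.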
