
Commit 0f266ac

cherry pick xpu to 2.1 (#34000)
* update xpu cmake for kunlun (#33328)
* xpu support amp (#33809)
* fix bug DLTP-31078 (#33877)
* update xpu cmake (#33906)
* [xpu] add dropout & amp ops in xpu place (#33891)

Co-authored-by: TTerror <[email protected]>
1 parent ed7903c commit 0f266ac

19 files changed: +938, -187 lines changed

cmake/external/xpu.cmake

Lines changed: 9 additions & 6 deletions
@@ -27,19 +27,18 @@ ELSEIF(WITH_CENTOS)
   SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64")
   SET(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64")
   SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+
 ELSE ()
   SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
   SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64")
   SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
 ENDIF()

-IF(NOT XPU_BASE_URL)
-  SET(XPU_BASE_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20210527")
-ENDIF()
-
+SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
+SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210701")
 SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
-SET(XPU_XCCL_URL "${XPU_BASE_URL}/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE)

 SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
@@ -96,7 +95,11 @@ ELSE(WITH_XPU_BKCL)
   TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
 ENDIF(WITH_XPU_BKCL)

-ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
+if(NOT XPU_SDK_ROOT)
+  ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
+else()
+  ADD_CUSTOM_TARGET(extern_xpu DEPENDS xpulib)
+endif()

 # Ensure that xpu/api.h can be included without dependency errors.
 file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "")

paddle/fluid/imperative/amp_auto_cast.cc

Lines changed: 4 additions & 2 deletions
@@ -33,7 +33,8 @@ AmpOperators::AmpOperators()
   for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
     bool supported = false;
     for (auto& kernel_type : it->second) {
-      if (platform::is_gpu_place(kernel_type.first.place_) &&
+      if ((platform::is_gpu_place(kernel_type.first.place_) ||
+           platform::is_xpu_place(kernel_type.first.place_)) &&
           kernel_type.first.data_type_ == fp16_dtype) {
         supported = true;
       }
@@ -91,7 +92,8 @@ inline std::string GetDtypeStr(

 inline bool NeedCast(const std::shared_ptr<VarBase>& var) {
   if (platform::is_gpu_place(var->Place()) ||
-      platform::is_cuda_pinned_place(var->Place())) {
+      platform::is_cuda_pinned_place(var->Place()) ||
+      platform::is_xpu_place(var->Place())) {
     // CudaPinndePlace is added for varbase created by dataloader
     if (var->DataType() == framework::proto::VarType::FP32 ||
         var->DataType() == framework::proto::VarType::FP16) {
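The two hunks above widen the imperative AMP device check from CUDA-only to CUDA-or-XPU: an fp16 kernel registration now marks an op as AMP-capable on XPU as well, and NeedCast now considers XPU-resident variables. The standalone sketch below mirrors that predicate for readers who want to see the rule in isolation; the Place and DType enums and the need_cast helper are illustrative stand-ins, not Paddle's actual types, and this sketch is not part of the commit.

#include <cassert>

// Illustrative stand-ins for Paddle's place and dtype tags (not the real API).
enum class Place { kCPU, kCUDA, kCUDAPinned, kXPU };
enum class DType { kFP16, kFP32, kFP64 };

// Mirrors the predicate in NeedCast after this patch: a variable is a casting
// candidate only if it lives on a device AMP manages (GPU, CUDA-pinned, or XPU)
// and already holds an FP16 or FP32 payload.
bool need_cast(Place place, DType dtype) {
  const bool amp_managed_place = place == Place::kCUDA ||
                                 place == Place::kCUDAPinned ||
                                 place == Place::kXPU;
  const bool amp_managed_dtype = dtype == DType::kFP16 || dtype == DType::kFP32;
  return amp_managed_place && amp_managed_dtype;
}

int main() {
  assert(need_cast(Place::kXPU, DType::kFP32));   // eligible after this patch
  assert(!need_cast(Place::kCPU, DType::kFP32));  // CPU tensors are left alone
  assert(!need_cast(Place::kXPU, DType::kFP64));  // FP64 is never auto-cast
  return 0;
}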
paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc

Lines changed: 170 additions & 0 deletions
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace operators {

template <typename T>
class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
  using MPDType = typename details::MPTypeTrait<T>::Type;
  using XPUTyp = typename XPUTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
    const auto xs = ctx.MultiInput<framework::Tensor>("X");
    const auto* scale = ctx.Input<framework::Tensor>("Scale");
    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
    auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");

    const MPDType* scale_data = scale->data<MPDType>();
    bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());

    // cpy to cpu
    bool cpu_found_inf_data = false;

    MPDType cpu_scale_data;
    if (platform::is_xpu_place(scale->place())) {
      xpu_memcpy(&cpu_scale_data, scale_data, sizeof(MPDType),
                 XPUMemcpyKind::XPU_DEVICE_TO_HOST);
    } else {
      cpu_scale_data = (*scale_data);
    }
    MPDType inverse_scale = 1.0 / cpu_scale_data;
    for (size_t i = 0; i < xs.size(); ++i) {
      const auto* x = xs[i];
      auto* out = outs[i];
      out->mutable_data<T>(dev_ctx.GetPlace());
      framework::Tensor is_finite =
          ctx.AllocateTmpTensor<bool, platform::XPUDeviceContext>(x->dims(),
                                                                  dev_ctx);
      framework::Tensor is_nan =
          ctx.AllocateTmpTensor<bool, platform::XPUDeviceContext>(x->dims(),
                                                                  dev_ctx);
      framework::Tensor is_finite_and_nan =
          ctx.AllocateTmpTensor<bool, platform::XPUDeviceContext>(x->dims(),
                                                                  dev_ctx);
      if (cpu_found_inf_data == false) {
        int r = xpu::isfinite(dev_ctx.x_context(),
                              reinterpret_cast<const XPUTyp*>(x->data<T>()),
                              is_finite.data<bool>(), x->numel());
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(isfinite) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));
        r = xpu::logical_not(dev_ctx.x_context(), reinterpret_cast<const bool*>(
                                                      is_finite.data<bool>()),
                             is_finite.data<bool>(), x->numel());
        PADDLE_ENFORCE_EQ(
            r, XPU_SUCCESS,
            platform::errors::External("XPU API(logical_not) return wrong "
                                       "value[%d %s]",
                                       r, XPUAPIErrorMsg[r]));
        r = xpu::isnan(dev_ctx.x_context(),
                       reinterpret_cast<const XPUTyp*>(x->data<T>()),
                       is_nan.data<bool>(), x->numel());
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(isnan) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));
        r = xpu::logical_or(dev_ctx.x_context(), is_finite.data<bool>(),
                            is_nan.data<bool>(), is_finite.data<bool>(),
                            x->numel());
        PADDLE_ENFORCE_EQ(
            r, XPU_SUCCESS,
            platform::errors::External("XPU API(logical_or) return wrong "
                                       "value[%d %s]",
                                       r, XPUAPIErrorMsg[r]));
        r = xpu::any(dev_ctx.x_context(), is_finite.data<bool>(),
                     found_inf_data, x->numel());
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(any) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));
        memory::Copy(platform::CPUPlace(), &cpu_found_inf_data,
                     BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
                     found_inf_data, sizeof(bool));
      }

      if (cpu_found_inf_data) {
        inverse_scale = 0.0;
      }
      auto dev_env = XPUEnv::getenv("XPUSIM_DEVICE_MODEL");

      if (std::is_same<T, paddle::platform::float16>::value &&
          (dev_env == nullptr || std::strcmp(dev_env, "KUNLUN1"))) {
        framework::Tensor float_x;
        framework::Tensor float_out;
        float_x.mutable_data<MPDType>(dev_ctx.GetPlace(),
                                      x->numel() * sizeof(MPDType));
        float_out.mutable_data<MPDType>(dev_ctx.GetPlace(),
                                        out->numel() * sizeof(MPDType));
        int r = xpu::cast_v2(dev_ctx.x_context(),
                             reinterpret_cast<const float16*>(x->data<T>()),
                             float_x.data<MPDType>(), x->numel());
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(cast_v2) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));

        r = xpu::scale(dev_ctx.x_context(), float_x.data<MPDType>(),
                       float_out.data<MPDType>(), x->numel(), false,
                       inverse_scale, 0.0);
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(scale) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));

        r = xpu::cast_v2(dev_ctx.x_context(), float_out.data<MPDType>(),
                         reinterpret_cast<float16*>(out->data<T>()),
                         out->numel());

        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(cast_v2) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));
        if (dev_ctx.x_context()->xpu_stream) {
          dev_ctx.Wait();
        }

      } else {
        int r = xpu::scale(dev_ctx.x_context(),
                           reinterpret_cast<const XPUTyp*>(x->data<T>()),
                           reinterpret_cast<XPUTyp*>(out->data<T>()),
                           x->numel(), false, inverse_scale, 0.0);
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(scale) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));
      }
    }
    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
                 found_inf_data, platform::CPUPlace(), &cpu_found_inf_data,
                 sizeof(bool));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(check_finite_and_unscale,
                       ops::CheckFiniteAndUnscaleXPUKernel<float>,
                       ops::CheckFiniteAndUnscaleXPUKernel<plat::float16>);

#endif
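Stepping back from the XPU API calls, the kernel above implements the usual AMP gradient check: flag FoundInfinite when any input contains NaN or Inf, then scale every output by 1/Scale, or by 0 once an overflow has been seen, so the optimizer step can be skipped. The simplified host-side sketch below captures that contract; it checks all inputs before scaling rather than interleaving the two per tensor as the kernel does, uses no Paddle or XPU APIs, and is not part of this commit.

#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

// Host-side sketch of check_finite_and_unscale's contract: scan every input
// for NaN/Inf, then unscale by 1/scale when all values are finite, or zero
// the outputs when an overflow was found.
void check_finite_and_unscale(const std::vector<std::vector<float>>& xs,
                              float scale,
                              std::vector<std::vector<float>>* outs,
                              bool* found_inf) {
  *found_inf = false;
  for (const auto& x : xs) {
    for (float v : x) {
      if (!std::isfinite(v)) {
        *found_inf = true;
        break;
      }
    }
  }
  const float inverse_scale = *found_inf ? 0.0f : 1.0f / scale;
  outs->clear();
  for (const auto& x : xs) {
    std::vector<float> out;
    out.reserve(x.size());
    for (float v : x) out.push_back(v * inverse_scale);
    outs->push_back(std::move(out));
  }
}

int main() {
  std::vector<std::vector<float>> grads = {{2.0f, 4.0f}, {8.0f}};
  std::vector<std::vector<float>> unscaled;
  bool found_inf = false;
  check_finite_and_unscale(grads, /*scale=*/2.0f, &unscaled, &found_inf);
  std::printf("found_inf=%d first=%f\n", found_inf, unscaled[0][0]);  // 0, 1.0
  return 0;
}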
paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc

Lines changed: 166 additions & 0 deletions
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace operators {

template <typename T>
class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
  using MPDType = typename details::MPTypeTrait<T>::Type;
  using XPUTyp = typename XPUTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();

    const auto xs = ctx.MultiInput<framework::Tensor>("X");
    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
    const auto* found_inf = ctx.Input<Tensor>("FoundInfinite");
    PADDLE_ENFORCE_EQ(found_inf->numel(), 1,
                      platform::errors::InvalidArgument(
                          "FoundInfinite must has only one element."));
    const bool* found_inf_data = found_inf->data<bool>();
    bool cpu_found_inf_data = false;
    if (platform::is_xpu_place(found_inf->place())) {
      xpu_memcpy(&cpu_found_inf_data, found_inf_data, sizeof(bool),
                 XPUMemcpyKind::XPU_DEVICE_TO_HOST);
    } else {
      cpu_found_inf_data = (*found_inf_data);
    }

    for (size_t i = 0; i < xs.size(); ++i) {
      auto* out = outs[i];
      T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
      int num = out->numel();
      if (cpu_found_inf_data) {
        VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --";
        int r = 0;
        r = xpu::constant(dev_ctx.x_context(),
                          reinterpret_cast<XPUTyp*>(out_data), num,
                          XPUTyp(0.0));
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(constant) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));
      }
    }
    const bool stop_update = ctx.Attr<bool>("stop_update");
    if (stop_update) {
      return;
    }

    const auto* pre_loss_scaling = ctx.Input<Tensor>("PrevLossScaling");
    const auto* good_in = ctx.Input<Tensor>("InGoodSteps");
    const auto* bad_in = ctx.Input<Tensor>("InBadSteps");
    auto* updated_loss_scaling = ctx.Output<Tensor>("LossScaling");
    auto* good_out = ctx.Output<Tensor>("OutGoodSteps");
    auto* bad_out = ctx.Output<Tensor>("OutBadSteps");
    const MPDType* pre_loss_scaling_data = pre_loss_scaling->data<MPDType>();
    const int* good_in_data = good_in->data<int>();
    const int* bad_in_data = bad_in->data<int>();

    MPDType* updated_loss_scaling_data =
        updated_loss_scaling->mutable_data<MPDType>(dev_ctx.GetPlace());
    int* good_out_data = good_out->mutable_data<int>(dev_ctx.GetPlace());
    int* bad_out_data = bad_out->mutable_data<int>(dev_ctx.GetPlace());

    const int incr_every_n_steps = ctx.Attr<int>("incr_every_n_steps");
    const int decr_every_n_nan_or_inf =
        ctx.Attr<int>("decr_every_n_nan_or_inf");
    const float incr_ratio = ctx.Attr<float>("incr_ratio");
    const float decr_ratio = ctx.Attr<float>("decr_ratio");

    int cpu_bad_in_data;
    int cpu_good_in_data;
    MPDType cpu_pre_loss_scaling_data;
    if (platform::is_xpu_place(bad_in->place())) {
      xpu_memcpy(&cpu_bad_in_data, bad_in_data, sizeof(int),
                 XPUMemcpyKind::XPU_DEVICE_TO_HOST);
    } else {
      cpu_bad_in_data = (*bad_in_data);
    }

    if (platform::is_xpu_place(good_in->place())) {
      xpu_memcpy(&cpu_good_in_data, good_in_data, sizeof(int),
                 XPUMemcpyKind::XPU_DEVICE_TO_HOST);
    } else {
      cpu_good_in_data = (*good_in_data);
    }

    if (platform::is_xpu_place(pre_loss_scaling->place())) {
      xpu_memcpy(&cpu_pre_loss_scaling_data, pre_loss_scaling_data,
                 sizeof(MPDType), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
    } else {
      cpu_pre_loss_scaling_data = (*pre_loss_scaling_data);
    }

    int cpu_good_out_data = 0;
    int cpu_bad_out_data = 0;
    MPDType cpu_updated_loss_scaling_data;

    if (cpu_found_inf_data) {
      cpu_good_out_data = 0;
      cpu_bad_out_data = cpu_bad_in_data + 1;
      if (cpu_bad_out_data == decr_every_n_nan_or_inf) {
        MPDType new_loss_scaling = cpu_pre_loss_scaling_data * decr_ratio;
        cpu_updated_loss_scaling_data =
            (new_loss_scaling < static_cast<MPDType>(1))
                ? (static_cast<MPDType>(1))
                : (new_loss_scaling);
        cpu_bad_out_data = 0;
      }
    } else {
      cpu_bad_out_data = 0;
      cpu_good_out_data = cpu_good_in_data + 1;
      if (cpu_good_out_data == incr_every_n_steps) {
        MPDType new_loss_scaling = cpu_pre_loss_scaling_data * incr_ratio;
        cpu_updated_loss_scaling_data = (std::isfinite(new_loss_scaling))
                                            ? new_loss_scaling
                                            : cpu_pre_loss_scaling_data;
        cpu_good_out_data = 0;
      }
    }

    // copy to host
    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
                 bad_out_data, platform::CPUPlace(), &cpu_bad_out_data,
                 sizeof(int));
    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
                 good_out_data, platform::CPUPlace(), &cpu_good_out_data,
                 sizeof(int));
    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
                 updated_loss_scaling_data, platform::CPUPlace(),
                 &cpu_updated_loss_scaling_data, sizeof(MPDType));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_XPU_KERNEL(update_loss_scaling,
                       ops::UpdateLossScalingXPUKernel<float>,
                       ops::UpdateLossScalingXPUKernel<plat::float16>);
#endif
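The host-side branch in the middle of this kernel is the standard dynamic loss-scaling policy: on overflow, reset the good-step counter and, after decr_every_n_nan_or_inf consecutive bad steps, multiply the scale by decr_ratio (floored at 1); on a clean step, count good steps and, after incr_every_n_steps of them, multiply by incr_ratio if the result stays finite; otherwise the scale is left unchanged. The standalone sketch below shows that update rule with illustrative names and no Paddle dependencies; it is not part of this commit.

#include <cassert>
#include <cmath>

struct LossScaleState {
  float scale;
  int good_steps;
  int bad_steps;
};

// Sketch of the dynamic loss-scaling update performed on the host above.
void update_loss_scaling(LossScaleState* s, bool found_inf,
                         int incr_every_n_steps, int decr_every_n_nan_or_inf,
                         float incr_ratio, float decr_ratio) {
  if (found_inf) {
    s->good_steps = 0;
    if (++s->bad_steps == decr_every_n_nan_or_inf) {
      float new_scale = s->scale * decr_ratio;
      s->scale = new_scale < 1.0f ? 1.0f : new_scale;  // never drop below 1
      s->bad_steps = 0;
    }
  } else {
    s->bad_steps = 0;
    if (++s->good_steps == incr_every_n_steps) {
      float new_scale = s->scale * incr_ratio;
      if (std::isfinite(new_scale)) s->scale = new_scale;  // keep old scale on overflow
      s->good_steps = 0;
    }
  }
}

int main() {
  LossScaleState s{1024.0f, 0, 0};
  // Two clean steps with incr_every_n_steps == 2 double the scale.
  update_loss_scaling(&s, false, 2, 1, 2.0f, 0.5f);
  update_loss_scaling(&s, false, 2, 1, 2.0f, 0.5f);
  assert(s.scale == 2048.0f && s.good_steps == 0);
  // One overflow with decr_every_n_nan_or_inf == 1 halves it again.
  update_loss_scaling(&s, true, 2, 1, 2.0f, 0.5f);
  assert(s.scale == 1024.0f && s.bad_steps == 0);
  return 0;
}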
