Skip to content

Commit 7bddf2e

Browse files
authored
[NPU] mod for model bert (#36165)
* resolve merge conflict in paddle_gtest_main.cc * modify FLAGS_npu_precision_mode and, by default, do not call aclSetCompileopt
1 parent bec9fc9 commit 7bddf2e

File tree

8 files changed

+290
-5
lines changed

8 files changed

+290
-5
lines changed

paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,9 +166,11 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
166166
namespace ops = paddle::operators;
167167
namespace plat = paddle::platform;
168168

169-
REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel<float>,
169+
REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel<int>,
170+
ops::ElementwiseSubNPUKernel<float>,
170171
ops::ElementwiseSubNPUKernel<plat::float16>);
171172

172173
REGISTER_OP_NPU_KERNEL(elementwise_sub_grad,
174+
ops::ElementwiseSubGradNPUKernel<int>,
173175
ops::ElementwiseSubGradNPUKernel<float>,
174176
ops::ElementwiseSubGradNPUKernel<plat::float16>);

paddle/fluid/operators/fill_any_like_op_npu.cc

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,12 @@ class FillAnyLikeNPUKernel : public framework::OpKernel<T> {
6363
.stream();
6464

6565
auto shape = out->dims();
66-
const auto& runner = NpuOpRunner("FillD", {tensor_tmp}, {*out},
67-
{{"dims", framework::vectorize(shape)}});
68-
runner.Run(stream);
66+
NpuOpRunner runner;
67+
runner.SetType("Fill")
68+
.AddInput(framework::vectorize(shape))
69+
.AddInput(tensor_tmp)
70+
.AddOutput(*out)
71+
.Run(stream);
6972
}
7073
};
7174

@@ -75,5 +78,8 @@ class FillAnyLikeNPUKernel : public framework::OpKernel<T> {
7578
namespace ops = paddle::operators;
7679

7780
REGISTER_OP_NPU_KERNEL(fill_any_like, ops::FillAnyLikeNPUKernel<int>,
81+
#ifdef PADDLE_WITH_ASCEND_INT64
82+
ops::FillAnyLikeNPUKernel<int64_t>,
83+
#endif
7884
ops::FillAnyLikeNPUKernel<float>,
7985
ops::FillAnyLikeNPUKernel<paddle::platform::float16>);

paddle/fluid/operators/npu_op_runner.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ limitations under the License. */
2626

2727
#include "paddle/fluid/framework/framework.pb.h"
2828

29+
DECLARE_string(npu_precision_mode);
30+
2931
namespace paddle {
3032
namespace operators {
3133

@@ -404,6 +406,12 @@ void NpuOpRunner::Run(aclrtStream stream) const {
404406
VLOG(4) << "attr: " << attr_;
405407
VLOG(4) << "stream: " << stream;
406408

409+
if (!FLAGS_npu_precision_mode.empty()) {
410+
PADDLE_ENFORCE_NPU_SUCCESS(
411+
aclSetCompileopt(ACL_PRECISION_MODE, FLAGS_npu_precision_mode.c_str()));
412+
VLOG(4) << "set ACL_PRECISION_MODE: " << FLAGS_npu_precision_mode;
413+
}
414+
407415
aclError ret = aclopCompileAndExecute(
408416
op_type_.c_str(), input_descs_.size(), input_descs_.data(),
409417
input_buffers_.data(), output_descs_.size(), output_descs_.data(),

paddle/fluid/operators/slice_op_npu.cc

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,12 +181,37 @@ class SliceGradNPUKernel : public framework::OpKernel<T> {
181181
paddings[i][1] = static_cast<int64_t>(in_dims[i] - size[i] - offsets[i]);
182182
}
183183

184+
Tensor tmp_dout;
185+
tmp_dout.ShareDataWith(*dout);
186+
auto out_dims = dout->dims();
187+
auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");
188+
auto decrease_size = decrease_axis.size();
189+
if (decrease_size > 0) {
190+
if (decrease_size == static_cast<size_t>(in_dims.size())) {
191+
out_dims = framework::make_ddim(std::vector<int>(decrease_size, 1));
192+
} else {
193+
std::vector<int> origin_out_shape(out_dims.size() + decrease_size, -1);
194+
for (size_t i = 0; i < decrease_size; ++i) {
195+
origin_out_shape[decrease_axis[i]] = 1;
196+
}
197+
int index = 0;
198+
for (size_t i = 0; i < origin_out_shape.size(); ++i) {
199+
if (origin_out_shape[i] == -1) {
200+
origin_out_shape[i] = out_dims[index];
201+
++index;
202+
}
203+
}
204+
out_dims = framework::make_ddim(origin_out_shape);
205+
}
206+
tmp_dout.Resize(out_dims);
207+
}
208+
184209
dinput->mutable_data<T>(ctx.GetPlace());
185210
auto stream =
186211
ctx.template device_context<paddle::platform::NPUDeviceContext>()
187212
.stream();
188213
const auto& runner =
189-
NpuOpRunner("PadD", {*dout}, {*dinput}, {{"paddings", paddings}});
214+
NpuOpRunner("PadD", {tmp_dout}, {*dinput}, {{"paddings", paddings}});
190215
runner.Run(stream);
191216
}
192217
};

paddle/fluid/platform/flags.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,13 @@ PADDLE_DEFINE_EXPORTED_string(
121121
"If provided, it will be passed to aclInit().");
122122
PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling, 1,
123123
"set minimum loss scaling value!");
124+
PADDLE_DEFINE_EXPORTED_string(
125+
npu_precision_mode, "",
126+
"NPU operator precision mode, options are 'force_fp32', 'force_fp16', "
127+
"'allow_fp32_to_fp16', 'must_keep_origin_dtype' and "
128+
"'allow_mix_precision'. If you want to use the default mode ("
129+
"allow_fp32_to_fp16), set this to empty string. For more details, "
130+
"please refer to the documentation.");
124131
#endif
125132

126133
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,11 @@ def test_check_output(self):
9090
# max_relative_error=0.006,)
9191

9292

93+
class TestElementwiseSubOpInt32(TestElementwiseSubOp):
94+
def init_dtype(self):
95+
self.dtype = np.int32
96+
97+
9398
class TestSubtractAPI(unittest.TestCase):
9499
def test_name(self):
95100
with paddle.static.program_guard(paddle.static.Program()):

python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@ def init(self):
5757
self.value = -1
5858

5959

60+
class TestFillAnyLikeNPUOpInt64(TestFillAnyLikeNPUOp):
61+
def init(self):
62+
self.dtype = np.int64
63+
self.value = -1
64+
65+
6066
class TestFillAnyLikeNPUOpFloat32(TestFillAnyLikeNPUOp):
6167
def init(self):
6268
self.dtype = np.float32

python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,5 +301,231 @@ def test_npu(self):
301301
self.assertTrue(np.allclose(npu_loss, cpu_loss))
302302

303303

304+
class TestSliceOpDecsDim(OpTest):
305+
def setUp(self):
306+
self.op_type = "slice"
307+
self.set_npu()
308+
self.init_dtype()
309+
self.config()
310+
self.set_inputs()
311+
self.set_outputs()
312+
self.set_attrs()
313+
314+
def set_inputs(self):
315+
self.inputs = {'Input': self.input}
316+
317+
def set_outputs(self):
318+
self.outputs = {'Out': self.out}
319+
320+
def set_attrs(self):
321+
self.attrs = {
322+
'axes': self.axes,
323+
'starts': self.starts,
324+
'ends': self.ends,
325+
'infer_flags': self.infer_flags,
326+
'decrease_axis': self.decrease_axis,
327+
}
328+
329+
def config(self):
330+
self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
331+
self.starts = [1, 0, 2]
332+
self.ends = [2, 3, 4]
333+
self.axes = [0, 1, 2]
334+
self.decrease_axis = [0]
335+
self.infer_flags = [1, 1, 1]
336+
self.out = self.input[1, 0:3, 2:4, :]
337+
338+
def init_dtype(self):
339+
self.dtype = np.float32
340+
341+
def set_npu(self):
342+
self.__class__.use_npu = True
343+
self.place = paddle.NPUPlace(0)
344+
345+
def test_check_output(self):
346+
self.check_output_with_place(self.place)
347+
348+
def test_check_grad_normal(self):
349+
if self.dtype == np.float16:
350+
return
351+
self.check_grad_with_place(self.place, ['Input'], 'Out')
352+
353+
354+
class TestSliceOpDecsDimFp16(TestSliceOpDecsDim):
355+
def init_dtype(self):
356+
self.dtype = np.float16
357+
358+
359+
class TestSliceOpDecsDim2(TestSliceOpDecsDim):
360+
def config(self):
361+
self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
362+
self.starts = [1, 0, 2]
363+
self.ends = [2, 1, 4]
364+
self.axes = [0, 1, 2]
365+
self.decrease_axis = [0, 1]
366+
self.infer_flags = [1, 1, 1]
367+
self.out = self.input[1, 0, 2:4, :]
368+
369+
370+
class TestSliceOpDecsDim3(TestSliceOpDecsDim):
371+
def config(self):
372+
self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
373+
self.starts = [-1, 0, 2]
374+
self.ends = [1000000, 1, 4]
375+
self.axes = [0, 1, 2]
376+
self.decrease_axis = [0, 1]
377+
self.infer_flags = [1, 1, 1]
378+
self.out = self.input[-1, 0, 2:4, :]
379+
380+
381+
class TestSliceOpDecsDim4(TestSliceOpDecsDim):
382+
def config(self):
383+
self.input = np.random.random([3, 4, 5, 7]).astype(self.dtype)
384+
self.starts = [0, 1, 2, 3]
385+
self.ends = [1, 2, 3, 4]
386+
self.axes = [0, 1, 2, 3]
387+
self.decrease_axis = [0, 1, 2, 3]
388+
self.infer_flags = [1, 1, 1]
389+
self.out = self.input[0, 1, 2, 3:4]
390+
391+
392+
class TestSliceOpDecsDim5(TestSliceOpDecsDim):
393+
def config(self):
394+
self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
395+
self.starts = [-1]
396+
self.ends = [1000000]
397+
self.axes = [3]
398+
self.decrease_axis = [3]
399+
self.infer_flags = [1, 1, 1]
400+
self.out = self.input[:, :, :, -1]
401+
402+
403+
class TestSliceOpDecsDim6(TestSliceOpDecsDim):
404+
def config(self):
405+
self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
406+
self.starts = [0, 1, 2, 3]
407+
self.ends = [1, 2, 3, 4]
408+
self.axes = [0, 1, 2, 3]
409+
self.decrease_axis = [0, 1, 2, 3]
410+
self.infer_flags = [1, 1, 1]
411+
self.out = self.input[0, 1, 2, 3:4]
412+
413+
414+
class TestSliceOpDecsDimStartsTensor(TestSliceOpDecsDim):
415+
def set_inputs(self):
416+
self.inputs = {
417+
'Input': self.input,
418+
"StartsTensor": np.array(
419+
self.starts, dtype='int32')
420+
}
421+
422+
def set_attrs(self):
423+
self.attrs = {
424+
'axes': self.axes,
425+
#'starts': self.starts,
426+
'ends': self.ends,
427+
'infer_flags': self.infer_flags,
428+
'decrease_axis': self.decrease_axis,
429+
}
430+
431+
def config(self):
432+
self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
433+
self.starts = [1, 0, 2]
434+
self.ends = [2, 3, 4]
435+
self.axes = [0, 1, 2]
436+
self.decrease_axis = [0]
437+
self.infer_flags = [-1, -1, -1]
438+
self.out = self.input[1, 0:3, 2:4, :]
439+
440+
441+
class TestSliceOpDecsDimStartsTensorFP16(TestSliceOpDecsDimStartsTensor):
442+
def init_dtype(self):
443+
self.dtype = np.float16
444+
445+
446+
class TestSliceOpDecsDimStartsTensorStartsAndEndsTensor(TestSliceOpDecsDim):
447+
def set_inputs(self):
448+
self.inputs = {
449+
'Input': self.input,
450+
"StartsTensor": np.array(
451+
self.starts, dtype='int64'),
452+
"EndsTensor": np.array(
453+
self.ends, dtype='int32')
454+
}
455+
456+
def set_attrs(self):
457+
self.attrs = {
458+
'axes': self.axes,
459+
#'starts': self.starts,
460+
#'ends': self.ends,
461+
'infer_flags': self.infer_flags,
462+
'decrease_axis': self.decrease_axis,
463+
}
464+
465+
def config(self):
466+
self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
467+
self.starts = [1, 0, 2]
468+
self.ends = [2, 1, 4]
469+
self.axes = [0, 1, 2]
470+
self.decrease_axis = [0, 1]
471+
self.infer_flags = [-1, -1, -1]
472+
self.out = self.input[1, 0, 2:4, :]
473+
474+
475+
class TestSliceOpDecsDimStartsTensorStartsAndEndsTensorFP16(
476+
TestSliceOpDecsDimStartsTensorStartsAndEndsTensor):
477+
def init_dtype(self):
478+
self.dtype = np.float16
479+
480+
481+
class TestSliceOpDecsDimStartsListTensor(TestSliceOpDecsDim):
482+
def set_inputs(self):
483+
starts_tensor = []
484+
for index, ele in enumerate(self.starts):
485+
starts_tensor.append(("x" + str(index), np.ones(
486+
(1)).astype('int32') * ele))
487+
488+
self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor}
489+
490+
def set_attrs(self):
491+
self.attrs = {
492+
'axes': self.axes,
493+
'starts': self.starts_infer,
494+
'ends': self.ends,
495+
'infer_flags': self.infer_flags,
496+
'decrease_axis': self.decrease_axis,
497+
}
498+
499+
def config(self):
500+
self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
501+
self.starts = [1, 0, 2]
502+
self.ends = [2, 3, 4]
503+
self.axes = [0, 1, 2]
504+
self.decrease_axis = [0]
505+
self.infer_flags = [1, -1, 1]
506+
self.out = self.input[1, 0:3, 2:4, :]
507+
508+
self.starts_infer = [1, -1, 2]
509+
510+
511+
class TestSliceOpDecsDimStartsListTensor2(TestSliceOpDecsDimStartsListTensor):
512+
def config(self):
513+
self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype)
514+
self.starts = [-1]
515+
self.ends = [1000000]
516+
self.axes = [3]
517+
self.decrease_axis = [3]
518+
self.infer_flags = [-1]
519+
self.out = self.input[:, :, :, -1]
520+
521+
self.starts_infer = [-1]
522+
523+
524+
class TestSliceOpDecsDimStartsListTensorFP16(
525+
TestSliceOpDecsDimStartsListTensor):
526+
def init_dtype(self):
527+
self.dtype = np.float16
528+
529+
304530
if __name__ == '__main__':
305531
unittest.main()

0 commit comments

Comments
 (0)