Skip to content

Commit 97a7751

Browse files
author
chengduo
authored
Fix the order of sum (#12562)
* fix the order of sum * add doc * check whether a copy is needed * follow comments
1 parent 3300a53 commit 97a7751

File tree

5 files changed

+60
-8
lines changed

5 files changed

+60
-8
lines changed

cmake/generic.cmake

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,8 @@ function(cc_test TARGET_NAME)
264264
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
265265
if (${cc_test_SERIAL})
266266
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
267+
268+
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
267269
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
268270
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
269271
endif()
@@ -330,6 +332,8 @@ function(nv_test TARGET_NAME)
330332
add_test(${TARGET_NAME} ${TARGET_NAME})
331333
if (nv_test_SERIAL)
332334
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
335+
336+
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
333337
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
334338
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
335339
endif()
@@ -580,6 +584,7 @@ function(py_test TARGET_NAME)
580584
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
581585
add_test(NAME ${TARGET_NAME}
582586
COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
587+
FLAGS_cpu_deterministic=true
583588
PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
584589
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
585590
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})

paddle/fluid/framework/details/build_strategy.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,26 @@ namespace framework {
2121
namespace details {
2222

2323
struct BuildStrategy {
24+
// ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and
25+
// kReduce, for CPU and GPU. If you use kAllReduce, different threads
26+
// optimize their parameters separately. If you use kReduce, the optimizations
27+
// of parameters are distributed to different threads.
28+
// For example, suppose a model has 100 parameters and runs with four threads:
29+
// if you choose kAllReduce, every thread optimizes all 100 parameters
30+
// separately; if you choose kReduce, every thread optimizes only 25
31+
// parameters.
32+
// Of particular note: if you use kReduce for CPU training,
33+
// all the parameters are shared between different threads, which
34+
// saves memory.
35+
// FIXME(zcd): The results of the two modes (kAllReduce and kReduce) may not be
36+
// equal for GPU, because summing in a different order may produce a different
37+
// result; for example, the result of `a+b+c+d` may differ from the
38+
// result of `c+a+b+d`.
39+
// For GPU, both kAllReduce and kReduce are implemented with NCCL,
40+
// so their results may not be equal.
41+
// For CPU, if you want to fix the order of summing so that the results
42+
// of kAllReduce and kReduce are identical, you can set
43+
// `FLAGS_cpu_deterministic=true` in the environment.
2444
enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
2545

2646
enum class GradientScaleStrategy {

paddle/fluid/framework/details/reduce_op_handle.cc

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818
#include "paddle/fluid/framework/details/variable_visitor.h"
1919
#include "paddle/fluid/platform/profiler.h"
2020

21+
DEFINE_bool(
22+
cpu_deterministic, false,
23+
"Whether to make the result of computation deterministic in CPU side.");
24+
2125
namespace paddle {
2226
namespace framework {
2327
namespace details {
@@ -91,11 +95,33 @@ void ReduceOpHandle::RunImpl() {
9195
} else {
9296
std::vector<const LoDTensor *> lod_tensors =
9397
GetInputValues<LoDTensor>(in_var_handles, var_scopes);
98+
9499
if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
95100
this->RunAndRecordEvent([&] {
96-
ReduceLoDTensor func(lod_tensors,
97-
out_var->GetMutable<framework::LoDTensor>());
98-
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
101+
// FIXME(zcd): The order of summing is important,
102+
// especially when the type of data is float or double.
103+
// For example, the result of `a+b+c+d` may be different
104+
// from the result of `c+a+b+d`, so the summing order should be fixed.
105+
if (!FLAGS_cpu_deterministic) {
106+
ReduceLoDTensor func(lod_tensors,
107+
out_var->GetMutable<framework::LoDTensor>());
108+
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
109+
} else {
110+
// We sum lod_tensors to reduce_sum_trg which is in local_scopes_0
111+
// here, but it doesn't mean reduce_sum_trg must be in local_scopes_0.
112+
auto &reduce_sum_trg = *this->local_scopes_[0]
113+
->FindVar(kLocalExecScopeName)
114+
->Get<Scope *>()
115+
->FindVar(out_var_handle->name_)
116+
->GetMutable<framework::LoDTensor>();
117+
ReduceLoDTensor func(lod_tensors, &reduce_sum_trg);
118+
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
119+
120+
auto trg = out_var->GetMutable<framework::LoDTensor>();
121+
if (reduce_sum_trg.data<void>() != trg->data<void>()) {
122+
TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg);
123+
}
124+
}
99125
});
100126
} else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
101127
#ifdef PADDLE_WITH_CUDA

python/paddle/fluid/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,8 @@ def __bootstrap__():
123123
read_env_flags = [
124124
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
125125
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
126-
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads'
126+
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
127+
'cpu_deterministic'
127128
]
128129
if core.is_compiled_with_dist():
129130
read_env_flags.append('rpc_deadline')

python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def _compare_reduce_and_allreduce(self,
198198
model,
199199
use_cuda,
200200
iter=20,
201-
delta2=1e-4):
201+
delta2=1e-6):
202202
if use_cuda and not core.is_compiled_with_cuda():
203203
return
204204

@@ -276,10 +276,10 @@ def test_seresnext_with_learning_rate_decay(self):
276276
model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
277277

278278
def test_seresnext_with_new_strategy(self):
279-
# self._compare_reduce_and_allreduce(
280-
# model=SE_ResNeXt50Small, use_cuda=True)
281279
self._compare_reduce_and_allreduce(
282-
model=SE_ResNeXt50Small, use_cuda=False, iter=5, delta2=1e-2)
280+
model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2)
281+
self._compare_reduce_and_allreduce(
282+
model=SE_ResNeXt50Small, use_cuda=False, iter=5)
283283

284284

285285
if __name__ == '__main__':

0 commit comments

Comments
 (0)