
Commit 4cf76cf

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_reader_op_in_pserver

2 parents: d66a54d + 27d6962

118 files changed: +1140 −490 lines

Note: large commits have some content hidden by default, so only a subset of the changed files appears below.

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -65,6 +65,7 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
 option(WITH_ANAKIN "Compile with Anakin library" OFF)
 option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
+option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
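The new WITH_SYSTEM_BLAS switch is OFF by default; as the cmake/cblas.cmake change further down shows, the search for a system-installed reference CBLAS now only runs when the build is configured with -DWITH_SYSTEM_BLAS=ON.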

benchmark/fluid/args.py

Lines changed: 5 additions & 1 deletion
@@ -125,6 +125,10 @@ def parse_args():
     parser.add_argument(
         '--use_inference_transpiler',
         action='store_true',
-        help='If set, uses inference transpiler to optimize the program.')
+        help='If set, use inference transpiler to optimize the program.')
+    parser.add_argument(
+        '--no_random',
+        action='store_true',
+        help='If set, keep the random seed and do not shuffle the data.')
     args = parser.parse_args()
     return args

benchmark/fluid/fluid_benchmark.py

File mode changed: 100755 → 100644
Lines changed: 6 additions & 4 deletions
@@ -132,10 +132,6 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
     exe.run(startup_prog)
 
     # Use inference_transpiler to speedup
-    if args.use_inference_transpiler:
-        t = fluid.InferenceTranspiler()
-        t.transpile(infer_prog, place)
-
     if not args.use_reader_op:
         feed_var_list = [
             var for var in train_prog.global_block().vars.itervalues()
@@ -186,6 +182,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
         print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
         # evaluation
         if not args.no_test and batch_acc and not args.use_reader_op:
+            if args.use_inference_transpiler:
+                t = fluid.InferenceTranspiler()
+                t.transpile(infer_prog, place)
+
             pass_test_acc = test(exe, infer_prog, test_reader, feeder,
                                  batch_acc)
             print(", Test Accuracy: %f" % pass_test_acc)
@@ -316,6 +316,8 @@ def main():
     args = parse_args()
     print_arguments(args)
     print_paddle_envs()
+    if args.no_random:
+        fluid.default_startup_program().random_seed = 1
 
     # the unique trainer id, starting from 0, needed by trainer
     # only
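Taken together with the resnet.py change below, the new --no_random flag makes benchmark runs reproducible: the startup program's random seed is pinned, so parameter initialization is deterministic, and the training data is no longer shuffled. The inference transpiler is now applied to infer_prog only when testing actually runs, instead of unconditionally after startup.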

benchmark/fluid/models/resnet.py

Lines changed: 2 additions & 2 deletions
@@ -197,12 +197,12 @@ def get_model(args):
     optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
 
     batched_train_reader = paddle.batch(
-        paddle.reader.shuffle(
+        train_reader if args.no_random else paddle.reader.shuffle(
             train_reader, buf_size=5120),
         batch_size=args.batch_size * args.gpus,
         drop_last=True)
     batched_test_reader = paddle.batch(
-        train_reader, batch_size=args.batch_size, drop_last=True)
+        test_reader, batch_size=args.batch_size, drop_last=True)
 
     return avg_cost, inference_program, optimizer, batched_train_reader,\
         batched_test_reader, batch_acc
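A minimal standalone sketch (plain Python, no paddle dependency, names are illustrative) of the reader wiring above: with --no_random the reader is passed through unchanged, otherwise it is wrapped in a buffered shuffle in the spirit of paddle.reader.shuffle.

import random

def buffered_shuffle(reader, buf_size):
    # Wrap a reader so it yields items in shuffled order, buf_size items at a time.
    def shuffled_reader():
        buf = []
        for item in reader():
            buf.append(item)
            if len(buf) >= buf_size:
                random.shuffle(buf)
                for it in buf:
                    yield it
                buf = []
        random.shuffle(buf)  # flush the remainder
        for it in buf:
            yield it
    return shuffled_reader

def train_reader():  # stands in for the model's train_reader
    for i in range(10):
        yield i

no_random = False  # corresponds to args.no_random
reader = train_reader if no_random else buffered_shuffle(train_reader, buf_size=5120)
print(list(reader()))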

cmake/cblas.cmake

Lines changed: 11 additions & 9 deletions
@@ -83,18 +83,20 @@ else()
   set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
 endif()
 
-find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
+if(WITH_SYSTEM_BLAS)
+  find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
     ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
-find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
+  find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
     ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
 
-if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
-  set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER REFERENCE)
-  set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
-  set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
-  add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
-  message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
+    set(CBLAS_FOUND ON)
+    set(CBLAS_PROVIDER REFERENCE)
+    set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
+    set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
+    add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
+    message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  endif()
 endif()
 
 if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND)

paddle/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
@@ -7,18 +7,18 @@ if(NOT WITH_FLUID_ONLY)
   add_subdirectory(legacy/parameter)
 
   if(MOBILE_INFERENCE)
-    add_subdirectory(capi)
+    add_subdirectory(legacy/capi)
   else()
     add_subdirectory(legacy/pserver)
     add_subdirectory(trainer)
     add_subdirectory(scripts)
 
     if(WITH_C_API)
-      add_subdirectory(capi)
+      add_subdirectory(legacy/capi)
     endif()
 
     if(WITH_SWIG_PY)
-      add_subdirectory(api)
+      add_subdirectory(legacy/api)
     endif()
   endif()
 endif()

paddle/fluid/framework/details/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -25,11 +25,12 @@ else()
   cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 endif()
 
+cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle)
+        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
 
 
 cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)

paddle/fluid/framework/details/build_strategy.h

Lines changed: 2 additions & 0 deletions
@@ -33,6 +33,8 @@ struct BuildStrategy {
   GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
 
   std::string debug_graphviz_path_{""};
+
+  bool enable_data_balance_{true};
 };
 
 }  // namespace details
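enable_data_balance_ defaults to true; given the data_balance_op_handle dependency added to multi_devices_graph_builder above, this flag presumably controls whether the new DataBalanceOpHandle (introduced below) is inserted when the multi-device SSA graph is built.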
paddle/fluid/framework/details/data_balance_op_handle.cc (new file)

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/data_balance_op_handle.h"
+#include <algorithm>
+#include "paddle/fluid/framework/details/container_cast.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+#ifdef PADDLE_WITH_CUDA
+DataBalanceOpHandle::DataBalanceOpHandle(
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places,
+    const platform::NCCLContextMap *ctxs)
+    : local_scopes_(local_scopes), places_(places) {
+  if (ctxs) {
+    for (auto &p : places_) {
+      this->dev_ctxes_[p] = ctxs->DevCtx(p);
+    }
+  }
+}
+#else
+DataBalanceOpHandle::DataBalanceOpHandle(
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
+#endif
+
+std::string DataBalanceOpHandle::Name() const { return "data balance"; }
+
+std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
+    const std::vector<int> &device_sizes) {
+  int device_num = device_sizes.size();
+  int total_size = 0;
+  int empty_num = 0;
+  std::vector<std::array<int, 2>> size_device_vec;
+  size_device_vec.reserve(device_num);
+  for (int i = 0; i < device_num; ++i) {
+    if (device_sizes[i] == 0) {
+      ++empty_num;
+    }
+    total_size += device_sizes[i];
+    size_device_vec.push_back({{device_sizes[i], i}});
+  }
+  std::vector<std::array<int, 3>> res;
+  if (empty_num == 0) {
+    // No need to do data balance.
+    return res;
+  }
+  if (total_size < device_num) {
+    // No enough data.
+    PADDLE_THROW("There is no next data.");
+  }
+  std::sort(size_device_vec.begin(), size_device_vec.end(),
+            [](const std::array<int, 2> &a, const std::array<int, 2> &b) {
+              return a[0] > b[0];
+            });
+  int expected_device_size = total_size / device_num;
+  int src_idx = 0;
+  for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) {
+    if (size_device_vec[src_idx][0] <= expected_device_size) {
+      ++src_idx;
+      PADDLE_ENFORCE_LT(
+          src_idx, device_num - empty_num,
+          "In current srategy an empty tensor should not be copy source.");
+    }
+    size_device_vec[src_idx][0] -= expected_device_size;
+    size_device_vec[dst_idx][0] += expected_device_size;
+    res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1],
+                    expected_device_size}});
+  }
+  return res;
+}
+
+void DataBalanceOpHandle::RunImpl() {
+  if (places_.size() == 1) {
+    return;
+  }
+  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+  PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), out_var_handles.size(),
+      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+  int data_num = in_var_handles.size() / places_.size();
+  WaitInputVarGenerated();
+  std::vector<std::vector<LoDTensor *>> lod_tensors(data_num);
+  std::vector<int> device_sizes;
+  for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
+    PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+                      "The name of input and output should be equal.");
+    int place_idx = i / data_num;
+    int data_idx = i % data_num;
+    auto *local_scope =
+        local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
+    auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name_);
+    PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
+    auto *tensor = tensor_var->GetMutable<LoDTensor>();
+    lod_tensors[data_idx].push_back(tensor);
+    int ins_size =
+        tensor->lod().empty() ? tensor->dims()[0] : tensor->NumElements();
+    if (data_idx == 0) {
+      device_sizes.emplace_back(ins_size);
+    } else {
+      PADDLE_ENFORCE_EQ(
+          ins_size, device_sizes.at(place_idx),
+          "All data on the same device shall have the same batch size.");
+    }
+  }
+  const auto &balance_plan = GetBalancePlan(device_sizes);
+
+  for (const auto &trans : balance_plan) {
+    for (int data_idx = 0; data_idx < data_num; ++data_idx) {
+      LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]];
+      LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]];
+      int trans_ins_size = trans[2];
+      LoD src_lod = src_tensor->lod();
+      int src_ins_size =
+          src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements();
+      int cut_point = src_ins_size - trans_ins_size;
+      if (!src_lod.empty()) {
+        for (auto &level : src_lod) {
+          cut_point = level[cut_point];
+        }
+      }
+      TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]),
+                     dst_tensor->place(), dst_tensor);
+      src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point));
+      if (!src_lod.empty()) {
+        dst_tensor->set_lod(SliceInLevel(
+            src_lod, 0, src_ins_size - trans_ins_size, src_ins_size));
+        src_tensor->set_lod(
+            SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size));
+      }
+    }
+  }
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
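To make the balancing logic easier to trace, here is an illustrative standalone Python re-implementation of GetBalancePlan (not part of the commit), with a worked example: devices whose feed came up empty at the end of an epoch are topped up from the devices holding the largest remaining batches.

def get_balance_plan(device_sizes):
    """Return (src_dev, dst_dev, size) moves that fill devices with no data."""
    device_num = len(device_sizes)
    total_size = sum(device_sizes)
    empty_num = device_sizes.count(0)
    if empty_num == 0:
        return []  # every device already has data; nothing to do
    if total_size < device_num:
        raise RuntimeError("There is no next data.")
    # (size, device_id) pairs, largest batches first; empty devices sort to the tail.
    size_device = sorted(([s, i] for i, s in enumerate(device_sizes)),
                         key=lambda x: x[0], reverse=True)
    expected = total_size // device_num
    plan, src = [], 0
    # Fill each empty device with `expected` instances taken from the current source.
    for dst in range(device_num - empty_num, device_num):
        if size_device[src][0] <= expected:
            src += 1
            assert src < device_num - empty_num, \
                "an empty tensor should not become a copy source"
        size_device[src][0] -= expected
        size_device[dst][0] += expected
        plan.append((size_device[src][1], size_device[dst][1], expected))
    return plan

# Four devices, the last one received nothing at the tail of an epoch:
print(get_balance_plan([3, 3, 2, 0]))  # -> [(0, 3, 2)]: move 2 instances from dev 0 to dev 3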
paddle/fluid/framework/details/data_balance_op_handle.h (new file)

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct DataBalanceOpHandle : public OpHandleBase {
+ public:
+#ifdef PADDLE_WITH_CUDA
+  DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+                      const std::vector<platform::Place> &places,
+                      const platform::NCCLContextMap *ctxs);
+#else
+  DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+                      const std::vector<platform::Place> &places);
+#endif
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  // std::vector<(src_dev_id, dst_dev_id, trans_size)>
+  std::vector<std::array<int, 3>> GetBalancePlan(
+      const std::vector<int> &batch_size_per_device);
+
+  const std::vector<Scope *> local_scopes_;
+  const std::vector<platform::Place> places_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
