Skip to content

Commit 2d036c4

Browse files
authored
polish dist unit test code (#12512)
* Polish the distributed SE-ResNeXt unit test * update * update * update * Avoid the CPU initializer differing from the GPU one * Change to use the plain Executor for now * Update per review comments * Remove LR decay when using ParallelExecutor; the ParallelExecutor bug should be fixed later * Update per review comments
1 parent 97a7751 commit 2d036c4

File tree

4 files changed

+31
-15
lines changed

4 files changed

+31
-15
lines changed

paddle/fluid/operators/read_op.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "paddle/fluid/framework/op_registry.h"
1616
#include "paddle/fluid/framework/reader.h"
1717
#include "paddle/fluid/operators/detail/safe_ref.h"
18+
#include "paddle/fluid/platform/profiler.h"
1819

1920
namespace paddle {
2021
namespace operators {
@@ -65,6 +66,12 @@ class ReadOp : public framework::OperatorBase {
6566
.GetMutable<framework::ReaderHolder>();
6667
std::vector<std::string> out_arg_names = Outputs("Out");
6768
std::vector<framework::LoDTensor> ins;
69+
70+
// For profiling
71+
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
72+
auto& ctx = *pool.Get(dev_place);
73+
platform::RecordEvent record_event(Type(), &ctx);
74+
6875
reader->ReadNext(&ins);
6976
if (ins.empty()) {
7077
if (Attr<bool>("throw_eof_exp")) {

python/paddle/fluid/tests/unittests/dist_se_resnext.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,9 @@ def conv_bn_layer(self,
174174
padding=(filter_size - 1) / 2,
175175
groups=groups,
176176
act=None,
177+
# avoid pserver CPU init differs from GPU
178+
param_attr=fluid.ParamAttr(
179+
initializer=fluid.initializer.Constant()),
177180
bias_attr=False)
178181
return fluid.layers.batch_norm(input=conv, act=act)
179182

@@ -194,10 +197,8 @@ def squeeze_excitation(self, input, num_channels, reduction_ratio):
194197

195198
def get_model(batch_size):
196199
# Input data
197-
image = fluid.layers.fill_constant(
198-
shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
199-
label = fluid.layers.fill_constant(
200-
shape=[batch_size, 1], dtype='int64', value=0.0)
200+
image = fluid.layers.data(name="data", shape=[3, 224, 224], dtype='float32')
201+
label = fluid.layers.data(name="int64", shape=[1], dtype='int64')
201202

202203
# Train program
203204
model = SE_ResNeXt(layers=50)
@@ -222,8 +223,10 @@ def get_model(batch_size):
222223
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
223224

224225
optimizer = fluid.optimizer.Momentum(
225-
learning_rate=fluid.layers.piecewise_decay(
226-
boundaries=bd, values=lr),
226+
# FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed.
227+
#learning_rate=fluid.layers.piecewise_decay(
228+
# boundaries=bd, values=lr),
229+
learning_rate=base_lr,
227230
momentum=0.9,
228231
regularization=fluid.regularizer.L2Decay(1e-4))
229232
optimizer.minimize(avg_cost)
@@ -232,7 +235,7 @@ def get_model(batch_size):
232235
train_reader = paddle.batch(
233236
paddle.dataset.flowers.train(), batch_size=batch_size)
234237
test_reader = paddle.batch(
235-
paddle.dataset.flowers.test(), batch_size=batch_size)
238+
paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
236239

237240
return test_program, avg_cost, train_reader, test_reader, acc_top1, out
238241

@@ -256,7 +259,6 @@ def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
256259
trainers)
257260
pserver_prog = t.get_pserver_program(current_endpoint)
258261
startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
259-
260262
place = fluid.CPUPlace()
261263
exe = fluid.Executor(place)
262264
exe.run(startup_prog)
@@ -302,12 +304,19 @@ def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
302304
]
303305

304306
feeder = fluid.DataFeeder(feed_var_list, place)
305-
reader_generator = train_reader()
306-
first_loss, = exe.run(fetch_list=[avg_cost.name])
307+
reader_generator = test_reader()
308+
309+
data = next(reader_generator)
310+
first_loss, = exe.run(fetch_list=[avg_cost.name],
311+
feed=feeder.feed(data))
307312
print(first_loss)
313+
308314
for i in xrange(5):
309-
loss, = exe.run(fetch_list=[avg_cost.name])
310-
last_loss, = exe.run(fetch_list=[avg_cost.name])
315+
data = next(reader_generator)
316+
loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
317+
318+
data = next(reader_generator)
319+
last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
311320
print(last_loss)
312321

313322

python/paddle/fluid/tests/unittests/test_dist_base.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,8 @@ def check_with_place(self, model_file, delta=1e-3):
6363
"PATH": os.getenv("PATH"),
6464
"PYTHONPATH": os.getenv("PYTHONPATH"),
6565
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
66-
"FLAGS_fraction_of_gpu_memory_to_use": "0.15"
66+
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
67+
"FLAGS_cudnn_deterministic": "1"
6768
}
6869
# Run local to get a base line
6970
env_local = {"CUDA_VISIBLE_DEVICES": "0"}

python/paddle/fluid/tests/unittests/test_dist_se_resnext.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,7 @@
1717

1818
class TestDistSeResneXt2x2(TestDistBase):
1919
def test_se_resnext(self):
20-
# TODO(paddle-dev): Is the delta too large?
21-
self.check_with_place("dist_se_resnext.py", delta=0.2)
20+
self.check_with_place("dist_se_resnext.py")
2221

2322

2423
if __name__ == "__main__":

0 commit comments

Comments (0)