Commit fde34eb

[Cherry-pick] Apply IOU to test_parallel_executor_seresnext_base_gpu (#43925)

* [Cherry-pick] Apply IOU to test_parallel_executor_seresnext_base_gpu (#43812)

  1. Fix the conflict between #43812 and the current release/2.3 branch.
  2. test_parallel_executor_seresnext_base_gpu failed on 2 P100 GPUs with the `470.82` driver.
1 parent 83520fd commit fde34eb

11 files changed: 243 additions & 163 deletions
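The cherry-pick replaces the strict first-loss/last-loss comparison with an area-below-the-loss-curve metric (the "IOU" in the title), accumulated with trapezoid-style weights over every iteration, so that small per-step jitter on a particular GPU/driver combination no longer fails the test. A minimal sketch of the idea outside the test harness; the helper name and the loss values below are illustrative, not taken from the suite:

import numpy as np

def area_below_loss_curve(step_losses):
    """Trapezoid-style area under a sequence of per-step mean losses."""
    losses = [float(np.mean(l)) for l in step_losses]
    # end points get half weight, interior points full weight,
    # mirroring the 0.5 * first/last accumulation in the diff below
    return 0.5 * losses[0] + sum(losses[1:-1]) + 0.5 * losses[-1]

# Two runs that differ only by small per-step noise produce nearly
# identical areas, so a relative-tolerance check on the area passes
# even when individual steps would fail a tight absolute check.
run_a = [2.31, 2.05, 1.88, 1.74]
run_b = [2.32, 2.03, 1.90, 1.73]
np.testing.assert_allclose(area_below_loss_curve(run_a),
                           area_below_loss_curve(run_b),
                           rtol=1e-2)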

python/paddle/fluid/tests/unittests/parallel_executor_test_base.py

Lines changed: 24 additions & 11 deletions
@@ -32,6 +32,7 @@


 class TestParallelExecutorBase(unittest.TestCase):
+
     @classmethod
     def check_network_convergence(cls,
                                   method,
@@ -52,6 +53,7 @@ def check_network_convergence(cls,
                                   optimizer=fluid.optimizer.Adam,
                                   use_fast_executor=False,
                                   enable_sequential_execution=False):
+
         def run_executor(exe, binary, feed, fetch_list):
             if feed_data_reader is None:
                 res = exe.run(binary, feed=feed, fetch_list=fetch_list)
@@ -66,8 +68,8 @@ def run_executor(exe, binary, feed, fetch_list):
                 feed_data_reader, FeedDataReader
             ), "feed_data_reader must be type of FeedDataReader"

-        paddle.seed(1)
-        paddle.framework.random._manual_program_seed(1)
+        paddle.seed(0)
+        paddle.framework.random._manual_program_seed(0)
         main = fluid.Program()
         startup = fluid.Program()

@@ -101,28 +103,39 @@ def run_executor(exe, binary, feed, fetch_list):
         ) if use_device == DeviceType.XPU else int(
             os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

+        area_below_loss = 0
         begin = time.time()
-        first_loss, = run_executor(
-            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
+        first_loss, = run_executor(exe=exe,
+                                   binary=binary,
+                                   feed=feed_dict,
+                                   fetch_list=[loss.name])
+        area_below_loss += 0.5 * first_loss.mean()
         for _ in range(iter):
-            run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[])
-        last_loss, = run_executor(
-            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
+            mid_loss = run_executor(exe=exe,
+                                    binary=binary,
+                                    feed=feed_dict,
+                                    fetch_list=[loss.name])
+            area_below_loss += mid_loss[0].mean()
+        last_loss, = run_executor(exe=exe,
+                                  binary=binary,
+                                  feed=feed_dict,
+                                  fetch_list=[loss.name])
+        area_below_loss += 0.5 * last_loss.mean()
         end = time.time()

         if batch_size is not None:
-            print("%.4f Instance per second" % (
-                (batch_size * iter + 2) / (end - begin)))
+            print("%.4f Instance per second" % ((batch_size * iter + 2) /
+                                                (end - begin)))

         avg_last_loss_val = np.array(last_loss).mean()
         avg_first_loss_val = np.array(first_loss).mean()
         if math.isnan(float(avg_last_loss_val)) or math.isnan(
                 float(avg_first_loss_val)):
             sys.exit("got NaN loss, training failed.")

-        print(first_loss, last_loss)
+        print(first_loss, last_loss, area_below_loss)
         # self.assertGreater(first_loss[0], last_loss[0])
-        return first_loss, last_loss
+        return first_loss, last_loss, area_below_loss

     @classmethod
     def check_pass_conflict(cls,

python/paddle/fluid/tests/unittests/seresnext_test_base.py

Lines changed: 12 additions & 6 deletions
@@ -21,6 +21,7 @@


 class TestResnetBase(TestParallelExecutorBase):
+
     def _compare_result_with_origin_model(self,
                                           check_func,
                                           use_device,
@@ -29,7 +30,7 @@ def _compare_result_with_origin_model(self,
         if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return

-        func_1_first_loss, func_1_last_loss = self.check_network_convergence(
+        func_1_first_loss, func_1_last_loss, func_1_loss_area = self.check_network_convergence(
             seresnext_net.model,
             feed_dict=seresnext_net.feed_dict(use_device),
             iter=seresnext_net.iter(use_device),
@@ -38,7 +39,7 @@ def _compare_result_with_origin_model(self,
             use_reduce=False,
             optimizer=seresnext_net.optimizer)

-        func_2_first_loss, func_2_last_loss = check_func(
+        func_2_first_loss, func_2_last_loss, func_2_loss_area = check_func(
             seresnext_net.model,
             feed_dict=seresnext_net.feed_dict(use_device),
             iter=seresnext_net.iter(use_device),
@@ -51,7 +52,12 @@ def _compare_result_with_origin_model(self,
             for loss in zip(func_1_last_loss, func_2_last_loss):
                 self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
         else:
-            self.assertAlmostEquals(
-                np.mean(func_1_first_loss), func_2_first_loss[0], delta=1e-5)
-            self.assertAlmostEquals(
-                np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2)
+            np.testing.assert_allclose(func_1_loss_area,
+                                       func_2_loss_area,
+                                       rtol=delta2)
+            self.assertAlmostEquals(np.mean(func_1_first_loss),
+                                    func_2_first_loss[0],
+                                    delta=1e-5)
+            self.assertAlmostEquals(np.mean(func_1_last_loss),
+                                    func_2_last_loss[0],
+                                    delta=delta2)
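With the third return value available, the SE-ResNeXt comparison above now checks the accumulated loss areas with a relative tolerance (np.testing.assert_allclose with rtol) on top of the existing absolute-delta checks on the first and last losses. A small self-contained sketch of the difference between the two kinds of check; the tolerance and loss values are made up for illustration and are not the ones used by the suite:

import numpy as np

delta2 = 1e-3  # illustrative tolerance; the real value is passed in by the test

# hypothetical results from two otherwise-identical runs
func_1_loss_area, func_2_loss_area = 6.021, 6.024
func_1_first_loss = np.array([2.3100])
func_2_first_loss = np.array([2.3100])
func_1_last_loss = np.array([1.7400])
func_2_last_loss = np.array([1.7405])

# relative check on the accumulated areas: |a - b| <= rtol * |b| (atol defaults to 0)
np.testing.assert_allclose(func_1_loss_area, func_2_loss_area, rtol=delta2)
# absolute-delta checks on the end points, as in assertAlmostEqual(..., delta=...)
assert abs(np.mean(func_1_first_loss) - func_2_first_loss[0]) <= 1e-5
assert abs(np.mean(func_1_last_loss) - func_2_last_loss[0]) <= delta2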

python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py

Lines changed: 16 additions & 14 deletions
@@ -26,6 +26,7 @@


 class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
@@ -47,15 +48,15 @@ def compare_fuse_all_reduce_ops(self,
         img, label = init_feed_dict()
         feed_dict_data = {"image": img, "label": label}

-        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
+        not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
             model,
             feed_dict=feed_dict_data,
             get_data_from_feeder=get_data_from_feeder,
             use_device=use_device,
             fuse_all_reduce_ops=False,
             fuse_all_optimizer_ops=fuse_all_optimizer_ops,
             optimizer=optimizer)
-        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
+        fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
             model,
             feed_dict=feed_dict_data,
             get_data_from_feeder=get_data_from_feeder,
@@ -77,13 +78,13 @@ def optimizer(self, learning_rate=1e-3):


 class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
+
     def _decorate_compare_fused_all_reduce(self, model, use_device):
-        self.compare_fuse_all_reduce_ops(
-            model,
-            use_device,
-            init_feed_dict=init_data,
-            optimizer=self.optimizer,
-            fuse_all_optimizer_ops=True)
+        self.compare_fuse_all_reduce_ops(model,
+                                         use_device,
+                                         init_feed_dict=init_data,
+                                         optimizer=self.optimizer,
+                                         fuse_all_optimizer_ops=True)

     def test_simple_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
@@ -101,16 +102,17 @@ def test_batchnorm_fc_with_fuse_all_reduce(self):


 class TestFuseAllReduceOpsAndOptiOps(TestFuseAllReduceOps):
+
     def _decorate_compare_fused_all_reduce(self, model, use_device):
-        self.compare_fuse_all_reduce_ops(
-            model,
-            use_device,
-            init_feed_dict=init_data,
-            optimizer=self.optimizer,
-            fuse_all_optimizer_ops=True)
+        self.compare_fuse_all_reduce_ops(model,
+                                         use_device,
+                                         init_feed_dict=init_data,
+                                         optimizer=self.optimizer,
+                                         fuse_all_optimizer_ops=True)


 class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)

python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py

Lines changed: 11 additions & 6 deletions
@@ -21,6 +21,7 @@


 class TestMNIST(TestParallelExecutorBase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
@@ -41,19 +42,23 @@ def _optimizer(learning_rate=1e-6):
         # FIXME (liuwei12)
         # the new memory optimize strategy will crash this unittest
         # add enable_inplace=False here to force pass the unittest
-        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
+        not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             fuse_elewise_add_act_ops=False,
             use_ir_memory_optimize=False,
             enable_inplace=False,
             optimizer=_optimizer)
-        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
+        fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
+            feed_dict={
+                "image": img,
+                "label": label
+            },
             use_device=use_device,
             fuse_elewise_add_act_ops=True,
             use_ir_memory_optimize=False,

python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py

Lines changed: 48 additions & 33 deletions
@@ -24,6 +24,7 @@


 class TestFuseOptimizationOps(TestParallelExecutorBase):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
@@ -41,14 +42,14 @@ def _compare_fused_optimizer_ops(self,
         if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return

-        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
+        not_fuse_op_first_loss, not_fuse_op_last_loss, _ = self.check_network_convergence(
            model,
            feed_dict=feed_dict,
            get_data_from_feeder=get_data_from_feeder,
            use_device=use_device,
            fuse_all_optimizer_ops=False,
            optimizer=optimizer)
-        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
+        fuse_op_first_loss, fuse_op_last_loss, _ = self.check_network_convergence(
            model,
            feed_dict=feed_dict,
            get_data_from_feeder=get_data_from_feeder,
@@ -63,36 +64,41 @@ def _compare_fused_optimizer_ops(self,

     def _decorate_compare_fused_optimizer_ops(self, model, use_device,
                                               optimizer):
-        self._compare_fused_optimizer_ops(
-            model,
-            use_device,
-            feed_dict=self._get_feed_dict(),
-            optimizer=optimizer)
+        self._compare_fused_optimizer_ops(model,
+                                          use_device,
+                                          feed_dict=self._get_feed_dict(),
+                                          optimizer=optimizer)


 class TestFuseAdamOps(TestFuseOptimizationOps):
+
     def optimizer(self, learning_rate=1e-4):
         return fluid.optimizer.Adam(learning_rate=learning_rate)

     def test_batchnorm_fc_with_fuse_op(self):
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
+                                                   DeviceType.CUDA,
+                                                   optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
+                                                   DeviceType.CPU,
+                                                   optimizer=self.optimizer)


 class TestFuseSGDOps(TestFuseAdamOps):
+
     def optimizer(self, learning_rate=1e-3):
         return fluid.optimizer.SGD(learning_rate=learning_rate)


 class TestFuseMomentumOps(TestFuseAdamOps):
+
     def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.Momentum(
-            learning_rate=learning_rate, momentum=0.1)
+        return fluid.optimizer.Momentum(learning_rate=learning_rate,
+                                        momentum=0.1)


 class TestSpareFuseAdamOps(TestFuseOptimizationOps):
+
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
@@ -120,24 +126,29 @@ def optimizer(self, learning_rate=1e-4):

     def test_simple_bow_net_with_fuse_op(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
-        self._decorate_compare_fused_optimizer_ops(
-            model, DeviceType.CUDA, optimizer=self.optimizer)
-        self._decorate_compare_fused_optimizer_ops(
-            model, DeviceType.CPU, optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(model,
+                                                   DeviceType.CUDA,
+                                                   optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(model,
+                                                   DeviceType.CPU,
+                                                   optimizer=self.optimizer)


 class TestSpareFuseSGDOps(TestSpareFuseAdamOps):
+
     def optimizer(self, learning_rate=1e-3):
         return fluid.optimizer.SGD(learning_rate=learning_rate)


 class TestSpareFuseMomentumOps(TestSpareFuseAdamOps):
+
     def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.Momentum(
-            learning_rate=learning_rate, momentum=0.1)
+        return fluid.optimizer.Momentum(learning_rate=learning_rate,
+                                        momentum=0.1)


 class TestPassConflictBase(TestFuseAdamOps):
+
     def _compare_fused_optimizer_ops(self,
                                      model,
                                      use_device,
@@ -147,36 +158,40 @@ def _compare_fused_optimizer_ops(self,
         if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return

-        self.check_pass_conflict(
-            model,
-            feed_dict=feed_dict,
-            get_data_from_feeder=get_data_from_feeder,
-            use_device=use_device,
-            fuse_all_optimizer_ops=True,
-            optimizer=optimizer,
-            enable_sequential_execution=True)
+        self.check_pass_conflict(model,
+                                 feed_dict=feed_dict,
+                                 get_data_from_feeder=get_data_from_feeder,
+                                 use_device=use_device,
+                                 fuse_all_optimizer_ops=True,
+                                 optimizer=optimizer,
+                                 enable_sequential_execution=True)


 class TestFuseAdamOpsPassConflict(TestPassConflictBase):
+
     def optimizer(self, learning_rate=1e-4):
         return fluid.optimizer.Adam(learning_rate=learning_rate)

     def test_batchnorm_fc_with_fuse_op(self):
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
-        self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
+                                                   DeviceType.CPU,
+                                                   optimizer=self.optimizer)
+        self._decorate_compare_fused_optimizer_ops(fc_with_batchnorm,
+                                                   DeviceType.CUDA,
+                                                   optimizer=self.optimizer)


 class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):
+
     def optimizer(self, learning_rate=1e-3):
         return fluid.optimizer.SGD(learning_rate=learning_rate)


 class TestFuseMomentumOpsPassConflict(TestFuseAdamOpsPassConflict):
+
     def optimizer(self, learning_rate=1e-3):
-        return fluid.optimizer.Momentum(
-            learning_rate=learning_rate, momentum=0.1)
+        return fluid.optimizer.Momentum(learning_rate=learning_rate,
+                                        momentum=0.1)


 if __name__ == '__main__':
