Skip to content

Commit d2ad4a5

Browse files
author
chengduo
authored
Init allocated memory for unit test (#11657)
* memory init * add env * refine announce * Add check for NaN * Debug * Add env for cc_test * Add env for py_test and nv_test * Remove py_test env * Add env for py_test * serial test_recognize_digits * Test FLAGS_init_allocated_mem function for unit test * Init allocated mem for op unit test * Add env for all unit test
1 parent 7b54f16 commit d2ad4a5

File tree

5 files changed

+38
-2
lines changed

5 files changed

+38
-2
lines changed

cmake/generic.cmake

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,7 @@ function(cc_test TARGET_NAME)
264264
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
265265
if (${cc_test_SERIAL})
266266
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
267+
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
267268
endif()
268269
endif()
269270
endfunction(cc_test)
@@ -328,6 +329,7 @@ function(nv_test TARGET_NAME)
328329
add_test(${TARGET_NAME} ${TARGET_NAME})
329330
if (nv_test_SERIAL)
330331
set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
332+
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
331333
endif()
332334
endif()
333335
endfunction(nv_test)
@@ -575,7 +577,7 @@ function(py_test TARGET_NAME)
575577
set(multiValueArgs SRCS DEPS ARGS ENVS)
576578
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
577579
add_test(NAME ${TARGET_NAME}
578-
COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
580+
COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
579581
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
580582
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
581583
endif()

paddle/fluid/memory/malloc.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ limitations under the License. */
2020
#include "paddle/fluid/memory/detail/system_allocator.h"
2121
#include "paddle/fluid/platform/gpu_info.h"
2222

23+
DEFINE_bool(init_allocated_mem, false,
24+
"It is a mistake that the values of the memory allocated by "
25+
"BuddyAllocator are always zeroed in some op's implementation. "
26+
"To find this error in time, we use init_allocated_mem to indicate "
27+
"that initializing the allocated memory with a small value "
28+
"during unit testing.");
2329
DECLARE_double(fraction_of_gpu_memory_to_use);
2430

2531
namespace paddle {
@@ -41,6 +47,9 @@ template <>
4147
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
4248
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
4349
void* p = GetCPUBuddyAllocator()->Alloc(size);
50+
if (FLAGS_init_allocated_mem) {
51+
memset(p, 0xEF, size);
52+
}
4453
VLOG(10) << " pointer=" << p;
4554
return p;
4655
}
@@ -104,6 +113,9 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
104113
LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
105114
platform::SetDeviceId(cur_dev);
106115
}
116+
if (FLAGS_init_allocated_mem) {
117+
cudaMemset(ptr, 0xEF, size);
118+
}
107119
return ptr;
108120
}
109121

@@ -137,6 +149,9 @@ void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
137149
LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
138150
<< " bytes in CUDAPinnedPlace";
139151
}
152+
if (FLAGS_init_allocated_mem) {
153+
memset(ptr, 0xEF, size);
154+
}
140155
return ptr;
141156
}
142157

python/paddle/fluid/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,8 @@ def __bootstrap__():
118118

119119
read_env_flags = [
120120
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
121-
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb'
121+
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
122+
'init_allocated_mem'
122123
]
123124
if core.is_compiled_with_cuda():
124125
read_env_flags += [

python/paddle/fluid/tests/unittests/parallel_executor_test_base.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
import paddle.fluid as fluid
1919
import time
2020
import numpy as np
21+
import math
22+
import sys
2123

2224
__all__ = ['TestParallelExecutorBase']
2325

@@ -93,6 +95,12 @@ def run_executor(exe, feed, fetch_list, program=None):
9395
print "%.4f Instance per second" % (
9496
(batch_size * iter + 2) / (end - begin))
9597

98+
avg_last_loss_val = np.array(last_loss).mean()
99+
avg_first_loss_val = np.array(first_loss).mean()
100+
if math.isnan(float(avg_last_loss_val)) or math.isnan(
101+
float(avg_first_loss_val)):
102+
sys.exit("got NaN loss, training failed.")
103+
96104
print first_loss, last_loss
97105
# self.assertGreater(first_loss[0], last_loss[0])
98106
return first_loss, last_loss

python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
import numpy as np
1717
import unittest
1818
import os
19+
import sys
20+
import math
1921

2022

2123
def simple_fc_net():
@@ -73,6 +75,14 @@ def check_network_convergence(self, use_cuda, build_strategy=None):
7375

7476
train_loss, = train_exe.run([loss.name], feed=feed_dict)
7577

78+
avg_test_loss_val = np.array(test_loss).mean()
79+
if math.isnan(float(avg_test_loss_val)):
80+
sys.exit("got NaN loss, testing failed.")
81+
82+
avg_train_loss_val = np.array(train_loss).mean()
83+
if math.isnan(float(avg_train_loss_val)):
84+
sys.exit("got NaN loss, training failed.")
85+
7686
self.assertTrue(
7787
np.allclose(
7888
train_loss, test_loss, atol=1e-8),

0 commit comments

Comments
 (0)