Skip to content

Commit 8893cf1

Browse files
author
yi.wu
committed
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fluid_benchmark_support_recordioreader
2 parents 8d14b39 + df87e63 commit 8893cf1

34 files changed

+552
-219
lines changed

Dockerfile

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/
2424

2525
RUN apt-get update && \
2626
apt-get install -y --allow-downgrades \
27-
git python-pip python-dev openssh-server bison \
27+
git python-pip python-dev python-opencv openssh-server bison \
2828
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
2929
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
3030
curl sed grep graphviz libjpeg-dev zlib1g-dev \
@@ -76,8 +76,7 @@ RUN easy_install -U pip && \
7676
pip install sphinx-rtd-theme==0.1.9 recommonmark
7777

7878
RUN pip install pre-commit 'ipython==5.3.0' && \
79-
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
80-
pip install opencv-python
79+
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
8180

8281
#For docstring checker
8382
RUN pip install pylint pytest astroid isort

benchmark/fluid/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
2-
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop
2+
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
33
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
44
RUN pip install -U pip
5-
RUN pip install -U kubernetes opencv-python paddlepaddle
5+
RUN pip install -U kubernetes paddlepaddle
66

77
# IMPORTANT:
88
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.

benchmark/fluid/fluid_benchmark.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,11 @@ def parse_args():
7575
default=1,
7676
help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
7777
# this option is available only for vgg and resnet.
78+
parser.add_argument(
79+
'--cpus',
80+
type=int,
81+
default=1,
82+
help='If cpus > 1, will use ParallelDo to run, else use Executor.')
7883
parser.add_argument(
7984
'--data_set',
8085
type=str,
@@ -91,8 +96,8 @@ def parse_args():
9196
help='If set, use nvprof for CUDA.')
9297
parser.add_argument(
9398
'--no_test',
94-
action='store_false',
95-
help='If set, test the testset during training.')
99+
action='store_true',
100+
help='If set, do not test the testset during training.')
96101
parser.add_argument(
97102
'--memory_optimize',
98103
action='store_true',
@@ -266,9 +271,9 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
266271
print("Pass: %d, Iter: %d, Loss: %f\n" %
267272
(pass_id, iters, np.mean(train_losses)))
268273
print_train_time(start_time, time.time(), num_samples)
269-
print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses)))
274+
print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
270275
# evaluation
271-
if not args.no_test and batch_acc != None:
276+
if not args.no_test and batch_acc:
272277
pass_test_acc = test(exe, infer_prog, test_reader, feeder,
273278
batch_acc)
274279
print(", Test Accuracy: %f" % pass_test_acc)
@@ -366,7 +371,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
366371
if args.use_reader_op:
367372
num_samples = num_samples * args.gpus
368373
print_train_time(start_time, time.time(), num_samples)
369-
if not args.no_test and batch_acc != None:
374+
if not args.no_test and batch_acc:
370375
test_acc = test(startup_exe, infer_prog, test_reader, feeder,
371376
batch_acc)
372377
print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))

benchmark/fluid/models/mnist.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -84,15 +84,30 @@ def get_model(args):
8484
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
8585
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
8686

87-
# Train program
88-
predict = cnn_model(images)
89-
cost = fluid.layers.cross_entropy(input=predict, label=label)
90-
avg_cost = fluid.layers.mean(x=cost)
91-
92-
# Evaluator
93-
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
94-
batch_acc = fluid.layers.accuracy(
95-
input=predict, label=label, total=batch_size_tensor)
87+
if args.device == 'CPU' and args.cpus > 1:
88+
places = fluid.layers.get_places(args.cpus)
89+
pd = fluid.layers.ParallelDo(places)
90+
with pd.do():
91+
predict = cnn_model(pd.read_input(images))
92+
label = pd.read_input(label)
93+
cost = fluid.layers.cross_entropy(input=predict, label=label)
94+
avg_cost = fluid.layers.mean(x=cost)
95+
batch_acc = fluid.layers.accuracy(input=predict, label=label)
96+
97+
pd.write_output(avg_cost)
98+
pd.write_output(batch_acc)
99+
100+
avg_cost, batch_acc = pd()
101+
avg_cost = fluid.layers.mean(avg_cost)
102+
batch_acc = fluid.layers.mean(batch_acc)
103+
else:
104+
# Train program
105+
predict = cnn_model(images)
106+
cost = fluid.layers.cross_entropy(input=predict, label=label)
107+
avg_cost = fluid.layers.mean(x=cost)
108+
109+
# Evaluator
110+
batch_acc = fluid.layers.accuracy(input=predict, label=label)
96111

97112
# inference program
98113
inference_program = fluid.default_main_program().clone()

benchmark/fluid/models/resnet.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -166,18 +166,32 @@ def get_model(args):
166166
input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
167167
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
168168

169-
predict = model(input, class_dim)
170-
cost = fluid.layers.cross_entropy(input=predict, label=label)
171-
avg_cost = fluid.layers.mean(x=cost)
172-
173-
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
174-
batch_acc = fluid.layers.accuracy(
175-
input=predict, label=label, total=batch_size_tensor)
169+
if args.device == 'CPU' and args.cpus > 1:
170+
places = fluid.layers.get_places(args.cpus)
171+
pd = fluid.layers.ParallelDo(places)
172+
with pd.do():
173+
predict = model(pd.read_input(input), class_dim)
174+
label = pd.read_input(label)
175+
cost = fluid.layers.cross_entropy(input=predict, label=label)
176+
avg_cost = fluid.layers.mean(x=cost)
177+
batch_acc = fluid.layers.accuracy(input=predict, label=label)
178+
179+
pd.write_output(avg_cost)
180+
pd.write_output(batch_acc)
181+
182+
avg_cost, batch_acc = pd()
183+
avg_cost = fluid.layers.mean(avg_cost)
184+
batch_acc = fluid.layers.mean(batch_acc)
185+
else:
186+
predict = model(input, class_dim)
187+
cost = fluid.layers.cross_entropy(input=predict, label=label)
188+
avg_cost = fluid.layers.mean(x=cost)
189+
batch_acc = fluid.layers.accuracy(input=predict, label=label)
176190

177191
inference_program = fluid.default_main_program().clone()
178192
with fluid.program_guard(inference_program):
179193
inference_program = fluid.io.get_inference_program(
180-
target_vars=[batch_acc, batch_size_tensor])
194+
target_vars=[batch_acc])
181195

182196
optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
183197

benchmark/fluid/models/stacked_dynamic_lstm.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,8 @@ def gate_common(
104104
loss = fluid.layers.mean(x=loss)
105105

106106
# add acc
107-
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
108107
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
109-
shape=[1], dtype='int64'), total=batch_size_tensor)
108+
shape=[1], dtype='int64'))
110109

111110
inference_program = fluid.default_main_program().clone()
112111
with fluid.program_guard(inference_program):

cmake/external/grpc.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ ExternalProject_Add(
4545
# checkout and clean other dirs under third_party
4646
# 4. remove .git, and package the directory.
4747
URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
48+
URL_MD5 "c9c58ee7d0e8929a63155af6a2ecdbd0"
4849
PREFIX ${GRPC_SOURCES_DIR}
4950
UPDATE_COMMAND ""
5051
CONFIGURE_COMMAND ""
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
## 堆内存分析和优化
2+
3+
计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放,随着程序的运行,占用的内存会越来越大。这一方面会影响程序自身的稳定性,可能让运行速度越来越慢,或者造成OOM(内存耗尽);另一方面甚至会影响运行程序的机器的稳定性,造成宕机。
4+
5+
6+
目前有很多内存泄漏分析工具,比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro)和[gperftools](https://gperftools.github.io/gperftools/)。
7+
8+
因为Fluid是用Python驱动C++ core来运行,valgrind直接分析非常困难,需要自己编译debug版本的、带valgrind支持的专用Python版本,而且输出的信息中大部分是Python自己的符号和调用信息,分析起来很困难,另外使用valgrind会让程序运行速度变得非常慢,所以不建议使用。
9+
10+
本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。
11+
12+
gperftools主要支持以下四个功能:
13+
14+
- thread-caching malloc
15+
- heap-checking using tcmalloc
16+
- heap-profiling using tcmalloc
17+
- CPU profiler
18+
19+
Paddle也提供了基于gperftools的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。
20+
21+
对于堆内存的分析,主要用到thread-caching malloc和heap-profiling using tcmalloc。
22+
23+
## 使用流程
24+
#### 环境
25+
本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev,该镜像的系统环境为Ubuntu 16.04.4 LTS。
26+
27+
#### 使用流程
28+
29+
- 安装google-perftools
30+
31+
```
32+
apt-get install libunwind-dev
33+
apt-get install google-perftools
34+
```
35+
36+
- 安装pprof
37+
38+
```
39+
go get -u github.com/google/pprof
40+
```
41+
42+
- 设置运行环境
43+
44+
```
45+
export PPROF_PATH=/root/gopath/bin/pprof
46+
export PPROF_BINARY_PATH=/root/gopath/bin/pprof
47+
export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
48+
```
49+
50+
- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。
51+
52+
```
53+
# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀
54+
# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少字节内存就dump一次,默认1GB
55+
env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py
56+
```
57+
58+
随着程序的运行,会在perf_log这个文件夹下生成很多文件,如下:
59+
60+
```
61+
-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0001.heap
62+
-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0002.heap
63+
-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0003.heap
64+
-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0004.heap
65+
-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0005.heap
66+
-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0006.heap
67+
```
68+
69+
- 使用pprof对heap文件进行分析。分析有两种模式:
70+
- 完整模式。会对当前heap做一个分析,显示目前分配内存的一些调用路径。
71+
72+
```
73+
pprof --pdf python test.log.0012.heap
74+
```
75+
上述命令会生成一个profile00x.pdf的文件,可以直接打开,例如:[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出,在CPU版本fluid的运行过程中,分配存储最多的模块是CPUAllocator,而别的模块相对而言分配内存较少,所以被忽略了。这对于分析内存泄漏是很不方便的,因为泄漏是一个缓慢的过程,在这种图中是无法看到的。
76+
77+
![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png)
78+
79+
- Diff模式。可以对两个时刻的heap做diff,把一些内存分配没有发生变化的模块去掉,而把增量部分显示出来。
80+
```
81+
pprof --pdf --base test.log.0010.heap python test.log.1045.heap
82+
```
83+
生成的结果为:[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf)
84+
85+
从图中可以看出:ProgramDesc这个结构,在两次快照之间增长了200MB+,所以这里有很大的内存泄漏的可能性,最终结果也确实证明是这里造成了泄漏。
86+
87+
![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png)
88+
![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png)
89+

paddle/contrib/inference/demo/simple_on_word2vec.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,10 @@ void Main(bool use_gpu) {
6565
}
6666

6767
TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
68+
69+
#ifdef PADDLE_WITH_CUDA
6870
TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
71+
#endif
6972

7073
} // namespace demo
7174
} // namespace paddle

paddle/contrib/inference/paddle_inference_api.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ class PaddlePredictor {
6363
struct Config;
6464
PaddlePredictor() = default;
6565
PaddlePredictor(const PaddlePredictor&) = delete;
66+
PaddlePredictor& operator=(const PaddlePredictor&) = delete;
6667

6768
// Predict a record.
6869
// The caller should be responsible for allocating and releasing the memory of
@@ -76,7 +77,7 @@ class PaddlePredictor {
7677
virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
7778

7879
// Destroy the Predictor.
79-
virtual ~PaddlePredictor() {}
80+
virtual ~PaddlePredictor() = default;
8081

8182
// The common configs for all the predictors.
8283
struct Config {

0 commit comments

Comments
 (0)