Skip to content

Commit ba773fc

Browse files
committed
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_broadcast_test
2 parents e819f1b + ea408d5 commit ba773fc

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+798
-302
lines changed

Dockerfile

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/
2424

2525
RUN apt-get update && \
2626
apt-get install -y --allow-downgrades \
27-
git python-pip python-dev openssh-server bison \
27+
git python-pip python-dev python-opencv openssh-server bison \
2828
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
2929
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
3030
curl sed grep graphviz libjpeg-dev zlib1g-dev \
@@ -76,8 +76,7 @@ RUN easy_install -U pip && \
7676
pip install sphinx-rtd-theme==0.1.9 recommonmark
7777

7878
RUN pip install pre-commit 'ipython==5.3.0' && \
79-
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
80-
pip install opencv-python
79+
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
8180

8281
#For docstring checker
8382
RUN pip install pylint pytest astroid isort

benchmark/fluid/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
2-
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop
2+
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
33
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
44
RUN pip install -U pip
5-
RUN pip install -U kubernetes opencv-python paddlepaddle
5+
RUN pip install -U kubernetes paddlepaddle
66

77
# IMPORTANT:
88
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.

benchmark/fluid/fluid_benchmark.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,11 @@ def parse_args():
6969
type=int,
7070
default=1,
7171
help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
72+
parser.add_argument(
73+
'--cpus',
74+
type=int,
75+
default=1,
76+
help='If cpus > 1, will use ParallelDo to run, else use Executor.')
7277
parser.add_argument(
7378
'--data_set',
7479
type=str,
@@ -85,8 +90,8 @@ def parse_args():
8590
help='If set, use nvprof for CUDA.')
8691
parser.add_argument(
8792
'--no_test',
88-
action='store_false',
89-
help='If set, test the testset during training.')
93+
action='store_true',
94+
help='If set, do not test the testset during training.')
9095
parser.add_argument(
9196
'--memory_optimize',
9297
action='store_true',
@@ -229,9 +234,9 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
229234
print("Pass: %d, Iter: %d, Loss: %f\n" %
230235
(pass_id, iters, np.mean(train_losses)))
231236
print_train_time(start_time, time.time(), num_samples)
232-
print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses)))
237+
print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
233238
# evaluation
234-
if not args.no_test and batch_acc != None:
239+
if not args.no_test and batch_acc:
235240
pass_test_acc = test(exe, infer_prog, test_reader, feeder,
236241
batch_acc)
237242
print(", Test Accuracy: %f" % pass_test_acc)
@@ -310,7 +315,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
310315
print("Pass %d, batch %d, loss %s" %
311316
(pass_id, batch_id, np.array(loss)))
312317
print_train_time(start_time, time.time(), num_samples)
313-
if not args.no_test and batch_acc != None:
318+
if not args.no_test and batch_acc:
314319
test_acc = test(startup_exe, infer_prog, test_reader, feeder,
315320
batch_acc)
316321
print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))

benchmark/fluid/models/mnist.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -69,15 +69,30 @@ def get_model(args):
6969
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
7070
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
7171

72-
# Train program
73-
predict = cnn_model(images)
74-
cost = fluid.layers.cross_entropy(input=predict, label=label)
75-
avg_cost = fluid.layers.mean(x=cost)
76-
77-
# Evaluator
78-
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
79-
batch_acc = fluid.layers.accuracy(
80-
input=predict, label=label, total=batch_size_tensor)
72+
if args.device == 'CPU' and args.cpus > 1:
73+
places = fluid.layers.get_places(args.cpus)
74+
pd = fluid.layers.ParallelDo(places)
75+
with pd.do():
76+
predict = cnn_model(pd.read_input(images))
77+
label = pd.read_input(label)
78+
cost = fluid.layers.cross_entropy(input=predict, label=label)
79+
avg_cost = fluid.layers.mean(x=cost)
80+
batch_acc = fluid.layers.accuracy(input=predict, label=label)
81+
82+
pd.write_output(avg_cost)
83+
pd.write_output(batch_acc)
84+
85+
avg_cost, batch_acc = pd()
86+
avg_cost = fluid.layers.mean(avg_cost)
87+
batch_acc = fluid.layers.mean(batch_acc)
88+
else:
89+
# Train program
90+
predict = cnn_model(images)
91+
cost = fluid.layers.cross_entropy(input=predict, label=label)
92+
avg_cost = fluid.layers.mean(x=cost)
93+
94+
# Evaluator
95+
batch_acc = fluid.layers.accuracy(input=predict, label=label)
8196

8297
# inference program
8398
inference_program = fluid.default_main_program().clone()

benchmark/fluid/models/resnet.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -132,18 +132,33 @@ def get_model(args):
132132

133133
input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
134134
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
135-
predict = model(input, class_dim)
136-
cost = fluid.layers.cross_entropy(input=predict, label=label)
137-
avg_cost = fluid.layers.mean(x=cost)
138135

139-
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
140-
batch_acc = fluid.layers.accuracy(
141-
input=predict, label=label, total=batch_size_tensor)
136+
if args.device == 'CPU' and args.cpus > 1:
137+
places = fluid.layers.get_places(args.cpus)
138+
pd = fluid.layers.ParallelDo(places)
139+
with pd.do():
140+
predict = model(pd.read_input(input), class_dim)
141+
label = pd.read_input(label)
142+
cost = fluid.layers.cross_entropy(input=predict, label=label)
143+
avg_cost = fluid.layers.mean(x=cost)
144+
batch_acc = fluid.layers.accuracy(input=predict, label=label)
145+
146+
pd.write_output(avg_cost)
147+
pd.write_output(batch_acc)
148+
149+
avg_cost, batch_acc = pd()
150+
avg_cost = fluid.layers.mean(avg_cost)
151+
batch_acc = fluid.layers.mean(batch_acc)
152+
else:
153+
predict = model(input, class_dim)
154+
cost = fluid.layers.cross_entropy(input=predict, label=label)
155+
avg_cost = fluid.layers.mean(x=cost)
156+
batch_acc = fluid.layers.accuracy(input=predict, label=label)
142157

143158
inference_program = fluid.default_main_program().clone()
144159
with fluid.program_guard(inference_program):
145160
inference_program = fluid.io.get_inference_program(
146-
target_vars=[batch_acc, batch_size_tensor])
161+
target_vars=[batch_acc])
147162

148163
optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
149164

benchmark/fluid/models/stacked_dynamic_lstm.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,8 @@ def gate_common(
101101
loss = fluid.layers.mean(x=loss)
102102

103103
# add acc
104-
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
105104
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
106-
shape=[1], dtype='int64'), total=batch_size_tensor)
105+
shape=[1], dtype='int64'))
107106

108107
inference_program = fluid.default_main_program().clone()
109108
with fluid.program_guard(inference_program):

cmake/external/grpc.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ ExternalProject_Add(
4545
# checkout and clean other dirs under third_party
4646
# 4. remove .git, and package the directory.
4747
URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
48+
URL_MD5 "c9c58ee7d0e8929a63155af6a2ecdbd0"
4849
PREFIX ${GRPC_SOURCES_DIR}
4950
UPDATE_COMMAND ""
5051
CONFIGURE_COMMAND ""
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
## 堆内存分析和优化
2+
3+
计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放,随着程序的运行,占用的内存越来越大:一方面会影响程序自身的稳定性,可能让运行速度越来越慢,或者造成OOM(内存耗尽);另一方面甚至会影响运行程序的机器的稳定性,造成宕机。
4+
5+
6+
目前有很多内存泄漏分析工具,比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro)和[gperftools](https://gperftools.github.io/gperftools/)。
7+
8+
因为Fluid是用Python驱动C++ core来运行,valgrind直接分析非常困难,需要自己编译debug版本的、带valgrind支持的专用Python版本,而且输出的信息中大部分是Python自己的符号和调用信息,分析起来很困难,另外使用valgrind会让程序运行速度变得非常慢,所以不建议使用。
9+
10+
本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。
11+
12+
gperftools主要支持以下四个功能:
13+
14+
- thread-caching malloc
15+
- heap-checking using tcmalloc
16+
- heap-profiling using tcmalloc
17+
- CPU profiler
18+
19+
Paddle也提供了基于gperftools的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。
20+
21+
对于堆内存的分析,主要用到thread-caching malloc和heap-profiling using tcmalloc。
22+
23+
## 使用流程
24+
#### 环境
25+
本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev,基于Ubuntu 16.04.4 LTS环境。
26+
27+
#### 使用流程
28+
29+
- 安装google-perftools
30+
31+
```
32+
apt-get install libunwind-dev
33+
apt-get install google-perftools
34+
```
35+
36+
- 安装pprof
37+
38+
```
39+
go get -u github.com/google/pprof
40+
```
41+
42+
- 设置运行环境
43+
44+
```
45+
export PPROF_PATH=/root/gopath/bin/pprof
46+
export PPROF_BINARY_PATH=/root/gopath/bin/pprof
47+
export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
48+
```
49+
50+
- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。
51+
52+
```
53+
# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀
54+
# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少存储dump一次,默认1GB
55+
env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py
56+
```
57+
58+
随着程序的运行,会在perf_log这个文件夹下生成很多文件,如下:
59+
60+
```
61+
-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0001.heap
62+
-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0002.heap
63+
-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0003.heap
64+
-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0004.heap
65+
-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0005.heap
66+
-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0006.heap
67+
```
68+
69+
- 使用pprof对heap文件进行分析。分析有两种模式:
70+
- 完整模式。会对当前heap做一个分析,显示目前分配内存的一些调用路径。
71+
72+
```
73+
pprof --pdf python test.log.0012.heap
74+
```
75+
上述命令会生成一个profile00x.pdf的文件,可以直接打开,例如:[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出,在CPU版本fluid的运行过程中,分配存储最多的模块是CPUAllocator。而别的模块相对而言分配内存较少,所以被忽略了,这对于分析内存泄漏是很不方便的,因为泄漏是一个缓慢的过程,在这种图中是无法看到的。
76+
77+
![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png)
78+
79+
- Diff模式。可以对两个时刻的heap做diff,把一些内存分配没有发生变化的模块去掉,而把增量部分显示出来。
80+
```
81+
pprof --pdf --base test.log.0010.heap python test.log.1045.heap
82+
```
83+
生成的结果为:[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf)
84+
85+
从图中可以看出:ProgramDesc这个结构,在两个版本之间增长了200MB+,所以这里有很大的内存泄漏的可能性,最终结果也确实证明是这里造成了泄漏。
86+
87+
![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png)
88+
![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png)
89+

paddle/contrib/inference/demo/simple_on_word2vec.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,10 @@ void Main(bool use_gpu) {
6565
}
6666

6767
TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
68+
69+
#ifdef PADDLE_WITH_CUDA
6870
TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
71+
#endif
6972

7073
} // namespace demo
7174
} // namespace paddle

paddle/contrib/inference/paddle_inference_api.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ class PaddlePredictor {
6363
struct Config;
6464
PaddlePredictor() = default;
6565
PaddlePredictor(const PaddlePredictor&) = delete;
66+
PaddlePredictor& operator=(const PaddlePredictor&) = delete;
6667

6768
// Predict a record.
6869
// The caller should be responsible for allocating and releasing the memory of
@@ -76,7 +77,7 @@ class PaddlePredictor {
7677
virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
7778

7879
// Destroy the Predictor.
79-
virtual ~PaddlePredictor() {}
80+
virtual ~PaddlePredictor() = default;
8081

8182
// The common configs for all the predictors.
8283
struct Config {

0 commit comments

Comments
 (0)