
Commit cb7c124

Commit message: code optimized

2 parents: 2f44585 + ea408d5

140 files changed: +2782 additions, -1109 deletions


Dockerfile

Lines changed: 2 additions & 3 deletions
@@ -24,7 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 
 RUN apt-get update && \
     apt-get install -y --allow-downgrades \
-    git python-pip python-dev openssh-server bison \
+    git python-pip python-dev python-opencv openssh-server bison \
     libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev \
@@ -76,8 +76,7 @@ RUN easy_install -U pip && \
     pip install sphinx-rtd-theme==0.1.9 recommonmark
 
 RUN pip install pre-commit 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip install opencv-python
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
 
 #For docstring checker
 RUN pip install pylint pytest astroid isort
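The net effect of this change: OpenCV now comes from Ubuntu's `python-opencv` package at the apt layer instead of pip's `opencv-python`. A quick sanity check after building the image (the image tag below is illustrative, not from this commit):

```bash
# Assumes the image was built as paddle:dev; the tag is a placeholder.
docker run --rm paddle:dev python -c "import cv2; print(cv2.__version__)"
```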

benchmark/.gitignore

Lines changed: 3 additions & 0 deletions
@@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl
 caffe/image/logs
 tensorflow/image/logs
 tensorflow/rnn/logs
+fluid/models/*.pyc
+fluid/logs
+fluid/nohup.out

benchmark/fluid/Dockerfile

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
+RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
+RUN pip install -U pip
+RUN pip install -U kubernetes paddlepaddle
+
+# IMPORTANT:
+# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
+
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
+RUN pip uninstall -y paddlepaddle && mkdir /workspace
+
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
+
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD fluid_benchmark.py dataset.py models/ /workspace/
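Note the `ADD *.whl /` step: the build context must contain at least one PaddlePaddle "whl" package, or the build fails. A minimal sketch, with placeholder path and tag:

```bash
# Copy a downloaded or locally built PaddlePaddle wheel into the build
# context, then build the benchmark image. Path and tag are illustrative.
cp /path/to/paddlepaddle_gpu-*.whl .
docker build -t fluid-benchmark:latest .
```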

benchmark/fluid/README.md

Lines changed: 15 additions & 1 deletion
@@ -44,11 +44,25 @@ Currently supported `--model` argument include:
 
 ## Run Distributed Benchmark on Kubernetes Cluster
 
+You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
+have to start all those processes manually on each node, which is not recommended.
+
+To build the Docker image, you need to choose a paddle "whl" package to run with; you may either
+download it from
+http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
+build it on your own. Once you've got the "whl" package, put it under the current directory and run:
+
+```bash
+docker build -t [your docker image name]:[your docker image tag] .
+```
+
+Then push the image to a Docker registry that your Kubernetes cluster can reach.
+
 We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
 distributed benchmark jobs to your cluster. To generate a job yaml, just run:
 
 ```bash
-python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
 ```
 
 Then the yaml files are generated under directory `myjob`, you can run:
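The hunk ends before the README shows the submission command itself; presumably the generated yaml files are handed to `kubectl`. A hedged sketch only, since the exact command is outside this diff:

```bash
# Assumption: the yaml files generated under myjob/ are submitted with kubectl.
kubectl create -f myjob/
```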

benchmark/fluid/fluid_benchmark.py

Lines changed: 22 additions & 19 deletions
@@ -40,10 +40,7 @@ def parse_args():
     parser.add_argument(
         '--batch_size', type=int, default=32, help='The minibatch size.')
     parser.add_argument(
-        '--learning_rate',
-        type=float,
-        default=0.001,
-        help='The minibatch size.')
+        '--learning_rate', type=float, default=0.001, help='The learning rate.')
     # TODO(wuyi): add "--use_fake_data" option back.
     parser.add_argument(
         '--skip_batch_num',
@@ -72,6 +69,11 @@ def parse_args():
         type=int,
         default=1,
         help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+    parser.add_argument(
+        '--cpus',
+        type=int,
+        default=1,
+        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
     parser.add_argument(
         '--data_set',
         type=str,
@@ -88,8 +90,8 @@ def parse_args():
         help='If set, use nvprof for CUDA.')
     parser.add_argument(
         '--no_test',
-        action='store_false',
-        help='If set, test the testset during training.')
+        action='store_true',
+        help='If set, do not test the testset during training.')
     parser.add_argument(
         '--memory_optimize',
         action='store_true',
@@ -231,13 +233,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
                 train_losses.append(loss)
                 print("Pass: %d, Iter: %d, Loss: %f\n" %
                       (pass_id, iters, np.mean(train_losses)))
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses)))
+        print_train_time(start_time, time.time(), num_samples)
+        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
         # evaluation
-        if not args.no_test and batch_acc != None:
+        if not args.no_test and batch_acc:
             pass_test_acc = test(exe, infer_prog, test_reader, feeder,
                                  batch_acc)
             print(", Test Accuracy: %f" % pass_test_acc)
@@ -315,11 +314,8 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
             if batch_id % 1 == 0:
                 print("Pass %d, batch %d, loss %s" %
                       (pass_id, batch_id, np.array(loss)))
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        if not args.no_test and batch_acc != None:
+        print_train_time(start_time, time.time(), num_samples)
+        if not args.no_test and batch_acc:
             test_acc = test(startup_exe, infer_prog, test_reader, feeder,
                             batch_acc)
             print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
@@ -329,20 +325,27 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
 def print_arguments(args):
     vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
                                 vars(args)['device'] == 'GPU')
-    print('----------- resnet Configuration Arguments -----------')
+    print('----------- Configuration Arguments -----------')
     for arg, value in sorted(vars(args).iteritems()):
         print('%s: %s' % (arg, value))
     print('------------------------------------------------')
 
 
+def print_train_time(start_time, end_time, num_samples):
+    train_elapsed = end_time - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+          (num_samples, train_elapsed, examples_per_sec))
+
+
 def main():
     args = parse_args()
     print_arguments(args)
 
     # the unique trainer id, starting from 0, needed by trainer
     # only
     nccl_id_var, num_trainers, trainer_id = (
-        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))
+        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
 
     if args.use_cprof:
         pr = cProfile.Profile()
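The new `--cpus` flag selects the ParallelDo path on CPU. A hedged invocation using only flags visible in this diff (values are illustrative):

```bash
# Run the mnist benchmark on CPU with 4 ParallelDo places. Note that after
# this change --no_test *disables* evaluation rather than enabling it.
python fluid_benchmark.py --model mnist --device CPU --cpus 4 --batch_size 128
```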

benchmark/fluid/models/mnist.py

Lines changed: 24 additions & 9 deletions
@@ -69,15 +69,30 @@ def get_model(args):
     images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
-    # Train program
-    predict = cnn_model(images)
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    # Evaluator
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
+    if args.device == 'CPU' and args.cpus > 1:
+        places = fluid.layers.get_places(args.cpus)
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            predict = cnn_model(pd.read_input(images))
+            label = pd.read_input(label)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+            pd.write_output(avg_cost)
+            pd.write_output(batch_acc)
+
+        avg_cost, batch_acc = pd()
+        avg_cost = fluid.layers.mean(avg_cost)
+        batch_acc = fluid.layers.mean(batch_acc)
+    else:
+        # Train program
+        predict = cnn_model(images)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        # Evaluator
+        batch_acc = fluid.layers.accuracy(input=predict, label=label)
 
     # inference program
     inference_program = fluid.default_main_program().clone()

benchmark/fluid/models/resnet.py

Lines changed: 22 additions & 7 deletions
@@ -132,18 +132,33 @@ def get_model(args):
 
     input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    predict = model(input, class_dim)
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
 
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
+    if args.device == 'CPU' and args.cpus > 1:
+        places = fluid.layers.get_places(args.cpus)
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            predict = model(pd.read_input(input), class_dim)
+            label = pd.read_input(label)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+            pd.write_output(avg_cost)
+            pd.write_output(batch_acc)
+
+        avg_cost, batch_acc = pd()
+        avg_cost = fluid.layers.mean(avg_cost)
+        batch_acc = fluid.layers.mean(batch_acc)
+    else:
+        predict = model(input, class_dim)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        batch_acc = fluid.layers.accuracy(input=predict, label=label)
 
     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):
         inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
+            target_vars=[batch_acc])
 
     optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
 
benchmark/fluid/models/stacked_dynamic_lstm.py

Lines changed: 1 addition & 2 deletions
@@ -101,9 +101,8 @@ def gate_common(
     loss = fluid.layers.mean(x=loss)
 
     # add acc
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
     batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
-        shape=[1], dtype='int64'), total=batch_size_tensor)
+        shape=[1], dtype='int64'))
 
     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):

benchmark/fluid/run.sh

Lines changed: 23 additions & 19 deletions
@@ -2,6 +2,7 @@
 # This script benchmarking the PaddlePaddle Fluid on
 # single thread single GPU.
 
+mkdir -p logs
 #export FLAGS_fraction_of_gpu_memory_to_use=0.0
 export CUDNN_PATH=/paddle/cudnn_v5
 
@@ -35,71 +36,74 @@ nohup stdbuf -oL nvidia-smi \
     --format=csv \
     --filename=mem.log \
     -l 1 &
+
 # mnist
 # mnist gpu mnist 128
-FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+    --model=mnist \
     --device=GPU \
     --batch_size=128 \
     --skip_batch_num=5 \
     --iterations=500 \
-    2>&1 | tee -a mnist_gpu_128.log
+    2>&1 | tee -a logs/mnist_gpu_128.log
 
 # vgg16
 # gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+    --model=vgg16 \
    --device=GPU \
     --batch_size=128 \
     --skip_batch_num=5 \
     --iterations=30 \
-    2>&1 | tee -a vgg16_gpu_128.log
 
 # flowers gpu 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+    --model=vgg16 \
     --device=GPU \
     --batch_size=32 \
     --data_set=flowers \
     --skip_batch_num=5 \
     --iterations=30 \
-    2>&1 | tee -a vgg16_gpu_flowers_32.log
+    2>&1 | tee -a logs/vgg16_gpu_flowers_32.log
 
 # resnet50
 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+    --model=resnet \
     --device=GPU \
     --batch_size=128 \
     --data_set=cifar10 \
-    --model=resnet_cifar10 \
     --skip_batch_num=5 \
     --iterations=30 \
-    2>&1 | tee -a resnet50_gpu_128.log
+    2>&1 | tee -a logs/resnet50_gpu_128.log
 
 # resnet50 gpu flowers 64
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+    --model=resnet \
     --device=GPU \
     --batch_size=64 \
     --data_set=flowers \
-    --model=resnet_imagenet \
     --skip_batch_num=5 \
     --iterations=30 \
-    2>&1 | tee -a resnet50_gpu_flowers_64.log
+    2>&1 | tee -a logs/resnet50_gpu_flowers_64.log
 
 # lstm
 # lstm gpu imdb 32 # tensorflow only support batch=32
-FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+    --model=stacked_dynamic_lstm \
    --device=GPU \
    --batch_size=32 \
    --skip_batch_num=5 \
    --iterations=30 \
-    --hidden_dim=512 \
-    --emb_dim=512 \
-    --crop_size=1500 \
-    2>&1 | tee -a lstm_gpu_32.log
+    2>&1 | tee -a logs/lstm_gpu_32.log
 
 # seq2seq
 # seq2seq gpu wmb 128
-FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+    --model=machine_translation \
     --device=GPU \
     --batch_size=128 \
     --skip_batch_num=5 \
     --iterations=30 \
-    2>&1 | tee -a lstm_gpu_128.log
+    2>&1 | tee -a logs/lstm_gpu_128.log
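With this change every benchmark goes through the single `fluid_benchmark.py` entry point and all logs land under `logs/`. A hedged usage sketch:

```bash
# Launch the suite in the background and follow one log; the file name
# matches the corresponding tee target in run.sh.
bash run.sh > /dev/null 2>&1 &
tail -f logs/mnist_gpu_128.log
```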

cmake/external/grpc.cmake

Lines changed: 10 additions & 2 deletions
@@ -33,10 +33,19 @@ ELSE()
   SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin)
 ENDIF()
 
+# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them
 ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
-    URL "http://paddlepaddledeps.bj.bcebos.com/grpc.tar.xz"
+    # NOTE(wuyi):
+    # this package is generated by following steps:
+    # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
+    # 2. submodule update --init
+    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
+    #    checkout and clean other dirs under third_party
+    # 4. remove .git, and package the directory.
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
+    URL_MD5 "c9c58ee7d0e8929a63155af6a2ecdbd0"
     PREFIX ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""
@@ -49,7 +58,6 @@ ExternalProject_Add(
     INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
 )
 
-# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
 ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
     "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
