Commit a529d79
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into gen_nccl_id_op
2 parents: 82c61db + b708ec0

File tree: 156 files changed (+4732, -2146 lines)


.travis.yml

Lines changed: 1 addition & 21 deletions

```diff
@@ -16,34 +16,14 @@ env:
     - JOB=check_style
     - JOB=build_android
 addons:
-  apt:
-    packages:
-      - gcc-4.8
-      - g++-4.8
-      - git
-      - build-essential
-      - python
-      - python-pip
-      - python2.7-dev
-      - python-wheel
-      - libboost-dev
-      - curl
-      - swig
-      - graphviz
-      - clang-format-3.8
-      - automake
-      - libtool
-      - ccache
   ssh_known_hosts: 13.229.163.131
 before_install:
-  - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
-  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit
   - |
     function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
   - |
     # 43min timeout
-    if [[ "$JOB" != "doc" ]]; then timeout 2580 paddle/scripts/paddle_docker_build.sh ${JOB}; else paddle/scripts/paddle_build.sh ${JOB}; fi;
+    paddle/scripts/paddle_docker_build.sh ${JOB}
     if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
   - |
     if [[ "$JOB" != "doc" ]]; then exit 0; fi;
```

README.md

Lines changed: 4 additions & 4 deletions

```diff
@@ -75,19 +75,19 @@ We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/g
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html)
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html)
 
   You can also run distributed training jobs on Kubernetes clusters.
 
-- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html)
+- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html)
 
   Our new API enables much shorter programs.
 
-- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html)
 
   We appreciate your contributions!
```
(new file; filename not shown in this view)

Lines changed: 21 additions & 0 deletions

```bash
#!/bin/bash

# Update to point to the source file.
VGG_SRC="vgg16_fluid.py"

export TRAINING_ROLE=PSERVER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &

# Need to wait for the ps to start first.
sleep 10
echo "done start ps"

export TRAINING_ROLE=TRAINER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
```
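The script launches the same program three times with different environment variables: once as the parameter server and twice as trainers. A minimal, hypothetical sketch of how such a program can branch on those variables (the variable names are the ones the script exports; the branch bodies are placeholders, not Paddle's API):

```python
import os

def select_role():
    """Branch on the environment variables exported by the launcher script."""
    role = os.environ.get("TRAINING_ROLE", "TRAINER")
    ip = os.environ.get("POD_IP", "127.0.0.1")
    port = os.environ.get("PADDLE_INIT_PORT", "6174")
    trainers = int(os.environ.get("TRAINERS", "1"))
    if role == "PSERVER":
        # A real pserver would start serving parameters here.
        return "pserver listening on %s:%s" % (ip, port)
    # A real trainer would build and run the training program here.
    return "trainer, world size %d" % trainers

os.environ["TRAINING_ROLE"] = "PSERVER"
print(select_role())  # pserver listening on 127.0.0.1:6174
```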

benchmark/cluster/vgg16/vgg16_fluid.py

Lines changed: 8 additions & 9 deletions

```diff
@@ -200,18 +200,19 @@ def train_loop(exe, trainer_prog):
             num_samples += len(data)
             train_pass_acc.add(value=acc, weight=b_size)
             print(
-                "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
-                % (pass_id, iters, loss, acc,
-                   len(data) / (time.time() - ts))
+                "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
+                "Speed = %.2f img/s " % (args.task_index, pass_id, iters,
+                                         loss, acc,
+                                         len(data) / (time.time() - ts))
             )  # The accuracy is the accumulation of batches, but not the current batch.
 
         pass_elapsed = time.time() - start_time
         pass_train_acc = train_pass_acc.eval()
         pass_test_acc = test(exe)
-        print(
-            "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
-            % (pass_id, num_samples / pass_elapsed, pass_train_acc,
-               pass_test_acc))
+        print("Task:%d Pass = %d, Training performance = %f imgs/s, "
+              "Train accuracy = %f, Test accuracy = %f\n" %
+              (args.task_index, pass_id, num_samples / pass_elapsed,
+               pass_train_acc, pass_test_acc))
 
     if args.local:
         # Parameter initialization
@@ -239,8 +240,6 @@ def train_loop(exe, trainer_prog):
 
         t = fluid.DistributeTranspiler()
         t.transpile(
-            optimize_ops,
-            params_grads,
             trainer_id=args.task_index,
             pservers=args.ps_hosts,
             trainers=trainers)
```
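The rewritten print calls rely on Python's implicit concatenation of adjacent string literals, which lets one long format string be split across lines before the `%` operator is applied. A standalone illustration with made-up values:

```python
# Adjacent string literals merge at compile time, so the two quoted pieces
# below form a single format string with six placeholders.
msg = ("Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
       "Speed = %.2f img/s" % (0, 1, 100, 0.25, 0.9, 120.5))
print(msg)
# Task:0 Pass = 1, Iters = 100, Loss = 0.250000, Accuracy = 0.900000, Speed = 120.50 img/s
```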

contrib/float16/.gitignore

Lines changed: 1 addition & 0 deletions

```diff
@@ -0,0 +1 @@
+*.inference.model
```

contrib/float16/float16_benchmark.md

Lines changed: 97 additions & 0 deletions

# float16 benchmark

## Description
We compare float16 and float32 inference performance on the "image_classification" example on an Nvidia Tesla V100 GPU, where tensor core computation can be enabled in float16 mode. We test Vgg16 and Resnet50 on the imagenet data set, and Vgg16 and Resnet32 on the cifar10 data set. For completeness, we also add inference benchmarks of Vgg16 and Resnet50 on the imagenet data set tested on an Nvidia GeForce GTX 1080 Ti GPU.

For more details about tensor cores, please refer to https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/

## Test environment
- GPU: single Nvidia Tesla V100 or single Nvidia GeForce GTX 1080 Ti
- cuDNN: 7.1.1
- CUDA: 9.0
- Code: https://github.com/PaddlePaddle/Paddle/pull/10331 (tensor cores are enabled in float16 mode)

## Benchmark on V100
All times are in ms (milliseconds), averaged over 1000 iterations on a single Nvidia V100 GPU, for different mini-batch (mb) sizes.

### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224])

Total inference time for one batch:

|        |  mb=1 |  mb=2 |  mb=4 |  mb=8 | mb=16 | mb=32 |  mb=64 |
|--------|------:|------:|------:|------:|------:|------:|-------:|
|float32 | 14.01 |  9.70 | 22.99 | 28.26 | 53.87 | 84.42 | 178.95 |
|float16 |  3.32 |  4.11 |  5.88 |  9.41 | 16.54 | 30.47 |  60.23 |
|Speedup |  4.22 |  2.36 |  3.91 |  3.00 |  3.26 |  2.77 |   2.97 |

Total time spent on conv op for one batch:

|        |  mb=1 |  mb=2 |  mb=4 |  mb=8 | mb=16 | mb=32 |  mb=64 |
|--------|------:|------:|------:|------:|------:|------:|-------:|
|float32 | 11.95 |  6.96 | 18.65 | 21.42 | 41.35 | 60.58 | 130.11 |
|float16 |  1.78 |  2.10 |  2.93 |  4.55 |  7.99 | 14.63 |  28.67 |
|Speedup |  6.71 |  3.31 |  6.37 |  4.71 |  5.18 |  4.14 |   4.54 |

### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224])

Total inference time for one batch:

|        | mb=1 | mb=2 | mb=4 |  mb=8 | mb=16 | mb=32 | mb=64 | mb=128 |
|--------|-----:|-----:|-----:|------:|------:|------:|------:|-------:|
|float32 | 7.03 | 7.41 | 9.16 | 12.55 | 21.13 | 38.27 | 67.93 | 127.02 |
|float16 | 6.13 | 6.32 | 6.24 |  7.40 | 10.90 | 18.18 | 33.20 |  64.52 |
|Speedup | 1.15 | 1.17 | 1.47 |  1.70 |  1.94 |  2.11 |  2.05 |   1.97 |

Total time spent on conv op for one batch:

|        | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 |
|--------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|
|float32 | 5.43 | 5.46 | 6.50 | 8.36 | 13.80 | 24.45 | 41.21 |  73.44 |
|float16 | 4.19 | 4.30 | 3.96 | 4.21 |  5.63 |  8.77 | 15.24 |  28.40 |
|Speedup | 1.30 | 1.27 | 1.64 | 1.99 |  2.45 |  2.79 |  2.70 |   2.59 |

### Vgg16 on cifar10 (image.shape = [3, 32, 32])

Total inference time for one batch:

|        | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 |
|--------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:|
|float32 | 3.13 | 3.17 | 3.19 | 3.58 |  3.98 |  6.23 |  8.42 |  13.44 |  24.19 |  44.97 |
|float16 | 2.72 | 2.77 | 2.76 | 2.88 |  2.96 |  3.24 |  4.01 |   5.78 |   9.65 |  17.37 |
|Speedup | 1.15 | 1.14 | 1.16 | 1.24 |  1.34 |  1.92 |  2.10 |   2.33 |   2.51 |   2.59 |

### Resnet32 on cifar10 (image.shape = [3, 32, 32])

Total inference time for one batch (float16 is slower than float32 at small batch sizes here, so those Speedup cells are left blank):

|        | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 |
|--------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:|
|float32 | 3.11 | 3.14 | 2.99 | 3.04 |  3.10 |  3.28 |  4.47 |   6.86 |  11.63 |  21.16 |
|float16 | 3.70 | 3.81 | 3.75 | 3.83 |  3.77 |  3.97 |  3.92 |   4.15 |   6.41 |  11.02 |
|Speedup |      |      |      |      |       |       |  1.14 |   1.65 |   1.81 |   1.92 |

## Benchmark on 1080 Ti
All times are in ms (milliseconds), averaged over 1000 iterations on a single Nvidia GeForce GTX 1080 Ti GPU, for different mini-batch (mb) sizes.

### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224])

Total inference time for one batch:

|        | mb=1 | mb=2 |  mb=4 |  mb=8 | mb=16 |  mb=32 |
|--------|-----:|-----:|------:|------:|------:|-------:|
|float32 | 5.60 | 9.38 | 15.86 | 29.79 | 57.60 | 117.73 |
|float16 | 4.99 | 7.79 | 13.47 | 26.02 | 52.30 | 102.34 |
|Speedup | 1.12 | 1.20 |  1.18 |  1.15 |  1.10 |   1.15 |

### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224])

Total inference time for one batch:

|        | mb=1 | mb=2 | mb=4 |  mb=8 | mb=16 | mb=32 |  mb=64 |
|--------|-----:|-----:|-----:|------:|------:|------:|-------:|
|float32 | 5.63 | 6.23 | 8.85 | 14.71 | 26.07 | 52.86 | 108.95 |
|float16 | 5.89 | 6.44 | 7.94 | 12.57 | 22.03 | 45.06 |  92.68 |
|Speedup |      |      | 1.12 |  1.17 |  1.18 |  1.17 |   1.18 |
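The Speedup rows in these tables are simply the float32 time divided by the float16 time for the same mini-batch size. For example, the Vgg16-on-V100 mb=1 column:

```python
# Speedup = float32 time / float16 time (both in ms, same batch size).
fp32_ms = 14.01   # Vgg16 on V100, mb=1, float32 total inference time
fp16_ms = 3.32    # same configuration in float16
print(round(fp32_ms / fp16_ms, 2))  # 4.22, as reported in the table
```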
