
Commit 86af6bd

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/clean_blas
2 parents 49dedfa + 46c90ea commit 86af6bd


59 files changed (+917, −435 lines)

.travis.yml

Lines changed: 6 additions & 9 deletions
@@ -12,7 +12,7 @@ services:
 os:
   - linux
 env:
-  - JOB=build_doc
+  - JOB=doc
   - JOB=check_style
   - JOB=build_android
 addons:

@@ -36,21 +36,18 @@ addons:
       - ccache
   ssh_known_hosts: 13.229.163.131
 before_install:
-  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
-  # protobuf version.
   - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
-  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
+  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit
   - |
     function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
   - |
     # 43min timeout
-    if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android;
-    else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi;
-    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
+    if [[ "$JOB" != "doc" ]]; then timeout 2580 paddle/scripts/paddle_docker_build.sh ${JOB}; else paddle/scripts/paddle_build.sh ${JOB}; fi;
+    if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
   - |
-    if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
+    if [[ "$JOB" != "doc" ]]; then exit 0; fi;
+    # For document only
     if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
     if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
     export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh

AUTHORS.md

Lines changed: 5 additions & 0 deletions
@@ -18,7 +18,9 @@
 | hedaoyuan | Dao-Yuan He |
 | helinwang | He-Lin Wang |
 | jacquesqiao | Long-Fei Qiao |
+| jczaja | Jacek Czaja |
 | JiayiFeng | Jia-Yi Feng |
+| kbinias | Krzysztof Binias |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
 | lipeng-unisound | Peng Li |

@@ -27,17 +29,20 @@
 | llxxxll | Yong-Feng Liu |
 | luotao01 | Tao Luo |
 | lzhao4ever | Liang Zhao |
+| mozga-intel | Mateusz Ozga |
 | NHZlX | Zhao-Long Xing |
 | Noplz | Yuan Gao |
 | pakchoi | Chuan-Jiang Song |
 | panyx0718 | Xin Pan |
 | pengli09 | Peng Li |
 | pkuyym | Ya-Ming Yang |
+| pzelazko-intel | Pawel Zelazko |
 | QiJune | Jun Qi |
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
 | Superjom | Chun-Wei Yan |
 | tianbingsz | Tian-Bing Xu |
+| tpatejko | Tomasz Patejko |
 | typhoonzero | Yi Wu |
 | wanghaoshuang | Hao-Shuang Wang |
 | wangyang59 | Yang Wang |

Dockerfile

Lines changed: 2 additions & 3 deletions
@@ -1,7 +1,6 @@
 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
-
-# When you modify it, please be aware of cudnn-runtime version
+# When you modify it, please be aware of cudnn-runtime version
 # and libcudnn.so.x in paddle/scripts/docker/build.sh
 FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
 MAINTAINER PaddlePaddle Authors <[email protected]>

@@ -24,7 +23,7 @@ ENV HOME /root
 COPY ./paddle/scripts/docker/root/ /root/

 RUN apt-get update && \
-    apt-get install -y \
+    apt-get install -y --allow-downgrades \
     git python-pip python-dev openssh-server bison \
     libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \

cmake/cuda.cmake

Lines changed: 2 additions & 0 deletions
@@ -172,6 +172,8 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
 list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
 list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+# in cuda9, suppress cuda warning on eigen
+list(APPEND CUDA_NVCC_FLAGS "-w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
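For context on the `--expt-relaxed-constexpr` flag kept here: it allows device code to call `constexpr` functions that are not annotated `__device__`, which Eigen's headers rely on. Below is a minimal standalone sketch, not taken from the Paddle tree; the names `half_of` and `scale_kernel` are made up for illustration.

// Sketch: compiles with `nvcc --expt-relaxed-constexpr`; without the flag,
// calling a host-only constexpr function from device code is rejected.
#include <cuda_runtime.h>

constexpr float half_of(float x) { return x * 0.5f; }  // no __device__ annotation

__global__ void scale_kernel(float* out) {
  // Allowed only because --expt-relaxed-constexpr treats constexpr functions
  // as callable from device code.
  out[threadIdx.x] = half_of(static_cast<float>(threadIdx.x));
}

int main() {
  float* d = nullptr;
  cudaMalloc(&d, 32 * sizeof(float));
  scale_kernel<<<1, 32>>>(d);
  cudaDeviceSynchronize();
  cudaFree(d);
  return 0;
}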

cmake/external/eigen.cmake

Lines changed: 3 additions & 1 deletion
@@ -22,7 +22,9 @@ else()
     extern_eigen3
     ${EXTERNAL_PROJECT_LOG_ARGS}
     GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
-    GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10
+    # eigen on cuda9.1 missing header of math_funtions.hpp
+    # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
+    GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c
     PREFIX ${EIGEN_SOURCE_DIR}
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""

cmake/external/warpctc.cmake

Lines changed: 1 addition & 2 deletions
@@ -38,8 +38,7 @@ ENDIF()
 ExternalProject_Add(
     extern_warpctc
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY "https://github.com/gangliao/warp-ctc.git"
-    GIT_TAG b63a0644654a3e0ed624c85a1767bc8193aead09
+    GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git"
     PREFIX ${WARPCTC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

doc/fluid/design/algorithm/parameter_average.md

Lines changed: 5 additions & 5 deletions
@@ -1,7 +1,7 @@
 # Averaging Parameter in PaddlePaddle

 ## Why Averaging
-In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable if we can obtain the optimal values of parameters by going through the data in as few passes as we can.
+In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible.

 Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.


@@ -16,16 +16,16 @@ We propose averaging for any optimizer similar to how ASGD performs it, as menti
 ### How to perform Parameter Averaging in PaddlePaddle

 Parameter Averaging in PaddlePaddle works in the following way during training :
-1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer
+1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer
 2. The optimizer itself is responsible for updating the parameters.
 3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
-    1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches.
-    2. However, saving all the N instances of the parameters in memory is not feasible.
+    1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches.
+    2. However, saving all N instances of the parameters in memory is not feasible.
     3. Therefore, an approximation algorithm is used.

 Hence, overall we have have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved.

-During the testing/ saving the model phase, we perform the following steps:
+During the testing/saving the model phase, we perform the following steps:
 1. Perform the delayed operations.
 2. Save current values of the parameters to a temporary variable.
 3. Replace the values of the parameters with the averaged values.
paddle/cuda/src/hl_cuda_lstm.cu

Lines changed: 5 additions & 5 deletions
@@ -344,9 +344,9 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
   int addr = idx % 32;
 #pragma unroll
   for (int k = 1; k < 32; k++) {
-    // rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32);
-    addr = __shfl(addr, (idx + 1) % 32, 32);
-    a[k] = __shfl(a[k], addr, 32);
+    // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32);
+    addr = __shfl_sync(addr, (idx + 1) % 32, 32);
+    a[k] = __shfl_sync(a[k], addr, 32);
   }

 #pragma unroll

@@ -362,8 +362,8 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
   addr = (32 - idx) % 32;
 #pragma unroll
   for (int k = 0; k < 32; k++) {
-    a[k] = __shfl(a[k], addr, 32);
-    addr = __shfl(addr, (idx + 31) % 32, 32);
+    a[k] = __shfl_sync(a[k], addr, 32);
+    addr = __shfl_sync(addr, (idx + 31) % 32, 32);
   }
 }
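For reference on this rename (here and in hl_top_k.cu below): CUDA 9 deprecated the implicitly synchronizing `__shfl` in favor of `__shfl_sync`, whose documented form takes a mask of participating lanes as its first argument, i.e. `__shfl_sync(mask, var, srcLane, width)`. A minimal standalone sketch of the CUDA 9 API, independent of the Paddle kernels above:

// Sketch: broadcast lane 0's value to the whole warp with the CUDA 9 shuffle API.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void broadcast_lane0() {
  int value = threadIdx.x;
  // Every participating lane passes the same member mask; 0xffffffff means the
  // full warp takes part. Lane 0's value is broadcast to all 32 lanes.
  int from_lane0 = __shfl_sync(0xffffffffu, value, 0, 32);
  if (threadIdx.x == 1) printf("lane 1 received %d\n", from_lane0);
}

int main() {
  broadcast_lane0<<<1, 32>>>();
  cudaDeviceSynchronize();
  return 0;
}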

paddle/cuda/src/hl_top_k.cu

Lines changed: 1 addition & 1 deletion
@@ -250,7 +250,7 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK,
       }
     }
     if (maxId[0] / 32 == warp) {
-      if (__shfl(beam, (maxId[0]) % 32, 32) == maxLength) break;
+      if (__shfl_sync(beam, (maxId[0]) % 32, 32) == maxLength) break;
     }
   }
 }

paddle/fluid/framework/details/scale_loss_grad_op_handle.cc

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ void ScaleLossGradOpHandle::RunImpl() {
                       ->stream();
     memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
                  platform::CPUPlace(), &coeff_, sizeof(float), stream);
+    VLOG(1) << place_ << "RUN Scale loss grad op";
   });
 #endif
 }
