PaddlePaddle
diff --git a/‎Dockerfile
Lines changed: 1 addition & 1 deletion b/‎Dockerfile
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmark/cluster/vgg16/run_vgg_dist.sh
Lines changed: 21 additions & 0 deletions b/‎benchmark/cluster/vgg16/run_vgg_dist.sh
Lines changed: 21 additions & 0 deletions
diff --git a/‎benchmark/cluster/vgg16/vgg16_fluid.py
Lines changed: 8 additions & 9 deletions b/‎benchmark/cluster/vgg16/vgg16_fluid.py
Lines changed: 8 additions & 9 deletions
diff --git a/‎paddle/capi/Matrix.cpp
Lines changed: 1 addition & 1 deletion b/‎paddle/capi/Matrix.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/‎paddle/cuda/include/hl_base.h
Lines changed: 18 additions & 6 deletions b/‎paddle/cuda/include/hl_base.h
Lines changed: 18 additions & 6 deletions
diff --git a/‎paddle/cuda/src/hl_cuda_lstm.cu
Lines changed: 9 additions & 5 deletions b/‎paddle/cuda/src/hl_cuda_lstm.cu
Lines changed: 9 additions & 5 deletions
diff --git a/‎paddle/cuda/src/hl_top_k.cu
Lines changed: 4 additions & 1 deletion b/‎paddle/cuda/src/hl_top_k.cu
Lines changed: 4 additions & 1 deletion
diff --git a/‎paddle/fluid/framework/details/multi_devices_graph_builder.cc
Lines changed: 5 additions & 5 deletions b/‎paddle/fluid/framework/details/multi_devices_graph_builder.cc
Lines changed: 5 additions & 5 deletions
diff --git a/‎paddle/fluid/framework/details/multi_devices_graph_builder.h
Lines changed: 2 additions & 2 deletions b/‎paddle/fluid/framework/details/multi_devices_graph_builder.h
Lines changed: 2 additions & 2 deletions
diff --git a/‎paddle/fluid/framework/lod_tensor_test.cc
Lines changed: 2 additions & 2 deletions b/‎paddle/fluid/framework/lod_tensor_test.cc
Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@ RUN apt-get update && \
     automake locales clang-format swig doxygen cmake  \
     liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools libtool && \
+    net-tools libtool ccache && \
     apt-get clean -y
 
 # Install Go and glide
 
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Update to point to the source file.
+VGG_SRC="vgg16_fluid.py"
+
+export TRAINING_ROLE=PSERVER
+export TRAINERS=2
+export POD_IP=127.0.0.1
+export PADDLE_INIT_PORT=6174
+MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
+
+# Need to wait for the ps to start first.
+sleep 10
+echo "done start ps"
+
+export TRAINING_ROLE=TRAINER
+export TRAINERS=2
+export POD_IP=127.0.0.1
+export PADDLE_INIT_PORT=6174
+CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
+CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
@@ -200,18 +200,19 @@ def train_loop(exe, trainer_prog):
                 num_samples += len(data)
                 train_pass_acc.add(value=acc, weight=b_size)
                 print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
-                    % (pass_id, iters, loss, acc,
-                       len(data) / (time.time() - ts))
+                    "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
+                    "Speed = %.2f img/s " % (args.task_index, pass_id, iters,
+                                             loss, acc,
+                                             len(data) / (time.time() - ts))
                 )  # The accuracy is the accumulation of batches, but not the current batch.
 
             pass_elapsed = time.time() - start_time
             pass_train_acc = train_pass_acc.eval()
             pass_test_acc = test(exe)
-            print(
-                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
-                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
-                   pass_test_acc))
+            print("Task:%d Pass = %d, Training performance = %f imgs/s, "
+                  "Train accuracy = %f, Test accuracy = %f\n" %
+                  (args.task_index, pass_id, num_samples / pass_elapsed,
+                   pass_train_acc, pass_test_acc))
 
     if args.local:
         # Parameter initialization
@@ -239,8 +240,6 @@ def train_loop(exe, trainer_prog):
 
         t = fluid.DistributeTranspiler()
         t.transpile(
-            optimize_ops,
-            params_grads,
             trainer_id=args.task_index,
             pservers=args.ps_hosts,
             trainers=trainers)
 
@@ -108,7 +108,7 @@ paddle_error paddle_matrix_get_row(paddle_matrix mat,
 paddle_error paddle_matrix_get_shape(paddle_matrix mat,
                                      uint64_t* height,
                                      uint64_t* width) {
-  if (mat == nullptr) return kPD_NULLPTR;
+  if (mat == nullptr || cast(mat)->mat == nullptr) return kPD_NULLPTR;
   if (height != nullptr) {
     *height = cast(mat)->mat->getHeight();
   }
 
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef HL_BASE_H_
-#define HL_BASE_H_
+#pragma once
 
 #include <cstddef>
 
@@ -207,8 +206,8 @@ typedef struct {
 
 #ifdef __NVCC__
 
-#include "cuda_runtime.h"
-#include "hl_cuda.h"
+#include <cuda_runtime.h>
+#include "paddle/cuda/include/hl_cuda.h"
 #include "paddle/utils/Logging.h"
 
 extern __thread bool g_sync_flag;
@@ -228,6 +227,19 @@ extern __thread cudaStream_t default_stream;
         << "CUDA error: " << hl_get_device_error_string((size_t)err); \
   }
 
-#endif /* __NVCC__ */
+// __shfl has been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T
+__shfl_sync(unsigned, T val, int src_line, int width) {
+  return __shfl(val, src_line, width);
+}
 
-#endif /* HL_BASE_H_ */
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
+#endif  // __NVCC__
@@ -341,12 +341,15 @@ void hl_lstm_parallel_forward(real *gateValue,
 }
 
 __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
-  int addr = idx % 32;
+  const int warp_size = 32;
+  int addr = idx % warp_size;
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, addr < warp_size);
 #pragma unroll
   for (int k = 1; k < 32; k++) {
     // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32);
-    addr = __shfl_sync(addr, (idx + 1) % 32, 32);
-    a[k] = __shfl_sync(a[k], addr, 32);
+    addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32);
+    a[k] = __shfl_sync(mask, a[k], addr, 32);
   }
 
 #pragma unroll
@@ -360,10 +363,11 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
   }
 
   addr = (32 - idx) % 32;
+  CREATE_SHFL_MASK(mask, idx % 32 < warp_size);
 #pragma unroll
   for (int k = 0; k < 32; k++) {
-    a[k] = __shfl_sync(a[k], addr, 32);
-    addr = __shfl_sync(addr, (idx + 31) % 32, 32);
+    a[k] = __shfl_sync(mask, a[k], addr, 32);
+    addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32);
   }
 }
 
 
@@ -244,13 +244,16 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK,
     if (--beamSize == 0) break;
     __syncthreads();
 
+    unsigned mask = 0u;
+    // CREATE_SHFL_MASK(mask, tid < len);
+
     if (tid == maxId[0]) {
       if (beam < maxLength) {
         shTopK[tid] = topK[beam];
       }
     }
     if (maxId[0] / 32 == warp) {
-      if (__shfl_sync(beam, (maxId[0]) % 32, 32) == maxLength) break;
+      if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break;
     }
   }
 }
 
@@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool skip_scale_loss,
+    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale,
     platform::NCCLContextMap *nccl_ctxs)
     : loss_var_name_(loss_var_name),
       places_(places),
@@ -45,15 +45,15 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool skip_scale_loss)
+    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale)
     : loss_var_name_(loss_var_name),
       places_(places),
       local_scopes_(local_scopes) {
 #endif
   for (auto &p : params) {
     grad_names_.insert(GradVarName(p));
   }
-  skip_scale_loss_ = skip_scale_loss;
+  use_default_grad_scale_ = use_default_grad_scale;
 }
 
 void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
@@ -126,8 +126,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     } else if (IsDistTrainOp(*op, send_op)) {
       CreateComputationalOps(&result, *op, 1);
     } else if (IsScaleLossOp(*op)) {
-      // user can customize loss@grad if skip_scale_loss_
-      if (!skip_scale_loss_) {
+      // user can customize loss@grad if not use_default_grad_scale_
+      if (use_default_grad_scale_) {
         CreateScaleLossGradOp(&result);
       }
       is_forwarding = false;
 
@@ -41,7 +41,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                           const std::string &loss_var_name,
                           const std::unordered_set<std::string> &params,
                           const std::vector<Scope *> &local_scopes,
-                          bool skip_scale_loss);
+                          bool use_default_grad_scale);
 #endif
 
   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
@@ -59,7 +59,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #ifdef PADDLE_WITH_CUDA
   platform::NCCLContextMap *nccl_ctxs_;
 #endif
-  bool skip_scale_loss_;
+  bool use_default_grad_scale_;
 
   bool IsScaleLossOp(const OpDesc &op) const;
 
 
@@ -255,11 +255,11 @@ TEST(LoDTensor, RecordIO) {
     std::unique_ptr<std::istream> stream_ptr(stream);
     recordio::Scanner scanner(std::move(stream_ptr));
     auto tensors = ReadFromRecordIO(&scanner, ctx);
-    ASSERT_EQ(tensors.size(), 2);
+    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
     tensors = ReadFromRecordIO(&scanner, ctx);
-    ASSERT_EQ(tensors.size(), 2);
+    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
   }
Original file line number	Diff line number	Diff line change
`@@ -108,7 +108,7 @@ paddle_error paddle_matrix_get_row(paddle_matrix mat,`
`108`	`108`	`paddle_error paddle_matrix_get_shape(paddle_matrix mat,`
`109`	`109`	`uint64_t* height,`
`110`	`110`	`uint64_t* width) {`
`111`		`- if (mat == nullptr) return kPD_NULLPTR;`
	`111`	`+ if (mat == nullptr \|\| cast(mat)->mat == nullptr) return kPD_NULLPTR;`
`112`	`112`	`if (height != nullptr) {`
`113`	`113`	`*height = cast(mat)->mat->getHeight();`
`114`	`114`	`}`
Original file line number	Diff line number	Diff line change
`@@ -341,12 +341,15 @@ void hl_lstm_parallel_forward(real *gateValue,`
`341`	`341`	`}`
`342`	`342`
`343`	`343`	`__device__ __forceinline__ void transpose_32x32(real a[], const int idx) {`
`344`		`- int addr = idx % 32;`
	`344`	`+ const int warp_size = 32;`
	`345`	`+ int addr = idx % warp_size;`
	`346`	`+ unsigned mask = 0u;`
	`347`	`+ CREATE_SHFL_MASK(mask, addr < warp_size);`
`345`	`348`	`#pragma unroll`
`346`	`349`	`for (int k = 1; k < 32; k++) {`
`347`	`350`	`// rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32);`
`348`		`- addr = __shfl_sync(addr, (idx + 1) % 32, 32);`
`349`		`- a[k] = __shfl_sync(a[k], addr, 32);`
	`351`	`+ addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32);`
	`352`	`+ a[k] = __shfl_sync(mask, a[k], addr, 32);`
`350`	`353`	`}`
`351`	`354`
`352`	`355`	`#pragma unroll`
`@@ -360,10 +363,11 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {`
`360`	`363`	`}`
`361`	`364`
`362`	`365`	`addr = (32 - idx) % 32;`
	`366`	`+ CREATE_SHFL_MASK(mask, idx % 32 < warp_size);`
`363`	`367`	`#pragma unroll`
`364`	`368`	`for (int k = 0; k < 32; k++) {`
`365`		`- a[k] = __shfl_sync(a[k], addr, 32);`
`366`		`- addr = __shfl_sync(addr, (idx + 31) % 32, 32);`
	`369`	`+ a[k] = __shfl_sync(mask, a[k], addr, 32);`
	`370`	`+ addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32);`
`367`	`371`	`}`
`368`	`372`	`}`
`369`	`373`
Original file line number	Diff line number	Diff line change
`@@ -244,13 +244,16 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK,`
`244`	`244`	`if (--beamSize == 0) break;`
`245`	`245`	`__syncthreads();`
`246`	`246`
	`247`	`+ unsigned mask = 0u;`
	`248`	`+ // CREATE_SHFL_MASK(mask, tid < len);`
	`249`	`+`
`247`	`250`	`if (tid == maxId[0]) {`
`248`	`251`	`if (beam < maxLength) {`
`249`	`252`	`shTopK[tid] = topK[beam];`
`250`	`253`	`}`
`251`	`254`	`}`
`252`	`255`	`if (maxId[0] / 32 == warp) {`
`253`		`- if (__shfl_sync(beam, (maxId[0]) % 32, 32) == maxLength) break;`
	`256`	`+ if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break;`
`254`	`257`	`}`
`255`	`258`	`}`
`256`	`259`	`}`