PaddlePaddle
diff --git a/‎benchmark/fluid/fluid_benchmark.py
Lines changed: 1 addition & 1 deletion b/‎benchmark/fluid/fluid_benchmark.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmark/fluid/kube_gen_job.py
Lines changed: 11 additions & 5 deletions b/‎benchmark/fluid/kube_gen_job.py
Lines changed: 11 additions & 5 deletions
diff --git a/‎cmake/external/mkldnn.cmake
Lines changed: 2 additions & 1 deletion b/‎cmake/external/mkldnn.cmake
Lines changed: 2 additions & 1 deletion
diff --git a/‎doc/fluid/howto/cluster/fluid_cluster_train_cn.md
Lines changed: 2 additions & 2 deletions b/‎doc/fluid/howto/cluster/fluid_cluster_train_cn.md
Lines changed: 2 additions & 2 deletions
diff --git a/‎doc/fluid/howto/cluster/fluid_recordio.md
Lines changed: 2 additions & 2 deletions b/‎doc/fluid/howto/cluster/fluid_recordio.md
Lines changed: 2 additions & 2 deletions
diff --git a/‎paddle/fluid/operators/activation_op.cc
Lines changed: 2 additions & 2 deletions b/‎paddle/fluid/operators/activation_op.cc
Lines changed: 2 additions & 2 deletions
diff --git a/‎paddle/fluid/operators/detection_map_op.cc
Lines changed: 6 additions & 6 deletions b/‎paddle/fluid/operators/detection_map_op.cc
Lines changed: 6 additions & 6 deletions
diff --git a/‎paddle/fluid/operators/gaussian_random_mkldnn_op.cc
Lines changed: 55 additions & 0 deletions b/‎paddle/fluid/operators/gaussian_random_mkldnn_op.cc
Lines changed: 55 additions & 0 deletions
diff --git a/‎paddle/fluid/operators/gaussian_random_op.cc
Lines changed: 19 additions & 2 deletions b/‎paddle/fluid/operators/gaussian_random_op.cc
Lines changed: 19 additions & 2 deletions
diff --git a/‎paddle/fluid/operators/math/concat.cu
Lines changed: 12 additions & 31 deletions b/‎paddle/fluid/operators/math/concat.cu
Lines changed: 12 additions & 31 deletions
@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args):
         return train_program, fluid.default_startup_program()
     else:
         raise ValueError(
-            'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
+            'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
         )
 
 
 
@@ -108,10 +108,10 @@ def gen_job():
     tn_container["ports"][0]["containerPort"] = spreadport
 
     envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
-    envs.append({"name": "TRAINERS", "value": str(args.trainers)})
+    envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
     envs.append({"name": "PSERVERS", "value": str(args.pservers)})
     envs.append({"name": "ENTRY", "value": args.entry})
-    envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
+    envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
     envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
     # NOTE: these directories below are cluster specific, please modify
     # this settings before you run on your own cluster.
@@ -167,16 +167,22 @@ def gen_job():
     tn_container["volumeMounts"] = volumeMounts
 
     ps_container["env"] = envs
-    ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"})
+    ps_container["env"].append({
+        "name": "PADDLE_TRAINING_ROLE",
+        "value": "PSERVER"
+    })
     tn_container["env"] = envs
     if args.disttype == "pserver":
         tn_container["env"].append({
-            "name": "TRAINING_ROLE",
+            "name": "PADDLE_TRAINING_ROLE",
             "value": "TRAINER"
         })
     elif args.disttype == "nccl2" or args.disttype == "local":
         # NCCL2 have no training role, set to plain WORKER
-        tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"})
+        tn_container["env"].append({
+            "name": "PADDLE_TRAINING_ROLE",
+            "value": "WORKER"
+        })
 
     os.mkdir(args.jobname)
     if args.disttype == "pserver":
 
@@ -45,7 +45,8 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
 ELSE()
     MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()
-SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-unused-result")
+SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result")
+SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
 SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
 SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
 ExternalProject_Add(
 
@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book
 
 第二步，启动Parameter Server：
 ```bash
-PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
+PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py
 ```
 执行命令后请等待出现提示： ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。
 
 第三步，启动Trainer：
 ```bash
-PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
+PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IPP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py
 ```
 由于我们定义的Trainer的数量是2个，因此需要在另外一个计算节点上再启动一个Trainer。
 
 
@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id):
            ret_list.append(f)
    return ret_list
 
-trainers = int(os.getenv("TRAINERS"))
-trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
+trainers = int(os.getenv("PADDLE_TRAINERS"))
+trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
 data_file = fluid.layers.io.open_files(
     filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
     thread_num=1,
 
@@ -143,7 +143,7 @@ Tanh Activation Operator.
 __attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC(
 TanhShrink Activation Operator.
 
-$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
+$$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
 )DOC";
 
@@ -385,7 +385,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 STanh Activation Operator.
 
-$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
+$$out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
 
 )DOC");
   }
 
@@ -175,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Detection mAP evaluate operator.
 The general steps are as follows. First, calculate the true positive and
- false positive according to the input of detection and labels, then
- calculate the mAP evaluate value.
- Supporting '11 point' and 'integral' mAP algorithm. Please get more information
- from the following articles:
- https://sanchom.wordpress.com/tag/average-precision/
- https://arxiv.org/abs/1512.02325
+false positive according to the input of detection and labels, then
+calculate the mAP evaluate value.
+Supporting '11 point' and 'integral' mAP algorithm. Please get more information
+from the following articles:
+https://sanchom.wordpress.com/tag/average-precision/
+https://arxiv.org/abs/1512.02325
 
 )DOC");
   }
 
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include "paddle/fluid/operators/mean_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::DataLayout;
+template <typename T>
+class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    float mean = context.Attr<float>("mean");
+    float std = context.Attr<float>("std");
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    std::minstd_rand engine;
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    engine.seed(seed);
+    std::normal_distribution<T> dist(mean, std);
+    int64_t size = tensor->numel();
+    for (int64_t i = 0; i < size; ++i) {
+      data[i] = dist(engine);
+    }
+
+    // The format of output is set as the mkldnn's format
+    // TODO(@mozga-intel) The format of matrix sets inside the another layers.
+    tensor->set_layout(DataLayout::kMKLDNN);
+    tensor->set_format(mkldnn::memory::format::oihw);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(gaussian_random, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::GaussianMKLDNNKernel<float>);
@@ -15,6 +15,10 @@ limitations under the License. */
 #include <random>
 #include "paddle/fluid/framework/op_registry.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -62,9 +66,20 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library{framework::LibraryType::kPlain};
+    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (library == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
+    }
+#endif
+
     return framework::OpKernelType(
         static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
-        ctx.device_context());
+        ctx.device_context(), layout, library);
   }
 };
 
@@ -95,7 +110,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(int, default 5(FP32)) "
                  "Output data type.")
         .SetDefault(framework::proto::VarType::FP32);
-
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 GaussianRandom Operator.
 
 
@@ -22,43 +22,24 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-template <typename T>
-__device__ T upper_bound(const T* first, T count, T val) {
-  const T* orig = first;
-  const T* it = nullptr;
-  T step = 0;
-  while (count > 0) {
-    it = first;
-    step = count / 2;
-    it += step;
-    if (!(val < *it)) {
-      first = ++it;
-      count -= step + 1;
-    } else {
-      count = step;
-    }
-  }
-  return first - orig;
-}
-
 template <typename T>
 __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
                              const int output_rows, const int output_cols,
                              T* output) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1;
-
-  int curr_offset = input_cols[segment];
-  int curr_segment = segment;
+  int curr_segment = 0;
+  int curr_offset = input_cols[0];
   for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
-    T curr_col_offset;
-    while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
+    int curr_col_offset = input_cols[curr_segment + 1];
+    while (curr_col_offset <= tid_x) {
       curr_offset = curr_col_offset;
       ++curr_segment;
+      curr_col_offset = input_cols[curr_segment + 1];
     }
 
     int local_col = tid_x - curr_offset;
     int segment_width = curr_col_offset - curr_offset;
+
     T* input_ptr = inputs[curr_segment];
     int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
     for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
@@ -89,14 +70,14 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
                                  const int in_col, const int* out_cols,
                                  int out_cols_size, T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int segment = upper_bound<int>(out_cols, out_cols_size, tid_x) - 1;
-  int curr_offset = out_cols[segment];
-  int curr_segment = segment;
+  int curr_segment = 0;
+  int curr_offset = out_cols[0];
   for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
-    T curr_col_offset;
-    while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) {
+    int curr_col_offset = out_cols[curr_segment + 1];
+    while (curr_col_offset <= tid_x) {
       curr_offset = curr_col_offset;
       ++curr_segment;
+      curr_col_offset = out_cols[curr_segment + 1];
     }
 
     int local_col = tid_x - curr_offset;
@@ -228,7 +209,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
-      int t_col = outputs->at(i)->numel() / out_row;
+      int t_col = ref_inputs.at(i)->numel() / out_row;
       if (sameShape) {
         if (t_col != out0_col) sameShape = false;
       }
Original file line number	Diff line number	Diff line change
`@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args):`
`97`	`97`	`return train_program, fluid.default_startup_program()`
`98`	`98`	`else:`
`99`	`99`	`raise ValueError(`
`100`		`- 'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'`
	`100`	`+ 'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'`
`101`	`101`	`)`
`102`	`102`
`103`	`103`