
Commit 00ad751

Use stream while memory::Copy in GPU mode
1 parent 74b283c commit 00ad751
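
This change makes the GPU branch of SeqExpandKernel pass the CUDA stream of the operator's device context to memory::Copy, so the device-to-device copy can be ordered with the rest of the op's work on that stream instead of being issued without a stream. As background, a minimal standalone CUDA sketch of the idea (illustrative only, not Paddle code; the function name is made up):

// Illustrative CUDA-only sketch, not Paddle code.
// Issuing the copy with cudaMemcpyAsync on the op's stream keeps it
// ordered with the kernels already queued on that stream, rather than
// going through the legacy default stream.
#include <cuda_runtime.h>

void CopyOnStream(float* dst, const float* src, size_t n,
                  cudaStream_t stream) {
  // Device-to-device copy, ordered after earlier work on `stream`.
  cudaMemcpyAsync(dst, src, n * sizeof(float),
                  cudaMemcpyDeviceToDevice, stream);
}

In the diff below, the same idea appears as the stream taken from context.device_context() and forwarded as the extra argument to memory::Copy in the GPU branch.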

2 files changed (+31, -9 lines)


paddle/operators/seq_expand_op.cc

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ class SeqExpandOp : public framework::OperatorWithKernel {
       out_dim[0] = out_dim[0] * repeat;
     }
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of PadOp should not be null.");
+                   "Output(Out) of SeqExpandOp should not be null.");
     ctx->SetOutputDim("Out", out_dim);
   }
 };

paddle/operators/seq_expand_op.h

Lines changed: 30 additions & 8 deletions
@@ -75,15 +75,37 @@ class SeqExpandKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
 
     // copy data
-    Place place = boost::get<Place>(context.GetPlace());
+    auto place = context.GetPlace();
     size_t count = 0;
-    for (size_t i = 0; i < scales.size(); ++i) {
-      count = element_len * (x_lod[0][i + 1] - x_lod[0][i]);
-      for (size_t j = 0; j < scales[i]; ++j) {
-        memory::Copy(place, out_data, place, x_data, sizeof(T) * count);
-        out_data += count;
+    if (platform::is_cpu_place(place)) {
+      auto& cpu_place = boost::get<platform::CPUPlace>(place);
+      for (size_t i = 0; i < scales.size(); ++i) {
+        count = element_len * (x_lod[0][i + 1] - x_lod[0][i]);
+        for (size_t j = 0; j < scales[i]; ++j) {
+          memory::Copy(cpu_place, out_data, cpu_place, x_data,
+                       sizeof(T) * count);
+          out_data += count;
+        }
+        x_data += count;
       }
-      x_data += count;
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto& gpu_place = boost::get<platform::GPUPlace>(place);
+      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                        context.device_context())
+                        .stream();
+      for (size_t i = 0; i < scales.size(); ++i) {
+        count = element_len * (x_lod[0][i + 1] - x_lod[0][i]);
+        for (size_t j = 0; j < scales[i]; ++j) {
+          memory::Copy(gpu_place, out_data, gpu_place, x_data,
+                       sizeof(T) * count, stream);
+          out_data += count;
+        }
+        x_data += count;
+      }
+#else
+      PADDLE_THROW("Paddle is not compiled with GPU");
+#endif
     }
 
     out->set_lod(out_lod);

@@ -113,7 +135,7 @@ class SeqExpandGradKernel : public framework::OpKernel<T> {
       Eigen::TensorMap<Eigen::Tensor<T, 1>> d_x_t(
           d_x_data, static_cast<int>((ele_count * element_len) / repeat));
       auto place = context.GetEigenDevice<Place>();
-      d_x_t.device(place) = d_out_t.sum(Eigen::array<int, 1>({0}));
+      d_x_t.device(place) = d_out_t.sum(Eigen::array<int, 1>({{0}}));
       d_out_data += (ele_count * element_len);
       d_x_data += ((ele_count * element_len) / repeat);
     }
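
Note on the second hunk: Eigen::array<int, 1> behaves like a std::array-style aggregate in the Eigen Tensor module, so {{0}} is the fully braced initialization of the same single reduction dimension; the single-brace {0} can trigger missing-braces warnings on some compilers, which is presumably the reason for the change. A small standalone sketch (assumes the Eigen Tensor headers are available; not Paddle code):

// Standalone sketch, not Paddle code.
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Outer braces initialize the aggregate, inner braces its underlying
  // array; the single entry names reduction dimension 0.
  Eigen::array<int, 1> reduce_dim = {{0}};
  return reduce_dim[0];
}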
