Skip to content

Commit 3d56786

Browse files
authored
Merge pull request #5247 from qingqing01/memory_alignment
Allocate aligned memory by posix_memalign.
2 parents 2649221 + e88e196 commit 3d56786

File tree

4 files changed

+17
-8
lines changed

4 files changed

+17
-8
lines changed

paddle/memory/detail/system_allocator.cc

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,16 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) {
4141

4242
index = 0; // unlock memory
4343

44-
void* p = malloc(size);
44+
void* p;
45+
46+
#ifdef PADDLE_USE_MKLDNN
47+
// refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
48+
// memory alignment
49+
PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0);
50+
#else
51+
PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0);
52+
#endif
53+
PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size);
4554

4655
if (p != nullptr) {
4756
if (FLAGS_use_pinned_memory) {

paddle/operators/nccl_op_test.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
185185
recv_tensor.numel() * sizeof(float),
186186
static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
187187

188-
for (size_t j = 0; j < f::product(kDims); ++j) {
188+
for (int64_t j = 0; j < f::product(kDims); ++j) {
189189
ASSERT_NEAR(ct[j], result, 1e-5);
190190
}
191191
}
@@ -234,7 +234,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
234234
recv_tensor.numel() * sizeof(float),
235235
static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
236236

237-
for (int j = 0; j < f::product(kDims); ++j) {
237+
for (int64_t j = 0; j < f::product(kDims); ++j) {
238238
ASSERT_NEAR(ct[j], result, 1e-5);
239239
}
240240
}
@@ -282,7 +282,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
282282
recv_tensor.numel() * sizeof(float),
283283
static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
284284

285-
for (size_t j = 0; j < f::product(kDims); ++j) {
285+
for (int64_t j = 0; j < f::product(kDims); ++j) {
286286
ASSERT_NEAR(ct[j], result, 1e-5);
287287
}
288288
}

paddle/operators/reshape_op.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class ReshapeOp : public framework::OperatorWithKernel {
3636
PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty.");
3737
auto x_dims = ctx->GetInputDim("X");
3838
// TODO(qiao) change batch_size
39-
for (int i = 1; i < shape.size(); ++i) {
39+
for (size_t i = 1; i < shape.size(); ++i) {
4040
PADDLE_ENFORCE(shape[i] > 0,
4141
"Each dimension of shape "
4242
"must be positiv except the first.");

paddle/operators/save_load_op_test.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ TEST(SaveLoadOp, CPU) {
3434

3535
tensor->set_lod(expect_lod);
3636
int* expect = tensor->mutable_data<int>(place);
37-
for (size_t i = 0; i < paddle::framework::product(tensor->dims()); ++i) {
37+
for (int64_t i = 0; i < tensor->numel(); ++i) {
3838
expect[i] = static_cast<int>(i);
3939
}
4040
paddle::framework::AttributeMap attrs;
@@ -50,7 +50,7 @@ TEST(SaveLoadOp, CPU) {
5050
"load", {}, {{"Out", {"out_var"}}}, attrs);
5151
load_op->Run(scope, ctx);
5252
int* actual = target->data<int>();
53-
for (size_t i = 0; i < paddle::framework::product(tensor->dims()); ++i) {
53+
for (int64_t i = 0; i < tensor->numel(); ++i) {
5454
EXPECT_EQ(expect[i], actual[i]);
5555
}
5656
auto& actual_lod = target->lod();
@@ -60,4 +60,4 @@ TEST(SaveLoadOp, CPU) {
6060
EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
6161
}
6262
}
63-
}
63+
}

0 commit comments

Comments (0)