
Commit f72729d

reyoung authored and QiJune committed
Feature/rnn to array to lod tensor (#5411)
* Add LoDRankTable. LoDRankTable stores the `level` of `lod` ordered by sequence length in descending order. It is useful when implementing dynamic RNN and is shared by the dynamic RNN memory, dynamic RNN slice input, and dynamic RNN slice output operators.
* Add skeleton for array_to_lod_tensor and lod_tensor_to_array
* Add VarType::LoDTensorArray
* Add PyBind of LoDTensorArray
* Add InferVarType
* Add first unittest
* Add ut
* Add unittest
* Add unittest
* Add unittests
* update
* init
* add infershape for lod_tensor_to_array_op
* complete array_to_lod_tensor_op
* copy data
* clean code
* clean code
* Fix unittest data
* fix bugs
* fix compile error
* Refine TensorToArrayOp
* refactor array_to_lod_tensor
* Unittest
* fix bugs
* Fix unittest
* Fix unittest
* debug
* Debug
* Fix unittest
* clean code
* refactor
* use ostream
* update test
* fix gpu build error
* make gpu test pass
1 parent d9e5eba commit f72729d

14 files changed: +514 −47 lines
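
For orientation before the diffs: a minimal sketch of the rank-table idea the commit message describes, assuming only the Reset/items API visible in this commit and that LoDRankTable is default-constructible (the LoD values are illustrative).

#include "paddle/framework/lod_rank_table.h"

// Sketch: build a rank table from one LoD level. Each item records a
// sequence's original index and its length (vec[i + 1] - vec[i]); items
// are ordered by sequence length in descending order.
void rank_table_sketch() {
  paddle::framework::LoD lod;
  lod.push_back(std::vector<size_t>({0, 3, 4, 9}));  // lengths: 3, 1, 5

  paddle::framework::LoDRankTable table;
  table.Reset(lod, /*level=*/0);
  // table.items() is now ordered by length, descending:
  //   {index: 2, length: 5}, {index: 0, length: 3}, {index: 1, length: 1}
}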

paddle/framework/ddim.cc

Lines changed: 1 addition & 1 deletion

@@ -117,7 +117,7 @@ int64_t DDim::operator[](int idx) const {
   return boost::apply_visitor(DynamicConstIndexer(idx), var);
 }
 
-int64_t DDim::size() const { return arity(*this); }
+int DDim::size() const { return arity(*this); }
 
 bool DDim::operator==(DDim d) const {
   if (var.which() != d.getVar().which()) {

paddle/framework/ddim.h

Lines changed: 1 addition & 1 deletion

@@ -71,7 +71,7 @@ struct DDim {
 
   DDim operator*(DDim d) const;
 
-  int64_t size() const;
+  int size() const;
 };
 
 /**
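
The return type narrows from int64_t to int because DDim::size() reports the arity (the number of dimensions), not an element count. A minimal sketch of the distinction, assuming the existing make_ddim and product helpers in ddim.h (the shape is illustrative):

#include "paddle/framework/ddim.h"

// Sketch: size() is the rank; product() is the total element count.
void ddim_size_sketch() {
  paddle::framework::DDim d = paddle::framework::make_ddim({2, 3, 4});
  int rank = d.size();                            // 3 dimensions
  int64_t numel = paddle::framework::product(d);  // 2 * 3 * 4 = 24 elements
}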

paddle/framework/lod_rank_table.cc

Lines changed: 1 addition & 0 deletions

@@ -31,6 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
     TableItem item;
     item.index = i;
     item.length = vec[i + 1] - vec[i];
+    VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
     items_.emplace_back(item);
   }
   // NOTE(yuyang18):

paddle/framework/lod_tensor.cc

Lines changed: 31 additions & 19 deletions

@@ -27,6 +27,20 @@
 namespace paddle {
 namespace framework {
 
+std::ostream& operator<<(std::ostream& os, const LoD& lod) {
+  os << "{";
+  for (auto& v : lod) {
+    os << "{";
+    for (auto& i : v) {
+      os << i << ",";
+    }
+    os << "}";
+  }
+  os << "}";
+
+  return os;
+}
+
 LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
   LoD new_lod;
   new_lod.reserve(level_end - level_begin);
@@ -136,37 +150,35 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
   ShareDataWith(Slice(begin, end));
 }
 
-void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
-                             std::vector<std::vector<size_t>>* lod_length,
-                             size_t* start_offset) {
-  lod_length->clear();
-  PADDLE_ENFORCE(start_idx < lod.size() - 1,
-                 "start_idx should be >= 0 and < lod.size() - 1.");
-  PADDLE_ENFORCE(end_idx < lod.size(),
-                 "end_idx should be >= 0 and < lod.size().");
-  PADDLE_ENFORCE_LE(start_idx, end_idx,
-                    "start_idx should be less than end_idx.");
-  for (size_t level_idx = 0; level_idx < lod.size(); ++level_idx) {
+using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
+LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
+                                        size_t end_idx, size_t start_level) {
+  LoD sub_lod;
+
+  for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
+    PADDLE_ENFORCE_LE(start_idx, end_idx);
+    PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size());
     std::vector<size_t> level_lens;
     for (size_t i = start_idx; i < end_idx; ++i) {
       level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
     }
-    lod_length->emplace_back(level_lens);
+    sub_lod.emplace_back(level_lens);
    start_idx = lod[level_idx][start_idx];
    end_idx = lod[level_idx][end_idx];
  }
-  *start_offset = start_idx;
+
+  return LoDAndOffset{sub_lod, {start_idx, end_idx}};
 }
 
-void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length) {
-  PADDLE_ENFORCE_EQ(
-      lod->size(), lod_length.size(),
+void AppendLoD(LoD* lod, const LoD& lod_length) {
+  PADDLE_ENFORCE(
+      lod->empty() || lod->size() == lod_length.size(),
       "The lod_length should has the same size with the appended lod.");
+  if (lod->empty()) {
+    *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
+  }
   for (size_t i = 0; i < lod->size(); ++i) {
     auto& level = (*lod)[i];
-    if (level.empty()) {
-      level.push_back(0);
-    }
     for (size_t len : lod_length[i]) {
       level.push_back(level.back() + len);
     }
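
The new pair of functions replaces the old output-parameter style: GetSubLoDAndAbsoluteOffset walks from start_level down to the lowest level, collecting per-level lengths, and returns the sub-LoD together with the absolute [start, end) offsets in one value. A minimal usage sketch, assuming only the two functions above (the LoD values are illustrative):

#include "paddle/framework/lod_tensor.h"

// Sketch: slice sequence [1, 2) out of a two-level LoD.
void sub_lod_sketch() {
  using namespace paddle::framework;
  LoD lod;
  lod.push_back(std::vector<size_t>({0, 2, 4}));         // 2 top-level sequences
  lod.push_back(std::vector<size_t>({0, 3, 5, 9, 10}));  // sub-sequence offsets

  auto result = GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0);
  LoD sub_lod = result.first;          // {{2}, {4, 1}}
  size_t start = result.second.first;  // 5: absolute begin offset
  size_t end = result.second.second;   // 10: absolute end offset

  // AppendLoD seeds an empty LoD with zeros before accumulating lengths.
  LoD out;
  AppendLoD(&out, sub_lod);  // out == {{0, 2}, {0, 4, 5}}
}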

paddle/framework/lod_tensor.h

Lines changed: 5 additions & 4 deletions

@@ -56,6 +56,8 @@ using Vector = thrust::host_vector<
  */
 using LoD = std::vector<Vector<size_t>>;
 
+std::ostream& operator<<(std::ostream& os, const LoD& lod);
+
 /*
  * Slice levels from a LoD.
  * NOTE the lowest level should always be the absolute offsets of the underlying
@@ -181,11 +183,10 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
   return tensor;
 }
 
-void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
-                             std::vector<std::vector<size_t>>* lod_length,
-                             size_t* start_offset);
+std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
+    const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);
 
-void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length);
+void AppendLoD(LoD* lod, const LoD& lod_length);
 
 }  // namespace framework
 }  // namespace paddle
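
The streaming operator declared here prints a LoD as nested, comma-terminated braces with no separator between levels. A quick sketch of the expected output (the values are illustrative):

#include <iostream>
#include "paddle/framework/lod_tensor.h"

void print_lod_sketch() {
  paddle::framework::LoD lod;
  lod.push_back(std::vector<size_t>({0, 2, 4}));
  lod.push_back(std::vector<size_t>({0, 1, 3, 5, 7}));
  std::cout << lod << "\n";  // prints: {{0,2,4,}{0,1,3,5,7,}}
}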

paddle/framework/lod_tensor_test.cc

Lines changed: 20 additions & 19 deletions

@@ -146,43 +146,44 @@ TEST(LodExpand, test) {
 
 TEST(LoD, GetFineGrainedLoDLength) {
   LoD lod;
-  lod.push_back(std::vector<size_t>{0, 2, 4, 5});
-  lod.push_back(std::vector<size_t>{0, 1, 6, 8, 10, 11});
+  lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
   lod.push_back(
-      std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29});
+      std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29}));
 
-  std::vector<std::vector<size_t>> lod_length;
-  size_t start_offset;
-  paddle::framework::GetFineGrainedLoDLength(lod, 1, 2, &lod_length,
-                                             &start_offset);
+  auto lod_and_offset =
+      paddle::framework::GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0);
+  LoD lod_length = lod_and_offset.first;
+  size_t start_offset = lod_and_offset.second.first;
+  size_t end_offset = lod_and_offset.second.second;
 
-  std::vector<std::vector<size_t>> expected;
+  LoD expected;
   expected.push_back(std::vector<size_t>{2});
   expected.push_back(std::vector<size_t>{2, 2});
   expected.push_back(std::vector<size_t>{2, 3, 4, 2});
   EXPECT_EQ(lod_length, expected);
   EXPECT_EQ(start_offset, 15UL);
+  EXPECT_EQ(end_offset, 26UL);
 }
 
 TEST(LoD, AppendLoD) {
-  std::vector<std::vector<size_t>> lod_lens;
-  lod_lens.push_back(std::vector<size_t>{2});
-  lod_lens.push_back(std::vector<size_t>{2, 2});
-  lod_lens.push_back(std::vector<size_t>{2, 3, 4, 2});
+  LoD lod_lens;
+  lod_lens.push_back(std::vector<size_t>({2}));
+  lod_lens.push_back(std::vector<size_t>({2, 2}));
+  lod_lens.push_back(std::vector<size_t>({2, 3, 4, 2}));
 
   LoD origin;
-  origin.push_back(std::vector<size_t>{0, 2});
-  origin.push_back(std::vector<size_t>{0, 1, 6});
-  origin.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15});
+  origin.push_back(std::vector<size_t>({0, 2}));
+  origin.push_back(std::vector<size_t>({0, 1, 6}));
+  origin.push_back(std::vector<size_t>({0, 2, 5, 7, 10, 12, 15}));
 
   paddle::framework::AppendLoD(&origin, lod_lens);
 
   LoD expected;
-  expected.push_back(std::vector<size_t>{0, 2, 4});
-  expected.push_back(std::vector<size_t>{0, 1, 6, 8, 10});
+  expected.push_back(std::vector<size_t>({0, 2, 4}));
+  expected.push_back(std::vector<size_t>({0, 1, 6, 8, 10}));
   expected.push_back(
-      std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26});
-
+      std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26}));
   EXPECT_EQ(origin, expected);
 }
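
For reference, a worked trace of GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0) in the first test: at level 0, the slice [1, 2) has length 4 - 2 = 2 and maps to index range [2, 4); at level 1, [2, 4) yields lengths {8 - 6, 10 - 8} = {2, 2} and maps to [6, 10); at level 2, [6, 10) yields {17 - 15, 20 - 17, 24 - 20, 26 - 24} = {2, 3, 4, 2}, with absolute offsets [15, 26). These are exactly the values the test asserts.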

paddle/framework/var_desc.cc

Lines changed: 4 additions & 2 deletions

@@ -45,7 +45,8 @@ void VarDescBind::SetLoDLevel(int32_t lod_level) {
       desc_.mutable_tensor_array()->set_lod_level(lod_level);
       break;
     default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
+                   desc_.tensor_array().lod_level());
   }
 }
 
@@ -56,7 +57,8 @@ int32_t VarDescBind::GetLodLevel() const {
     case VarDesc::LOD_TENSOR_ARRAY:
       return desc_.tensor_array().lod_level();
     default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
+                   desc_.tensor_array().lod_level());
   }
 }

paddle/operators/CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -170,6 +170,8 @@ set(DEPS_OPS
   sequence_conv_op
   sequence_pool_op
   lod_rank_table_op
+  lod_tensor_to_array_op
+  array_to_lod_tensor_op
   lstm_op
   tensor_array_read_write_op
   gru_op)
@@ -182,6 +184,8 @@ op_library(sum_op DEPS net_op selected_rows_functor)
 op_library(pool_op DEPS pooling)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
+op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
+op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op)
 op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
 if(WITH_GPU)
 op_library(nccl_op DEPS nccl_common)
paddle/operators/array_to_lod_tensor_op.cc

Lines changed: 152 additions & 0 deletions

@@ -0,0 +1,152 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <numeric>
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using LoD = framework::LoD;
+
+class ArrayToLoDTensorOp : public framework::OperatorBase {
+ public:
+  ArrayToLoDTensorOp(const std::string &type,
+                     const framework::VariableNameMap &inputs,
+                     const framework::VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
+    auto &rank_table =
+        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+
+    // Check dims, place and data type of input's elements and infer output's
+    // dim
+    PADDLE_ENFORCE(!x.empty(), "There's no element in the input array.");
+    int rank = x[0].dims().size();
+    platform::Place place = x[0].place();
+    std::type_index data_type = x[0].type();
+    framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank);
+    int64_t batch_size = x[0].dims()[0];
+    for (size_t i = 1; i < x.size(); ++i) {
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims,
+                        "The dimension of the %zu'th element in LoDTensorArray "
+                        "differs from previous ones.",
+                        i);
+      PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place),
+                     "The place class of the %zu'th element in LoDTensorArray "
+                     "differs from previous ones.",
+                     i);
+      PADDLE_ENFORCE(x[i].type() == data_type,
+                     "The data type of the %zu'th element in LoDTensorArray "
+                     "differs from previous ones.",
+                     i);
+      batch_size += x[i].dims()[0];
+    }
+    auto ins_dim_vec = framework::vectorize(ins_dims);
+    ins_dim_vec.insert(ins_dim_vec.begin(), batch_size);
+    framework::DDim out_dims = framework::make_ddim(ins_dim_vec);
+    out->Resize(out_dims);
+    out->mutable_data(place, data_type);
+
+    auto &table_items = rank_table.items();
+    std::vector<size_t> table_item_idx(table_items.size());
+    // table_item_idx = range(table_items.size())
+    std::iota(table_item_idx.begin(), table_item_idx.end(), 0);
+    std::sort(table_item_idx.begin(), table_item_idx.end(),
+              [&](size_t a, size_t b) {
+                return table_items[a].index < table_items[b].index;
+              });
+
+    // Build LoDTensor `out`
+    framework::LoD *out_lod = out->mutable_lod();
+    out_lod->clear();
+    size_t out_offset = 0;
+    auto prefix_lod = rank_table.coarse_lod();
+    prefix_lod.emplace_back();
+    auto &cur_level_lod = prefix_lod.back();
+    cur_level_lod.push_back(0);
+    for (size_t idx : table_item_idx) {
+      cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length);
+      for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) {
+        auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+            x[x_idx].lod(), idx, idx + 1, 0);
+
+        auto &lod_length = lod_and_offset.first;
+        framework::AppendLoD(out_lod, lod_length);
+
+        size_t start_offset = lod_and_offset.second.first;
+        size_t end_offset = lod_and_offset.second.second;
+        VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " ["
+                 << start_offset << ", " << end_offset << "]";
+        // Copy data
+        PADDLE_ENFORCE_GE(end_offset, start_offset);
+        size_t len = end_offset - start_offset;
+        if (len == 0) {
+          continue;
+        }
+        out->Slice(out_offset, out_offset + len)
+            .CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx);
+        out_offset += len;
+      }
+    }
+    out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end());
+  }
+};
+
+class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ArrayToLoDTensorOpProtoMaker(framework::OpProto *proto,
+                               framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(std::vector<LoDTensor>) A vector of tensors that is going to "
+             "be cast to a big LoDTensor.");
+    AddInput("RankTable",
+             "(LoDRankTable) RankTable provides the coarse lod information to "
+             "build the output LoDTensor. See "
+             "'paddle/framework/lod_rank_table.h' for more details.");
+    AddOutput("Out", "(LoDTensor) The LoDTensor formed by the input tensor array.");
+    AddComment(
+        R"DOC(This op builds a big LoDTensor from a std::vector<LoDTensor>
+and a LoDRankTable. It is supposed to be used to get a dynamic RNN's
+outputs back into a normal LoDTensor. The std::vector<LoDTensor>
+would be the output of the RNN op and the LoDRankTable would be built
+with the RNN's input.)DOC");
+  }
+};
+
+class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "ArrayToLoDTensorOp must have input X.");
+    PADDLE_ENFORCE(context->HasInput("RankTable"),
+                   "ArrayToLoDTensorOp must have input RankTable.");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp,
+                  ops::ArrayToLoDTensorOpProtoMaker,
+                  ops::ArrayToLoDTensorInferShape);
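
A note on the index handling above: the rank table lists sequences sorted by length (descending), so sorting positions by each item's original index recovers the input order before concatenation. A standalone sketch of that idiom, with Item standing in for LoDRankTable::TableItem:

#include <algorithm>
#include <numeric>
#include <vector>

struct Item {
  size_t index;   // original position of the sequence
  size_t length;  // number of steps in the sequence
};

// Returns positions into `items` arranged so that original order is restored.
std::vector<size_t> RestoreOrder(const std::vector<Item> &items) {
  std::vector<size_t> idx(items.size());
  std::iota(idx.begin(), idx.end(), 0);  // idx = 0, 1, ..., n - 1
  std::sort(idx.begin(), idx.end(),
            [&](size_t a, size_t b) { return items[a].index < items[b].index; });
  return idx;
}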

paddle/operators/lod_rank_table_op.cc

Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@ class LoDRankTableOp : public framework::OperatorBase {
     auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto *out =
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
+    VLOG(10) << "Level = " << static_cast<size_t>(Attr<int>("level"));
     out->Reset(x.lod(), static_cast<size_t>(Attr<int>("level")));
   }
 };
