Skip to content

Commit 155ebbb

Browse files
authored
Merge pull request #13449 from chengduoZH/speed_up_lod_tensor_to_array
Speed up lod_tensor to array and array to lod_tensor
2 parents 93456fc + 7b464d6 commit 155ebbb

File tree

3 files changed

+131
-25
lines changed

3 files changed

+131
-25
lines changed

paddle/fluid/operators/array_to_lod_tensor_op.cc

Lines changed: 57 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
1111
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
See the License for the specific language governing permissions and
1313
limitations under the License. */
14+
#include <paddle/fluid/operators/math/concat.h>
1415
#include <numeric>
1516

1617
#include "paddle/fluid/framework/lod_rank_table.h"
@@ -24,6 +25,50 @@ namespace operators {
2425

2526
using LoD = framework::LoD;
2627

28+
// Forward declaration of the place-level visitor defined below. The tag is
// `struct` so that it matches the definition; declaring it as `class` while
// defining it as `struct` triggers -Wmismatched-tags on Clang and warning
// C4099 on MSVC.
struct ArrayToLoDFunctor;

// Data-type-level stage of the array -> LoDTensor concatenation.
//
// ArrayToLoDFunctor::Apply fills in both members and passes the object to
// framework::VisitDataType (presumably that helper calls apply<T>() with the
// element type selected at run time -- confirm against its definition).
template <typename DeviceContext>
struct ArrayToLoDFunctorImpl {
  // The ArrayToLoDFunctor that created this object; supplies `in` and `out`.
  const ArrayToLoDFunctor *prev_functor_;
  // Device context that is forwarded to the concat functor in apply<T>().
  DeviceContext *dev_ctx_;

  // Concatenates prev_functor_->in into prev_functor_->out for element type T.
  // Defined out of line, after ArrayToLoDFunctor is a complete type.
  template <typename T>
  void apply();
};
37+
38+
// Place-level visitor that concatenates the tensors collected in `in` into
// `*out`. It is applied through platform::VisitPlace, which is expected to
// call operator() with the concrete place type.
struct ArrayToLoDFunctor : public boost::static_visitor<void> {
  // Input slices, in the order they must appear in the output.
  std::vector<framework::Tensor> in;
  // Destination tensor; must be set by the caller before visiting.
  mutable framework::Tensor *out;

  // Picks the device context that matches `place` and forwards to Apply().
  // NOTE(review): every non-CPU place is treated as a CUDA place here --
  // confirm that no other place kind can reach this visitor.
  template <typename Place>
  void operator()(Place place) const {
    auto &ctx_pool = platform::DeviceContextPool::Instance();
    if (std::is_same<Place, platform::CPUPlace>::value) {
      Apply(static_cast<platform::CPUDeviceContext *>(ctx_pool.Get(place)));
      return;
    }
#ifdef PADDLE_WITH_CUDA
    Apply(static_cast<platform::CUDADeviceContext *>(ctx_pool.Get(place)));
#else
    PADDLE_THROW("Fluid is not compiled with CUDA");
#endif
  }

  // Builds the data-type-level functor for `dev_ctx` and dispatches on the
  // element type of the output tensor.
  template <typename DeviceContext>
  void Apply(DeviceContext *dev_ctx) const {
    ArrayToLoDFunctorImpl<DeviceContext> typed_impl;
    typed_impl.dev_ctx_ = dev_ctx;
    typed_impl.prev_functor_ = this;
    auto dtype = framework::ToDataType(out->type());
    framework::VisitDataType(dtype, typed_impl);
  }
};
64+
65+
template <typename DeviceContext>
66+
template <typename T>
67+
void ArrayToLoDFunctorImpl<DeviceContext>::apply() {
68+
math::ConcatFunctor<DeviceContext, T> func;
69+
func(*dev_ctx_, prev_functor_->in, 0, prev_functor_->out);
70+
}
71+
2772
class ArrayToLoDTensorOp : public framework::OperatorBase {
2873
public:
2974
ArrayToLoDTensorOp(const std::string &type,
@@ -47,14 +92,18 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
4792
int rank = x[0].dims().size();
4893
platform::Place place = x[0].place();
4994
std::type_index data_type = x[0].type();
50-
framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank);
5195
int64_t batch_size = x[0].dims()[0];
96+
framework::DDim ins_dims = rank > 1
97+
? framework::slice_ddim(x[0].dims(), 1, rank)
98+
: framework::make_ddim({0});
5299
for (size_t i = 1; i < x.size(); ++i) {
53-
PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims,
100+
auto ins_i_dims = rank > 1 ? framework::slice_ddim(x[i].dims(), 1, rank)
101+
: framework::make_ddim({0});
102+
PADDLE_ENFORCE_EQ(ins_i_dims, ins_dims,
54103
"The dimension of the %zu'th element in LoDTensorArray "
55104
"differs from previous ones.",
56105
i);
57-
PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place),
106+
PADDLE_ENFORCE(x[i].place() == place,
58107
"The place class of the %zu'th element in LoDTensorArray "
59108
"differs from previous ones.",
60109
i);
@@ -82,13 +131,14 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
82131
// Build LoDTensor `out`
83132
framework::LoD *out_lod = out->mutable_lod();
84133
out_lod->clear();
85-
size_t out_offset = 0;
86134
auto prefix_lod = rank_table.coarse_lod();
87135
prefix_lod.emplace_back();
88136
auto &cur_level_lod = prefix_lod.back();
89137
cur_level_lod.push_back(0);
138+
ArrayToLoDFunctor functor;
90139
for (size_t idx : table_item_idx) {
91140
cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length);
141+
PADDLE_ENFORCE_LE(table_items[idx].length, x.size());
92142
for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) {
93143
auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
94144
x[x_idx].lod(), idx, idx + 1, 0);
@@ -106,17 +156,11 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
106156
if (len == 0) {
107157
continue;
108158
}
109-
auto slice = out->Slice(out_offset, out_offset + len);
110-
111-
platform::DeviceContextPool &pool =
112-
platform::DeviceContextPool::Instance();
113-
auto &dev_ctx = *pool.Get(place);
114-
115-
framework::TensorCopy(x[x_idx].Slice(start_offset, end_offset), place,
116-
dev_ctx, &slice);
117-
out_offset += len;
159+
functor.in.emplace_back(x[x_idx].Slice(start_offset, end_offset));
118160
}
119161
}
162+
functor.out = out;
163+
platform::VisitPlace(place, functor);
120164
out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end());
121165
}
122166
};

paddle/fluid/operators/cross_entropy_op.h

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -86,10 +86,10 @@ class XeGradFunctor {
8686
auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id];
8787
for (size_t x_offset = sample_id * num_classes_;
8888
x_offset < (sample_id + 1) * num_classes_; ++x_offset) {
89-
dx_[x_offset] =
90-
(x_offset != x_is_true_offset || label_[sample_id] == ignore_index_)
91-
? static_cast<T>(0)
92-
: -dy_[sample_id] / x_[x_offset];
89+
dx_[x_offset] = (x_offset != x_is_true_offset ||
90+
label_[sample_id] == static_cast<int64_t>(ignore_index_))
91+
? static_cast<T>(0)
92+
: -dy_[sample_id] / x_[x_offset];
9393
}
9494
}
9595

paddle/fluid/operators/lod_tensor_to_array_op.cc

Lines changed: 70 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -11,10 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
1111
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
See the License for the specific language governing permissions and
1313
limitations under the License. */
14+
#include <algorithm>
15+
#include <map>
1416
#include "paddle/fluid/framework/lod_rank_table.h"
1517
#include "paddle/fluid/framework/lod_tensor_array.h"
1618
#include "paddle/fluid/framework/op_registry.h"
1719
#include "paddle/fluid/operators/detail/safe_ref.h"
20+
#include "paddle/fluid/operators/math/concat.h"
1821
#include "paddle/fluid/platform/device_context.h"
1922
#include "paddle/fluid/platform/port.h"
2023

@@ -26,6 +29,61 @@ struct CopyRange {
2629
size_t end;
2730
};
2831

32+
// Place-level visitor; defined further down in this file.
struct LoDTensorToArrayFunctor;

// Data-type-level stage of the LoDTensor -> array split. Both members are
// filled in by LoDTensorToArrayFunctor::Apply before the object is passed to
// framework::VisitDataType.
template <typename DeviceContext>
struct LoDTensorToArrayFunctorImpl {
  // The functor that created this object; supplies the input and outputs.
  const LoDTensorToArrayFunctor *prev_functor_;
  // Device context that is forwarded to the split functor in apply<T>().
  DeviceContext *dev_ctx_;

  // Splits the input into the registered outputs for element type T.
  template <typename T> void apply();
};
41+
42+
// Place-level visitor that splits `input_` into the tensors registered via
// AddOutput(). It is applied through platform::VisitPlace.
struct LoDTensorToArrayFunctor : public boost::static_visitor<void> {
  // Read-only views of the outputs, passed to the split functor as the
  // reference inputs.
  std::vector<const framework::Tensor *> ref_inputs_;
  // Destination tensors, in registration order.
  mutable std::vector<framework::Tensor *> outputs_;
  // Source tensor; only a reference is kept, so it must outlive this functor.
  const framework::Tensor &input_;

  explicit LoDTensorToArrayFunctor(const framework::Tensor &input)
      : input_(input) {}

  // Registers `t` as the next output; the same pointer is recorded in both
  // lists so that they stay index-aligned.
  void AddOutput(framework::Tensor *t) {
    outputs_.push_back(t);
    ref_inputs_.push_back(t);
  }

  // Looks up the device context for `place` and forwards to Apply() with the
  // matching concrete context type.
  template <typename Place>
  void operator()(Place place) const {
    auto &ctx_pool = platform::DeviceContextPool::Instance();
    auto *raw_ctx = ctx_pool.Get(place);
    if (std::is_same<Place, platform::CPUPlace>::value) {
      Apply(static_cast<platform::CPUDeviceContext *>(raw_ctx));
      return;
    }
#ifdef PADDLE_WITH_CUDA
    Apply(static_cast<platform::CUDADeviceContext *>(raw_ctx));
#else
    PADDLE_THROW("Not compiled with cuda");
#endif
  }

  // Builds the data-type-level functor for `dev_ctx` and dispatches on the
  // element type of the input tensor.
  template <typename DeviceContext>
  void Apply(DeviceContext *dev_ctx) const {
    LoDTensorToArrayFunctorImpl<DeviceContext> typed_impl;
    typed_impl.dev_ctx_ = dev_ctx;
    typed_impl.prev_functor_ = this;
    auto dtype = framework::ToDataType(input_.type());
    framework::VisitDataType(dtype, typed_impl);
  }
};
78+
79+
template <typename DeviceContext>
80+
template <typename T>
81+
void LoDTensorToArrayFunctorImpl<DeviceContext>::apply() {
82+
math::ConcatGradFunctor<DeviceContext, T> func;
83+
func(*dev_ctx_, prev_functor_->input_, prev_functor_->ref_inputs_, 0,
84+
&prev_functor_->outputs_);
85+
}
86+
2987
class LoDTensorToArrayOp : public framework::OperatorBase {
3088
public:
3189
LoDTensorToArrayOp(const std::string &type,
@@ -72,6 +130,11 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
72130
copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
73131
}
74132
}
133+
134+
auto &outputs = *const_cast<framework::Scope &>(scope)
135+
.Var()
136+
->GetMutable<std::map<size_t, framework::Tensor>>();
137+
75138
for (size_t i = 0; i < max_seq_len; ++i) {
76139
auto &ranges = copy_ranges[i];
77140
size_t height = std::accumulate(
@@ -90,17 +153,16 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
90153
// out[i][offset: offset+len] = x[each_range.begin: each_range.end]
91154
auto slice = out[i].Slice(static_cast<int>(offset),
92155
static_cast<int>(offset + len));
93-
94-
platform::DeviceContextPool &pool =
95-
platform::DeviceContextPool::Instance();
96-
auto &dev_ctx = *pool.Get(place);
97-
98-
framework::TensorCopy(x.Slice(static_cast<int>(each_range.begin),
99-
static_cast<int>(each_range.end)),
100-
x.place(), dev_ctx, &slice);
156+
outputs.insert({each_range.begin, slice});
101157
offset += len;
102158
}
103159
}
160+
161+
LoDTensorToArrayFunctor functor(x);
162+
for (auto &out_pair : outputs) {
163+
functor.AddOutput(&out_pair.second);
164+
}
165+
platform::VisitPlace(place, functor);
104166
}
105167
};
106168

0 commit comments

Comments
 (0)