@@ -26,53 +26,63 @@ class SequenceReshapeKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
     auto* out = context.Output<LoDTensor>("Out");
-    int out_width = context.Attr<int>("dimension");
-    bool whether_padding = context.Attr<bool>("whether_padding");
+    int out_width = context.Attr<int>("new_dim");

     const T* p_in_data = in->data<T>();
-    T* p_out_data = out->mutable_data<T>(context.GetPlace());

-    // compute shape for output
     auto in_dims = in->dims();
     int64_t in_width = in_dims[1];
     auto& in_lod = in->lod();

     PADDLE_ENFORCE_EQ(in_lod.size(), 1UL,
                       "Only support one level sequence now.");
-    PADDLE_ENFORCE_GE(
-        in_dims[0],
-        /* batch size = */ static_cast<int64_t>(in_lod[0].size() - 1),
-        "The 1st dimension of Input(X) must be equal or larger than batch "
-        "size.");
+    PADDLE_ENFORCE_EQ(
+        in_dims[0], in_lod[0].back(),
+        "Inconsistent size between X.shape[0] and X.lod()[0].back().");

     auto in_lod_l0 = in_lod[0];
     int seq_num = in_lod_l0.size() - 1;

     auto& out_lod = *out->mutable_lod();
-    out_lod.push_back(std::vector<size_t>({0}));
-    size_t offset = 0;
+    out_lod.resize(1);
+    out_lod[0].clear();
+    out_lod[0].push_back(0);
     for (int i = 0; i < seq_num; ++i) {
       size_t seq_len = in_lod_l0[i + 1] - in_lod_l0[i];
-      if (whether_padding) {
-        offset += std::ceil((float)(seq_len * in_width) / out_width);
-      } else {
-        offset += (seq_len * in_width) / out_width;
-      }
-      out_lod[0].push_back(offset);
+      size_t offset = 0;
+      offset = (seq_len * in_width) / out_width;
+      PADDLE_ENFORCE_EQ(offset * out_width, seq_len * in_width,
+                        "Please make sure (sequence_length * dimension) can be "
+                        "divided by new_dim with no remainder for each "
+                        "sequence. The %dth sequence is invalid.",
+                        i + 1);
+      PADDLE_ENFORCE_GT(offset, 0,
+                        "Illegal operation, length of the %dth sequence become "
+                        "to 0 after reshaped.",
+                        i + 1);
+      out_lod[0].push_back(out_lod[0].back() + offset);
     }

-    out->Resize({{static_cast<int64_t>(out_lod[0].back()), out_width}});
+    out->mutable_data<T>(context.GetPlace());
+    out->Resize({static_cast<int64_t>(out_lod[0].back()), out_width});
+    T* p_out_data = out->mutable_data<T>(context.GetPlace());
     math::set_constant(context.device_context(), out, 0.0f);

     for (int i = 0; i < seq_num; ++i) {
       size_t in_offset = in_lod_l0[i] * in_width;
       size_t out_offset = out_lod[0][i] * out_width;
-      size_t bytes = sizeof(T) * (in_lod_l0[i + 1] - in_lod_l0[i]) * in_width;
+      size_t in_count = (in_lod_l0[i + 1] - in_lod_l0[i]) * in_width;
+      size_t out_count = (out_lod[0][i + 1] - out_lod[0][i]) * out_width;
+      size_t bytes = sizeof(T) * std::min(in_count, out_count);
       if (platform::is_cpu_place(context.GetPlace())) {
-        std::memcpy(p_out_data + out_offset, p_in_data + in_offset, bytes);
+        memory::Copy(boost::get<platform::CPUPlace>(context.GetPlace()),
+                     p_out_data + out_offset,
+                     boost::get<platform::CPUPlace>(context.GetPlace()),
+                     p_in_data + in_offset, bytes);
       } else {
 #ifdef PADDLE_WITH_CUDA
-        auto& dev_ctx = context.template device_context<DeviceContext>();
+        auto& dev_ctx =
+            context.template device_context<platform::CUDADeviceContext>();
         memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
                      p_out_data + out_offset,
                      boost::get<platform::CUDAPlace>(context.GetPlace()),
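
For reference, the reworked forward kernel derives the output LoD purely from each sequence's flattened size: a sequence of seq_len rows and in_width columns is reflowed into rows of new_dim elements, and the op is rejected unless seq_len * in_width is an exact, non-zero multiple of new_dim. Below is a minimal standalone C++ sketch of that LoD recomputation (not part of the diff; plain STL containers and a hypothetical ComputeOutLod helper, not Paddle's actual LoD types):

#include <cstdint>
#include <stdexcept>
#include <vector>

// Sketch only: recompute the level-0 LoD offsets after reshaping each
// sequence from in_width columns to new_dim columns, mirroring the checks
// in the forward kernel above.
std::vector<size_t> ComputeOutLod(const std::vector<size_t>& in_lod_l0,
                                  int64_t in_width, int64_t new_dim) {
  std::vector<size_t> out_lod = {0};
  for (size_t i = 0; i + 1 < in_lod_l0.size(); ++i) {
    size_t seq_len = in_lod_l0[i + 1] - in_lod_l0[i];
    size_t offset = (seq_len * in_width) / new_dim;
    if (offset * new_dim != seq_len * in_width) {
      throw std::invalid_argument(
          "sequence_length * dimension must be divisible by new_dim");
    }
    if (offset == 0) {
      throw std::invalid_argument("sequence would become empty after reshape");
    }
    out_lod.push_back(out_lod.back() + offset);  // cumulative row offsets
  }
  return out_lod;
}

// Example: two sequences of 2 and 3 timesteps with width 6, reshaped to
// new_dim = 3, become sequences of 4 and 6 rows: out_lod == {0, 4, 10}.
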
@@ -103,16 +113,23 @@ class SequenceReshapeGradKernel : public framework::OpKernel<T> {
     auto& out_lod = out_tensor_ptr->lod();
     int out_width = out_tensor_ptr->dims()[1];

+    math::set_constant(context.device_context(), x_grad_tensor_ptr, 0.0f);
+
     for (int i = 0; i < seq_num; ++i) {
       size_t src_offset = out_lod[0][i] * out_width;
       size_t dst_offset = x_lod[0][i] * x_width;
-      size_t bytes = sizeof(T) * (x_lod[0][i + 1] - x_lod[0][i]) * x_width;
+      size_t src_count = (out_lod[0][i + 1] - out_lod[0][i]) * out_width;
+      size_t dst_count = (x_lod[0][i + 1] - x_lod[0][i]) * x_width;
+      size_t bytes = sizeof(T) * std::min(src_count, dst_count);
       if (platform::is_cpu_place(context.GetPlace())) {
-        std::memcpy(p_x_grad_data + dst_offset, p_out_grad_data + src_offset,
-                    bytes);
+        memory::Copy(boost::get<platform::CPUPlace>(context.GetPlace()),
+                     p_x_grad_data + dst_offset,
+                     boost::get<platform::CPUPlace>(context.GetPlace()),
+                     p_out_grad_data + src_offset, bytes);
       } else {
 #ifdef PADDLE_WITH_CUDA
-        auto& dev_ctx = context.template device_context<DeviceContext>();
+        auto& dev_ctx =
+            context.template device_context<platform::CUDADeviceContext>();
         memory::Copy(boost::get<platform::CUDAPlace>(context.GetPlace()),
                      p_x_grad_data + dst_offset,
                      boost::get<platform::CUDAPlace>(context.GetPlace()),
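
The gradient kernel applies the same per-sequence mapping in reverse: X@GRAD is zero-filled first, then each sequence's block of Out@GRAD is copied back, with the copy size clamped to the smaller of the source and destination extents. A rough CPU-only illustration (not part of the diff), using plain arrays and std::memcpy in place of Paddle tensors and memory::Copy, with CopySeqGrad as a hypothetical name:

#include <algorithm>
#include <cstring>
#include <vector>

// Sketch only: scatter the output gradient back into the input gradient,
// sequence by sequence, mimicking the guarded copy size above.
void CopySeqGrad(const std::vector<size_t>& x_lod, size_t x_width,
                 const std::vector<size_t>& out_lod, size_t out_width,
                 const float* out_grad, float* x_grad) {
  std::fill(x_grad, x_grad + x_lod.back() * x_width, 0.0f);  // start from zero
  for (size_t i = 0; i + 1 < x_lod.size(); ++i) {
    size_t src_count = (out_lod[i + 1] - out_lod[i]) * out_width;
    size_t dst_count = (x_lod[i + 1] - x_lod[i]) * x_width;
    // The two counts are equal whenever the forward divisibility check
    // passed; taking the minimum is a defensive guard against over-reads.
    size_t count = std::min(src_count, dst_count);
    std::memcpy(x_grad + x_lod[i] * x_width,
                out_grad + out_lod[i] * out_width, count * sizeof(float));
  }
}
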