@@ -75,15 +75,37 @@ class SeqExpandKernel : public framework::OpKernel<T> {
75
75
T* out_data = out->mutable_data <T>(context.GetPlace ());
76
76
77
77
// copy data
78
- Place place = boost::get<Place>( context.GetPlace () );
78
+ auto place = context.GetPlace ();
79
79
size_t count = 0 ;
80
- for (size_t i = 0 ; i < scales.size (); ++i) {
81
- count = element_len * (x_lod[0 ][i + 1 ] - x_lod[0 ][i]);
82
- for (size_t j = 0 ; j < scales[i]; ++j) {
83
- memory::Copy (place, out_data, place, x_data, sizeof (T) * count);
84
- out_data += count;
80
+ if (platform::is_cpu_place (place)) {
81
+ auto & cpu_place = boost::get<platform::CPUPlace>(place);
82
+ for (size_t i = 0 ; i < scales.size (); ++i) {
83
+ count = element_len * (x_lod[0 ][i + 1 ] - x_lod[0 ][i]);
84
+ for (size_t j = 0 ; j < scales[i]; ++j) {
85
+ memory::Copy (cpu_place, out_data, cpu_place, x_data,
86
+ sizeof (T) * count);
87
+ out_data += count;
88
+ }
89
+ x_data += count;
85
90
}
86
- x_data += count;
91
+ } else {
92
+ #ifdef PADDLE_WITH_CUDA
93
+ auto & gpu_place = boost::get<platform::GPUPlace>(place);
94
+ auto stream = reinterpret_cast <const platform::CUDADeviceContext&>(
95
+ context.device_context ())
96
+ .stream ();
97
+ for (size_t i = 0 ; i < scales.size (); ++i) {
98
+ count = element_len * (x_lod[0 ][i + 1 ] - x_lod[0 ][i]);
99
+ for (size_t j = 0 ; j < scales[i]; ++j) {
100
+ memory::Copy (gpu_place, out_data, gpu_place, x_data,
101
+ sizeof (T) * count, stream);
102
+ out_data += count;
103
+ }
104
+ x_data += count;
105
+ }
106
+ #else
107
+ PADDLE_THROW (" Paddle is not compiled with GPU" );
108
+ #endif
87
109
}
88
110
89
111
out->set_lod (out_lod);
@@ -113,7 +135,7 @@ class SeqExpandGradKernel : public framework::OpKernel<T> {
113
135
Eigen::TensorMap<Eigen::Tensor<T, 1 >> d_x_t (
114
136
d_x_data, static_cast <int >((ele_count * element_len) / repeat));
115
137
auto place = context.GetEigenDevice <Place>();
116
- d_x_t .device (place) = d_out_t .sum (Eigen::array<int , 1 >({0 }));
138
+ d_x_t .device (place) = d_out_t .sum (Eigen::array<int , 1 >({{ 0 } }));
117
139
d_out_data += (ele_count * element_len);
118
140
d_x_data += ((ele_count * element_len) / repeat);
119
141
}
0 commit comments