Commit ad1ad73

add gpu support for concat
1 parent 9c128fe commit ad1ad73

2 files changed: +26 -17 lines changed

paddle/fluid/operators/math/concat.cc

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
       int col_idx = 0;
       for (int j = 0; j < num; ++j) {
         int col_len = output_cols[j];
-        auto* out_tensor = (*outputs)[j];
+        auto* out_tensor = outputs->at(j);
         if (out_tensor != nullptr) {
           T* dst_ptr = out_tensor->data<T>() + k * col_len;
           memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,

paddle/fluid/operators/math/concat.cu

Lines changed: 25 additions & 16 deletions
@@ -102,10 +102,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
     int local_col = tid_x - curr_offset;
     int segment_width = curr_col_offset - curr_offset;
     T* output_ptr = outputs_data[curr_segment];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-      output_ptr[tid_y * segment_width + local_col] =
-          input_data[tid_y * in_col + tid_x];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * segment_width + local_col] =
+            input_data[tid_y * in_col + tid_x];
+    }
   }
 }

@@ -118,10 +120,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
     int split = tid_x / fixed_out_col;
     int in_offset = tid_x - split * fixed_out_col;
     T* output_ptr = outputs_data[split];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-      output_ptr[tid_y * fixed_out_col + in_offset] =
-          input_data[tid_y * in_col + tid_x];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * fixed_out_col + in_offset] =
+            input_data[tid_y * in_col + tid_x];
+    }
   }
 }

@@ -203,17 +207,18 @@ template <typename T>
 class ConcatGradFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>* outputs) {
+                  const framework::Tensor& input,
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
     int o_num = outputs->size();
     int out_row = 1;
-    auto dim_0 = outputs->at(0).dims();
+    auto dim_0 = ref_inputs[0]->dims();
     for (int i = 0; i < axis; ++i) {
       out_row *= dim_0[i];
     }
 
-    int out_col = outputs->at(0).numel() / out_row;
+    int out0_col = ref_inputs[0]->numel() / out_row;
     int in_col = 0, in_row = out_row;
     bool sameShape = true;

@@ -223,13 +228,17 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
-      int t_col = outputs->at(i).numel() / out_row;
+      int t_col = outputs->at(i)->numel() / out_row;
       if (sameShape) {
-        if (t_col != out_col) sameShape = false;
+        if (t_col != out0_col) sameShape = false;
       }
       in_col += t_col;
       outputs_cols[i + 1] = in_col;
-      outputs_ptr[i] = outputs->at(i).data<T>();
+      if (outputs->at(i) != nullptr) {
+        outputs_ptr[i] = outputs->at(i)->data<T>();
+      } else {
+        outputs_ptr[i] = nullptr;
+      }
     }
 
     T** dev_out_gpu_data =
@@ -255,7 +264,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 
     if (sameShape) {
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data);
+          input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
     } else {
       const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(

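A minimal standalone sketch (not part of this commit; all names below are illustrative) of the same null-pointer guard the patched kernels use: a segment whose output pointer is nullptr is simply skipped, which is what lets the concat gradient leave unneeded outputs uncomputed.

// Illustrative only: a tiny CUDA kernel that scatters columns of `input`
// into two per-segment buffers, skipping any segment whose pointer is null,
// mirroring the guard added to KernelConcatGrad above.
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// input is row-major with rows x (2 * seg_cols) elements; a null entry in
// `outputs` means "this segment does not need to be written".
__global__ void SplitRows(const float* input, int rows, int seg_cols,
                          float** outputs) {
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int in_cols = 2 * seg_cols;
  if (col >= in_cols) return;
  int seg = col / seg_cols;
  int local_col = col - seg * seg_cols;
  float* out = outputs[seg];
  if (out != nullptr) {  // guard, as in the patched kernels
    for (int row = 0; row < rows; ++row)
      out[row * seg_cols + local_col] = input[row * in_cols + col];
  }
}

int main() {
  const int rows = 4, seg_cols = 3, in_cols = 2 * seg_cols;
  std::vector<float> h_in(rows * in_cols);
  for (int i = 0; i < rows * in_cols; ++i) h_in[i] = static_cast<float>(i);

  float *d_in, *d_out0;
  cudaMalloc(&d_in, h_in.size() * sizeof(float));
  cudaMalloc(&d_out0, rows * seg_cols * sizeof(float));
  cudaMemcpy(d_in, h_in.data(), h_in.size() * sizeof(float),
             cudaMemcpyHostToDevice);

  // The second segment is not needed, so its pointer stays null.
  float* h_outs[2] = {d_out0, nullptr};
  float** d_outs;
  cudaMalloc(&d_outs, sizeof(h_outs));
  cudaMemcpy(d_outs, h_outs, sizeof(h_outs), cudaMemcpyHostToDevice);

  SplitRows<<<1, 32>>>(d_in, rows, seg_cols, d_outs);
  cudaDeviceSynchronize();

  std::vector<float> h_out0(rows * seg_cols);
  cudaMemcpy(h_out0.data(), d_out0, h_out0.size() * sizeof(float),
             cudaMemcpyDeviceToHost);
  for (float v : h_out0) printf("%.0f ", v);  // first segment only
  printf("\n");

  cudaFree(d_in);
  cudaFree(d_out0);
  cudaFree(d_outs);
  return 0;
}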