Commit 3e3599f

Refine split tensorrt plugin
1 parent 33c6551 commit 3e3599f


3 files changed: +134 -35 lines changed


paddle/fluid/inference/tensorrt/convert/split_op.cc

Lines changed: 1 addition & 2 deletions
@@ -40,15 +40,14 @@ class SplitOpConverter : public OpConverter {
     int axis = boost::get<int>(op_desc.GetAttr("axis"));
     std::vector<int> output_lengths =
         boost::get<std::vector<int>>(op_desc.GetAttr("sections"));
-    PADDLE_ENFORCE(axis != 0);
+    // PADDLE_ENFORCE(axis != 0);
     if (axis < 0) {
       axis += input_dims.nbDims;
     } else {
       axis -= 1;
     }

     PADDLE_ENFORCE(output_lengths.size() == output_num);
-
     //
     plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths);
     nvinfer1::IPluginLayer* layer =
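
The axis arithmetic above maps a Paddle split axis, which counts the batch dimension, onto the plugin's batch-less input dims. A small standalone sketch of that mapping (remap_axis is a hypothetical helper added only for illustration, not part of the commit):

#include <cassert>

// Mirrors the converter above: Paddle's axis includes the batch dimension,
// while the TensorRT plugin only sees [C, H, W].
int remap_axis(int paddle_axis, int trt_nb_dims) {
  if (paddle_axis < 0) {
    return paddle_axis + trt_nb_dims;  // negative axes count from the end
  }
  return paddle_axis - 1;  // drop the implicit batch dimension
}

int main() {
  assert(remap_axis(1, 3) == 0);   // split over channels of NCHW -> dim 0 of CHW
  assert(remap_axis(-1, 3) == 2);  // last axis stays the last axis
  return 0;
}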

paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu

Lines changed: 126 additions & 31 deletions
@@ -12,13 +12,61 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <cuda_fp16.h>
+#include <algorithm>
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"

 namespace paddle {
 namespace inference {
 namespace tensorrt {
 namespace plugin {

+// copied from operators::math::SplitFunctor
+template <typename T>
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int* out_cols,
+                            int out_cols_size, T** outputs_data) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  int curr_segment = 0;
+  int curr_offset = out_cols[0];
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
+    int curr_col_offset = out_cols[curr_segment + 1];
+    while (curr_col_offset <= tid_x) {
+      curr_offset = curr_col_offset;
+      ++curr_segment;
+      curr_col_offset = out_cols[curr_segment + 1];
+    }
+
+    int local_col = tid_x - curr_offset;
+    int segment_width = curr_col_offset - curr_offset;
+    T* output_ptr = outputs_data[curr_segment];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * segment_width + local_col] =
+            input_data[tid_y * in_col + tid_x];
+    }
+  }
+}
+
+template <typename T>
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int fixed_out_col,
+                            T** outputs_data) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
+    int split = tid_x / fixed_out_col;
+    int in_offset = tid_x - split * fixed_out_col;
+    T* output_ptr = outputs_data[split];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * fixed_out_col + in_offset] =
+            input_data[tid_y * in_col + tid_x];
+    }
+  }
+}
+
 nvinfer1::Dims SplitPlugin::getOutputDimensions(
     int index, const nvinfer1::Dims* input_dims, int num_inputs) {
   PADDLE_ENFORCE_EQ(num_inputs, 1);
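
The two SplitKernel overloads added above perform the same column-wise split of a row-major [in_row, in_col] matrix; one reads per-segment prefix offsets from device memory, the other uses a single fixed output width. As a verification aid, the indexing they implement corresponds to the following CPU loop (split_reference is a hypothetical helper, not part of the commit):

#include <vector>

// CPU reference: out_cols holds prefix offsets {0, w0, w0 + w1, ...},
// outputs[seg] points to a buffer of in_row * (out_cols[seg+1] - out_cols[seg]) floats.
void split_reference(const float* input, int in_row, int in_col,
                     const std::vector<int>& out_cols,
                     const std::vector<float*>& outputs) {
  for (int seg = 0; seg + 1 < static_cast<int>(out_cols.size()); ++seg) {
    int begin = out_cols[seg];
    int width = out_cols[seg + 1] - begin;
    for (int r = 0; r < in_row; ++r) {
      for (int c = 0; c < width; ++c) {
        outputs[seg][r * width + c] = input[r * in_col + begin + c];
      }
    }
  }
}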
@@ -31,48 +79,95 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions(

 int SplitPlugin::initialize() {
   PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS);
-
+  // notice input dims is [C, H, W]
+  nvinfer1::Dims dims = this->getInputDims(0);
+  outer_rows_ = 1;
+  inner_cols_ = 1;
+  for (int i = 0; i < axis_; ++i) {
+    outer_rows_ *= dims.d[i];
+  }
+  for (int i = axis_ + 1; i < dims.nbDims; ++i) {
+    inner_cols_ *= dims.d[i];
+  }
+  same_shape_ = true;
   std::vector<int> segment_offsets(1, 0);
   for (int i = 0; i < this->getNbOutputs(); ++i) {
-    segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
+    if (output_length_[i] != output_length_[0]) {
+      same_shape_ = false;
+    }
+    segment_offsets.push_back(segment_offsets.back() +
+                              output_length_[i] * inner_cols_);
   }
-  segment_offsets_ = segment_offsets;
-  nvinfer1::Dims dims = this->getInputDims(0);
-  nx_ = 1;
-  for (int i = dims.nbDims - 1; i > axis_; --i) {
-    nx_ *= dims.d[i];
+  inner_cols_ *= dims.d[axis_];
+  d_segment_offsets_ = segment_offsets;
+  segment_offsets_ = std::move(segment_offsets);
+  d_output_ptrs_.resize(this->getNbOutputs(), nullptr);
+  return 0;
+}
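
To make the bookkeeping in initialize() concrete, here is a small self-checking sketch with illustrative numbers (a [C, H, W] = [6, 4, 4] input split along axis 0 into sections {2, 4}; none of these values come from the commit):

#include <cassert>
#include <vector>

int main() {
  int dims[3] = {6, 4, 4};               // plugin input dims [C, H, W]
  int axis = 0;                          // split axis within [C, H, W]
  std::vector<int> sections = {2, 4};    // output lengths along the axis

  int outer_rows = 1, inner_cols = 1;
  for (int i = 0; i < axis; ++i) outer_rows *= dims[i];        // dims before the axis
  for (int i = axis + 1; i < 3; ++i) inner_cols *= dims[i];    // dims after the axis

  std::vector<int> offsets(1, 0);
  for (int len : sections) offsets.push_back(offsets.back() + len * inner_cols);
  inner_cols *= dims[axis];  // now the full per-row element count, as in initialize()

  assert(outer_rows == 1 && inner_cols == 96);
  assert(offsets[1] == 32 && offsets[2] == 96);
  return 0;
}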
+
+template <typename T>
+inline void Split(cudaStream_t stream, const bool same_shape,
+                  const int outer_rows, const int inner_cols,
+                  const std::vector<int>& segment_offsets,
+                  const int* d_segment_offsets, const T* input, T** outputs) {
+  const int kThreadsPerBlock = 1024;
+  const int kMaxBlocks = 65535;
+  int block_cols = kThreadsPerBlock;
+  if (inner_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
+    block_cols = ((inner_cols + 31) >> 5) << 5;
   }
-  ny_ = dims.d[axis_];
-  nz_ = 1;
-  for (int i = axis_ - 1; i >= 0; --i) {
-    nz_ *= dims.d[i];
+  int block_rows = kThreadsPerBlock / block_cols;
+  dim3 block_size = dim3(block_cols, block_rows, 1);
+
+  int grid_cols =
+      std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks);
+  int grid_rows =
+      std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1));
+  dim3 grid_size = dim3(grid_cols, grid_rows, 1);
+
+  if (same_shape) {
+    SplitKernel<<<grid_size, block_size, 0, stream>>>(
+        input, outer_rows, inner_cols, segment_offsets[1], outputs);
+  } else {
+    SplitKernel<<<grid_size, block_size, 0, stream>>>(
+        input, outer_rows, inner_cols, d_segment_offsets,
+        static_cast<int>(segment_offsets.size()), outputs);
   }
-  return 0;
 }
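
A quick check of the launch-shape arithmetic in Split() with illustrative sizes (inner_cols = 96, outer_rows = 8; these values are assumptions, not taken from the commit): block_cols is rounded up to a multiple of 32 and the grid is clamped to kMaxBlocks.

#include <algorithm>
#include <cassert>

int main() {
  const int kThreadsPerBlock = 1024, kMaxBlocks = 65535;
  int inner_cols = 96, outer_rows = 8;  // hypothetical sizes
  int block_cols = inner_cols < kThreadsPerBlock
                       ? ((inner_cols + 31) >> 5) << 5  // round up to 32
                       : kThreadsPerBlock;
  int block_rows = kThreadsPerBlock / block_cols;
  int grid_cols =
      std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks);
  int grid_rows =
      std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1));
  assert(block_cols == 96 && block_rows == 10);  // one block covers all columns
  assert(grid_cols == 1 && grid_rows == 1);
  return 0;
}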

 int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
                          void** outputs, void* workspace, cudaStream_t stream) {
-  auto const& input_dims = this->getInputDims(0);
-  int input_size = 0;
-  float const* idata = reinterpret_cast<float const*>(inputs[0]);
-  float** odatas = reinterpret_cast<float**>(outputs);
-
-  // kernel impl here.
-  int inputBatchOffset = nx_ * ny_ * nz_;
-  for (size_t i = 0; i < this->getNbOutputs(); i++) {
-    for (size_t j = 0; j < batchSize; j++) {
-      cudaMemcpyAsync(
-          odatas[i] +
-              j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ *
-                  sizeof(float),
-          inputs[0] +
-              (inputBatchOffset * j + segment_offsets_[i] * nx_) *
-                  sizeof(float),
-          (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float),
-          cudaMemcpyDeviceToDevice, stream);
+  float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
+  if (axis_ == -1 && this->getNbOutputs() < 10) {
+    float** output_ptrs = reinterpret_cast<float**>(outputs);
+    int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT)
+                             ? sizeof(__half)
+                             : sizeof(float);
+    for (int i = 0; i < this->getNbOutputs(); ++i) {
+      PADDLE_ENFORCE(
+          cudaMemcpyAsync(
+              output_ptrs[i], input_ptr + segment_offsets_[i],
+              (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size,
+              cudaMemcpyDeviceToDevice, stream) == cudaSuccess);
+    }
+  } else {
+    outer_rows_ *= batchSize;
+    const int* d_segment_offsets_ptr =
+        thrust::raw_pointer_cast(&d_segment_offsets_[0]);
+    float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
+    PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs,
+                                   this->getNbOutputs() * sizeof(float*),
+                                   cudaMemcpyHostToDevice,
+                                   stream) == cudaSuccess);
+    if (this->getDataType() == nvinfer1::DataType::kFLOAT) {
+      Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
+            d_segment_offsets_ptr, input_ptr, output_ptrs);
+    } else {
+      Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
+            d_segment_offsets_ptr, (__half*)input_ptr,  // NOLINT
+            (__half**)output_ptrs);  // NOLINT
     }
   }
-
   return cudaGetLastError() != cudaSuccess;
 }
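
In the kernel path of enqueue() above, the per-output pointers live in a thrust::device_vector so that SplitKernel can index them on the GPU. A minimal standalone sketch of that pattern, assuming a CUDA-capable build (the variable names and sizes are illustrative only):

#include <thrust/device_vector.h>
#include <cuda_runtime.h>
#include <vector>

int main() {
  cudaStream_t stream = nullptr;  // default stream, for brevity
  std::vector<float*> host_output_ptrs(2, nullptr);        // filled by TensorRT at runtime
  thrust::device_vector<float*> d_output_ptrs(2, nullptr); // device-side pointer array
  // Copy the host array of output pointers onto the device, as enqueue() does.
  cudaMemcpyAsync(thrust::raw_pointer_cast(d_output_ptrs.data()),
                  host_output_ptrs.data(), 2 * sizeof(float*),
                  cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);
  return 0;
}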

paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h

Lines changed: 7 additions & 2 deletions
@@ -14,6 +14,7 @@

 #pragma once

+#include <thrust/device_vector.h>
 #include <vector>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

@@ -25,7 +26,7 @@ namespace plugin {
 class SplitPlugin : public PluginTensorRT {
  public:
   SplitPlugin(int axis, std::vector<int> const &output_lengths)
-      : axis_(axis), output_length_(output_lengths) {}
+      : axis_(axis), same_shape_(true), output_length_(output_lengths) {}

   SplitPlugin(void const *serial_data, size_t serial_length) {
     deserializeBase(serial_data, serial_length);
@@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT {
   }

   int axis_;
+  int outer_rows_;
+  int inner_cols_;
+  bool same_shape_;
   std::vector<int> output_length_;
-  int nx_, ny_, nz_;
   std::vector<int> segment_offsets_;
+  thrust::device_vector<int> d_segment_offsets_;
+  thrust::device_vector<float *> d_output_ptrs_;
 };

 }  // namespace plugin
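
For completeness, the plugin is constructed by the converter shown in split_op.cc from the remapped axis and the "sections" attribute; a minimal construction sketch under those assumptions (the values and the helper function are illustrative, not from the commit):

#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"

void build_split_plugin() {
  std::vector<int> sections = {2, 4};  // "sections" attribute of the split op
  // same_shape_ starts true from the constructor; initialize() later derives
  // outer_rows_, inner_cols_ and the segment offsets from the input dims.
  auto* plugin =
      new paddle::inference::tensorrt::plugin::SplitPlugin(/*axis=*/0, sections);
  (void)plugin;
}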
