Skip to content

Commit 5dbe9e5

Browse files
[cherry-pick] Improve topk performance. (#21087) (#21441)
* Improve topk performance. give 200000 data to compute topk, before opt: cost 1s after opt: cost 0.0028s. * Refine return value. * Add cuda util funtions. * Fix ComputeBlockSize bug & refine comments. Signed-off-by: zhaoyuchen <[email protected]>
1 parent 2f0f10b commit 5dbe9e5

File tree

6 files changed

+223
-6
lines changed

6 files changed

+223
-6
lines changed

paddle/fluid/operators/top_k_op.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ class TopkOp : public framework::OperatorWithKernel {
4242

4343
framework::DDim dims = input_dims;
4444
dims[dims.size() - 1] = k;
45+
// If has K as tensor, set k=-1 as not know real size at this time.
46+
if (ctx->HasInput("K")) {
47+
dims[dims.size() - 1] = -1;
48+
}
49+
4550
ctx->SetOutputDim("Out", dims);
4651
ctx->SetOutputDim("Indices", dims);
4752
ctx->ShareLoD("X", "Out");

paddle/fluid/operators/top_k_op.cu

Lines changed: 179 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

15+
#include "cub/cub.cuh"
1516
#include "paddle/fluid/framework/op_registry.h"
1617
#include "paddle/fluid/operators/top_k_op.h"
1718
#include "paddle/fluid/platform/cuda_device_function.h"
1819
#include "paddle/fluid/platform/float16.h"
1920

21+
// set cub base traits in order to handle float16
22+
namespace cub {
23+
template <>
24+
struct NumericTraits<paddle::platform::float16>
25+
: BaseTraits<FLOATING_POINT, true, false, uint16_t,
26+
paddle::platform::float16> {};
27+
} // namespace cub
28+
2029
namespace paddle {
2130
namespace operators {
2231

@@ -303,6 +312,160 @@ inline static int GetDesiredBlockDim(int dim) {
303312
}
304313
}
305314

315+
// Iter for move to next row
316+
struct SegmentOffsetIter {
317+
EIGEN_DEVICE_FUNC
318+
explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
319+
320+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
321+
return idx * num_cols_;
322+
}
323+
324+
int num_cols_;
325+
};
326+
327+
// Iter using into a column
328+
struct ColumnIndexIter {
329+
explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {}
330+
331+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(
332+
const Eigen::array<int, 1>& ix) const {
333+
return ix[0] % num_cols_;
334+
}
335+
336+
int num_cols_;
337+
};
338+
339+
__global__ void InitIndex(int64_t* indices, int64_t num_rows,
340+
int64_t num_cols) {
341+
int col_id = threadIdx.x;
342+
int row_id = blockIdx.x;
343+
344+
for (int64_t j = row_id; j < num_rows; j += gridDim.x) {
345+
for (int64_t i = col_id; i < num_cols; i += blockDim.x) {
346+
indices[j * num_cols + i] = i;
347+
}
348+
}
349+
}
350+
351+
template <typename T>
352+
bool SortTopk(const platform::CUDADeviceContext& ctx,
353+
const framework::Tensor* input_tensor, const int64_t num_cols,
354+
const int64_t num_rows, const int k,
355+
framework::Tensor* out_tensor,
356+
framework::Tensor* indices_tensor) {
357+
auto cu_stream = ctx.stream();
358+
359+
Tensor input_indices;
360+
const std::vector<int64_t> dims = {num_rows, num_cols};
361+
auto dim = framework::make_ddim(dims);
362+
input_indices.Resize(dim);
363+
// input_indices.Resize(num_rows*num_cols);
364+
input_indices.mutable_data<int64_t>(ctx.GetPlace());
365+
size_t temp_storage_bytes = -1;
366+
367+
auto ComputeBlockSize = [](int col) {
368+
if (col > 512)
369+
return 1024;
370+
else if (col > 256 && col <= 512)
371+
return 512;
372+
else if (col > 128 && col <= 256)
373+
return 256;
374+
else if (col > 64 && col <= 128)
375+
return 128;
376+
else
377+
return 64;
378+
};
379+
380+
int block_size = ComputeBlockSize(num_cols);
381+
382+
unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x;
383+
// actually, int num_rows < max_grid_size
384+
unsigned int grid_size = num_rows < maxGridDimX
385+
? static_cast<unsigned int>(num_rows)
386+
: maxGridDimX;
387+
// Init a index array
388+
InitIndex<<<grid_size, block_size, 0, cu_stream>>>(
389+
input_indices.data<int64_t>(), num_rows, num_cols);
390+
391+
// create iter for counting input
392+
cub::CountingInputIterator<int64_t> counting_iter(0);
393+
// segment_offset is used for move to next row
394+
cub::TransformInputIterator<int64_t, SegmentOffsetIter,
395+
cub::CountingInputIterator<int64_t>>
396+
segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols));
397+
398+
T* sorted_values_ptr;
399+
int64_t* sorted_indices_ptr;
400+
401+
Tensor temp_values;
402+
Tensor temp_indices;
403+
404+
const T* input = input_tensor->data<T>();
405+
T* values = out_tensor->data<T>();
406+
int64_t* indices = indices_tensor->mutable_data<int64_t>(ctx.GetPlace());
407+
408+
if (k == num_cols) {
409+
// Doing a full sort.
410+
sorted_values_ptr = values;
411+
sorted_indices_ptr = indices;
412+
} else {
413+
temp_values.Resize(dim);
414+
temp_indices.Resize(dim);
415+
sorted_values_ptr = temp_values.mutable_data<T>(ctx.GetPlace());
416+
sorted_indices_ptr = temp_indices.mutable_data<int64_t>(ctx.GetPlace());
417+
}
418+
419+
// Get temp storage buffer size, maybe can allocate a fixed buffer to save
420+
// time.
421+
auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
422+
nullptr, temp_storage_bytes, input, sorted_values_ptr,
423+
input_indices.data<int64_t>(), sorted_indices_ptr, num_cols * num_rows,
424+
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
425+
cu_stream);
426+
if (err != cudaSuccess) {
427+
LOG(ERROR)
428+
<< "TopKOP failed as could not launch "
429+
"cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate "
430+
"temp_storage_bytes, status: "
431+
<< cudaGetErrorString(err);
432+
return false;
433+
}
434+
Tensor temp_storage;
435+
temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
436+
437+
err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
438+
temp_storage.data<uint8_t>(), temp_storage_bytes, input,
439+
sorted_values_ptr, input_indices.data<int64_t>(), sorted_indices_ptr,
440+
num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1,
441+
0, sizeof(T) * 8, cu_stream);
442+
if (err != cudaSuccess) {
443+
LOG(ERROR)
444+
<< "TopKOP failed as could not launch "
445+
"cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
446+
"temp_storage_bytes: "
447+
<< temp_storage_bytes << ", status: " << cudaGetErrorString(err);
448+
return false;
449+
}
450+
auto& dev = *ctx.eigen_device();
451+
if (k < num_cols) {
452+
// copy sliced data to output.
453+
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
454+
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
455+
auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
456+
auto e_tmp_indices = EigenMatrix<int64_t>::From(temp_indices);
457+
458+
std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
459+
auto dim = framework::make_ddim(odims);
460+
auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
461+
auto e_tmp_values = EigenMatrix<T>::From(temp_values);
462+
463+
e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes);
464+
e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes);
465+
}
466+
return true;
467+
}
468+
306469
#define FIXED_BLOCK_DIM_BASE(dim, ...) \
307470
case (dim): { \
308471
constexpr auto kBlockDim = (dim); \
@@ -324,7 +487,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
324487
auto* input = ctx.Input<Tensor>("X");
325488
auto* output = ctx.Output<Tensor>("Out");
326489
auto* indices = ctx.Output<Tensor>("Indices");
327-
size_t k = static_cast<int>(ctx.Attr<int>("k"));
490+
int k = static_cast<int>(ctx.Attr<int>("k"));
328491

329492
auto* k_t = ctx.Input<Tensor>("K");
330493
if (k_t) {
@@ -340,21 +503,31 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
340503
const T* input_data = input->data<T>();
341504
T* output_data = output->mutable_data<T>(ctx.GetPlace());
342505
// FIXME(typhoonzero): data is always converted to type T?
343-
int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
344506

345507
framework::DDim inputdims = input->dims();
346-
const size_t input_height = framework::product(
508+
const int64_t input_height = framework::product(
347509
framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
348-
const size_t input_width = inputdims[inputdims.size() - 1];
349-
510+
const int64_t input_width = inputdims[inputdims.size() - 1];
511+
const auto& dev_ctx = ctx.cuda_device_context();
512+
513+
if ((input_width <= 1024 || k >= 128 || k == input_width)) {
514+
if (SortTopk<T>(dev_ctx, input, input_width, input_height, k, output,
515+
indices)) {
516+
// Successed, return.
517+
return;
518+
} else {
519+
LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use "
520+
"default topk kernel.";
521+
}
522+
}
523+
int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
350524
if (k > input_width) k = input_width;
351525

352526
// NOTE: pass lds and dim same to input width.
353527
// NOTE: old matrix implementation of stride is different to eigen.
354528
// TODO(typhoonzero): refine this kernel.
355529
const int kMaxHeight = 2048;
356530
int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
357-
auto& dev_ctx = ctx.cuda_device_context();
358531
switch (GetDesiredBlockDim(input_width)) {
359532
FIXED_BLOCK_DIM(
360533
KeMatrixTopK<T, 5,

paddle/fluid/platform/device_context.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
216216
compute_capability_ = GetCUDAComputeCapability(place_.device);
217217
multi_process_ = GetCUDAMultiProcessors(place_.device);
218218
max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
219+
max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device);
219220
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
220221
eigen_stream_.reset(new EigenCudaStreamDevice());
221222
eigen_stream_->Reinitialize(&stream_, place);
@@ -351,6 +352,10 @@ bool CUDADeviceContext::tensor_core_available() const {
351352
return cublas_tensor_core_handle_ != nullptr;
352353
}
353354

355+
dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
356+
return max_grid_dim_size_;
357+
}
358+
354359
cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
355360

356361
CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {

paddle/fluid/platform/device_context.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@ class CUDADeviceContext : public DeviceContext {
9696
/*! \brief Return the max physical thread count in the device context */
9797
int GetMaxPhysicalThreadCount() const;
9898

99+
/*! \brief Return the max grid dim size in the device context */
100+
dim3 GetCUDAMaxGridDimSize() const;
101+
99102
/*! \brief Return eigen device in the device context. */
100103
Eigen::GpuDevice* eigen_device() const;
101104

@@ -184,6 +187,7 @@ class CUDADeviceContext : public DeviceContext {
184187
int driver_version_;
185188
int multi_process_;
186189
int max_threads_per_mp_;
190+
dim3 max_grid_dim_size_;
187191

188192
// StreamCallbackManager is thread-safe
189193
std::unique_ptr<StreamCallbackManager> callback_manager_;

paddle/fluid/platform/gpu_info.cc

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,33 @@ int GetCUDAComputeCapability(int id) {
8585
return device_prop.major * 10 + device_prop.minor;
8686
}
8787

88+
dim3 GetGpuMaxGridDimSize(int id) {
89+
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
90+
dim3 ret;
91+
int size;
92+
auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
93+
PADDLE_ENFORCE_EQ(error_code_x, 0,
94+
"cudaDevAttrMaxGridDimX failed in "
95+
"paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
96+
error_code_x, CudaErrorWebsite());
97+
ret.x = size;
98+
99+
auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
100+
PADDLE_ENFORCE_EQ(error_code_y, 0,
101+
"cudaDevAttrMaxGridDimY failed in "
102+
"paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
103+
error_code_y, CudaErrorWebsite());
104+
ret.y = size;
105+
106+
auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
107+
PADDLE_ENFORCE_EQ(error_code_z, 0,
108+
"cudaDevAttrMaxGridDimZ failed in "
109+
"paddle::platform::GpuMaxGridDimSize, error code : %d, %s",
110+
error_code_z, CudaErrorWebsite());
111+
ret.z = size;
112+
return ret;
113+
}
114+
88115
int GetCUDARuntimeVersion(int id) {
89116
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
90117
int runtime_version = 0;

paddle/fluid/platform/gpu_info.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ int GetCUDAMaxThreadsPerMultiProcessor(int i);
4848
//! Get the current GPU device id in system.
4949
int GetCurrentDeviceId();
5050

51+
//! Get the maximum GridDim size for GPU buddy allocator.
52+
dim3 GetGpuMaxGridDimSize(int);
53+
5154
//! Get a list of device ids from environment variable or use all.
5255
std::vector<int> GetSelectedDevices();
5356

0 commit comments

Comments
 (0)