Skip to content

Commit 1c8ee3f

Browse files
authored
Merge pull request opencv#17885 from alalek:dnn_ocl_slice_update
DNN: OpenCL/slice update
* dnn(ocl/slice): make slice kernel VTune friendly
  - more unique names
  - inline code of copy functions
* dnn(ocl/slice): prefer to spawn more work groups
  - even in case with 1D copy
  - perf improvement up to 2x of kernel time (due to changed configuration 128x1x1 => 128x32x1)
* dnn(ocl/slice): cache kernel exec info
1 parent 9221080 commit 1c8ee3f

File tree

2 files changed

+196
-124
lines changed

2 files changed

+196
-124
lines changed

modules/dnn/src/layers/slice_layer.cpp

Lines changed: 110 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -160,6 +160,10 @@ class SliceLayerImpl : public SliceLayer
160160

161161
void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
162162
{
163+
#ifdef HAVE_OPENCL
164+
ocl_exec_cache.clear();
165+
#endif
166+
163167
std::vector<Mat> inputs, outputs;
164168
inputs_arr.getMatVector(inputs);
165169
outputs_arr.getMatVector(outputs);
@@ -214,26 +218,33 @@ class SliceLayerImpl : public SliceLayer
214218
}
215219

216220
#ifdef HAVE_OPENCL
217-
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
221+
struct OpenCLExecInfo // Cached launch parameters for one output's slice kernel; written by ocl_prepare(), read by forward_ocl().
218222
{
219-
std::vector<UMat> inputs;
220-
std::vector<UMat> outputs;
223+
std::string kernel_name; // Kernel name handed to the ocl::Kernel constructor.
224+
std::string build_opts; // Build-option string handed to the ocl::Kernel constructor.
225+
size_t local_size[2]; // Local size array passed to Kernel::run() (2 dimensions).
226+
size_t global_size[2]; // Global size array passed to Kernel::run() (2 dimensions).
221227

222-
inputs_.getUMatVector(inputs);
223-
outputs_.getUMatVector(outputs);
228+
OpenCLExecInfo() // Zero-initializes both size arrays; the strings start empty.
229+
{
230+
local_size[0] = local_size[1] = 0;
231+
global_size[0] = global_size[1] = 0;
232+
}
233+
};
234+
std::vector<OpenCLExecInfo> ocl_exec_cache;
235+
236+
void ocl_prepare(const std::vector<UMat>& inputs, const std::vector<UMat>& outputs)
237+
{
238+
CV_TRACE_FUNCTION();
224239

225240
CV_Assert(outputs.size() == finalSliceRanges.size());
241+
ocl_exec_cache.resize(outputs.size());
226242

227243
const UMat& input = inputs[0];
228-
if (input.dims > 5)
229-
{
230-
CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << input.dims << ". Fallback to CPU");
231-
return false;
232-
}
244+
const int dims = input.dims;
233245

234246
size_t WSZ = 128;
235247

236-
const int dims = input.dims;
237248
const int elemSize = (int)input.elemSize();
238249
String opts0 = cv::format(
239250
"-DDIMS=%d -DELEMSIZE=%d",
@@ -243,10 +254,11 @@ class SliceLayerImpl : public SliceLayer
243254
{
244255
opts0 += cv::format(" -DSRC_STEP_%d=%d", d, (int)input.step[dims - 1 - d]);
245256
}
246-
String kname = cv::format("slice_%d", dims);
247257
for (size_t i = 0; i < outputs.size(); i++)
248258
{
249-
UMat& output = outputs[i];
259+
OpenCLExecInfo& ocl = ocl_exec_cache[i];
260+
261+
const UMat& output = outputs[i];
250262
const std::vector<Range>& range = finalSliceRanges[i];
251263

252264
String opts = opts0;
@@ -262,6 +274,8 @@ class SliceLayerImpl : public SliceLayer
262274
CV_CheckEQ(range[d].size(), (int)output.size[d], "");
263275
}
264276

277+
const size_t param_LIMIT_BLOCK_SIZE_PER_WG = WSZ * 64;
278+
265279
int block_dims = 0;
266280
size_t block_size = elemSize;
267281
for (int i = dims - 1; i >= 0; --i)
@@ -270,12 +284,14 @@ class SliceLayerImpl : public SliceLayer
270284
break;
271285
block_size *= output.size[i];
272286
block_dims++;
287+
if (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG)
288+
break;
273289
}
274290

275291
const size_t total = output.total() * elemSize;
276292
size_t num_blocks = total / block_size;
277293

278-
if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= WSZ * 64))
294+
if ((num_blocks <= 8 && block_size >= WSZ * 4) || (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG))
279295
{
280296
// use 1D copy mode
281297
opts += cv::format(" -DUSE_COPY_1D=1");
@@ -345,23 +361,98 @@ class SliceLayerImpl : public SliceLayer
345361

346362
opts += cv::format(" -DWSZ=%d", (int)WSZ);
347363

348-
size_t local[] = { WSZ, 1 };
349-
size_t global[] = { WSZ, num_blocks };
364+
std::ostringstream kernel_suffix;
365+
kernel_suffix << dims << 'x' << elemSize << "_bsz" << block_size;
366+
kernel_suffix << "__src_";
367+
for (int d = 0; d < dims; d++)
368+
{
369+
kernel_suffix << input.size[dims - 1 - d] << '_';
370+
}
371+
kernel_suffix << '_';
372+
/*for (int d = 0; d < dims; d++)
373+
{
374+
kernel_suffix << input.step[dims - 1 - d] << '_';
375+
}
376+
kernel_suffix << '_';*/
350377

351-
ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, opts);
378+
kernel_suffix << "dst_";
379+
for (int d = 0; d < dims; d++)
380+
{
381+
kernel_suffix << output.size[dims - 1 - d] << '_';
382+
}
383+
/*kernel_suffix << '_';
384+
for (int d = 0; d < dims; d++)
385+
{
386+
kernel_suffix << output.step[dims - 1 - d] << '_';
387+
}*/
388+
kernel_suffix << "_slice_";
389+
for (int d = 0; d < dims; d++)
390+
{
391+
kernel_suffix << range[dims - 1 - d].start << '_';
392+
}
393+
for (int d = 0; d < dims; d++)
394+
{
395+
kernel_suffix << '_' << range[dims - 1 - d].end;
396+
}
397+
398+
std::string kernel_suffix_str = kernel_suffix.str();
399+
opts += cv::format(" -DSLICE_KERNEL_SUFFIX=%s", kernel_suffix_str.c_str());
400+
401+
ocl.kernel_name = cv::format("slice_%s", kernel_suffix_str.c_str());
402+
ocl.build_opts = opts;
403+
ocl.local_size[0] = WSZ;
404+
ocl.local_size[1] = 1;
405+
ocl.global_size[0] = WSZ;
406+
ocl.global_size[1] = num_blocks;
407+
} // for outputs.size()
408+
} // ocl_prepare
409+
410+
bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) // Runs one cached slice kernel per output; returns false when the OpenCL path cannot be used. internals_ is not referenced in this body.
411+
{
412+
CV_TRACE_FUNCTION();
413+
414+
std::vector<UMat> inputs;
415+
std::vector<UMat> outputs;
416+
417+
inputs_.getUMatVector(inputs);
418+
outputs_.getUMatVector(outputs);
419+
420+
CV_Assert(outputs.size() == finalSliceRanges.size()); // Exactly one slice range per output is required.
421+
422+
const UMat& input = inputs[0]; // Only the first input is read here.
423+
const int dims = input.dims;
424+
if (dims > 5) // More than 5 dimensions is rejected; the log message states the caller falls back to CPU.
425+
{
426+
CV_LOG_INFO(NULL, "DNN/OpenCL/Slice: implementation doesn't support dims=" << dims << ". Fallback to CPU");
427+
return false;
428+
}
429+
430+
if (ocl_exec_cache.empty()) // Build the per-output kernel info only when the cache is empty (finalize() clears it).
431+
{
432+
ocl_prepare(inputs, outputs);
433+
}
434+
CV_CheckEQ(ocl_exec_cache.size(), outputs.size(), ""); // The cache must hold one entry per output.
435+
436+
for (size_t i = 0; i < outputs.size(); i++)
437+
{
438+
const OpenCLExecInfo& ocl = ocl_exec_cache[i]; // Cached kernel name, build options and sizes for output i.
439+
440+
UMat& output = outputs[i];
441+
442+
ocl::Kernel kernel(ocl.kernel_name.c_str(), ocl::dnn::slice_oclsrc, ocl.build_opts); // Kernel object is constructed on every call from the cached name/options.
352443
if (kernel.empty()) // Empty kernel object: report failure to the caller.
353444
return false;
354445
bool ret = kernel.args(
355446
ocl::KernelArg::PtrReadOnly(input),
356447
ocl::KernelArg::PtrWriteOnly(output)
357448
)
358-
.run(2, global, local, false);
449+
.run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false); // 2-dimensional launch with the cached sizes; the C-style casts drop const because `ocl` is a const reference.
359450
if (!ret) // A failed launch aborts the remaining outputs.
360451
return false;
361452
} // for outputs.size()
362453

363454
return true;
364-
}
455+
} // forward_ocl
365456
#endif
366457

367458
void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE

0 commit comments

Comments
 (0)