@@ -160,6 +160,10 @@ class SliceLayerImpl : public SliceLayer
160
160
161
161
void finalize (InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
162
162
{
163
+ #ifdef HAVE_OPENCL
164
+ ocl_exec_cache.clear ();
165
+ #endif
166
+
163
167
std::vector<Mat> inputs, outputs;
164
168
inputs_arr.getMatVector (inputs);
165
169
outputs_arr.getMatVector (outputs);
@@ -214,26 +218,33 @@ class SliceLayerImpl : public SliceLayer
214
218
}
215
219
216
220
#ifdef HAVE_OPENCL
217
- bool forward_ocl (InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
221
+ struct OpenCLExecInfo
218
222
{
219
- std::vector<UMat> inputs;
220
- std::vector<UMat> outputs;
223
+ std::string kernel_name;
224
+ std::string build_opts;
225
+ size_t local_size[2 ];
226
+ size_t global_size[2 ];
221
227
222
- inputs_.getUMatVector (inputs);
223
- outputs_.getUMatVector (outputs);
228
+ OpenCLExecInfo ()
229
+ {
230
+ local_size[0 ] = local_size[1 ] = 0 ;
231
+ global_size[0 ] = global_size[1 ] = 0 ;
232
+ }
233
+ };
234
+ std::vector<OpenCLExecInfo> ocl_exec_cache;
235
+
236
+ void ocl_prepare (const std::vector<UMat>& inputs, const std::vector<UMat>& outputs)
237
+ {
238
+ CV_TRACE_FUNCTION ();
224
239
225
240
CV_Assert (outputs.size () == finalSliceRanges.size ());
241
+ ocl_exec_cache.resize (outputs.size ());
226
242
227
243
const UMat& input = inputs[0 ];
228
- if (input.dims > 5 )
229
- {
230
- CV_LOG_INFO (NULL , " DNN/OpenCL/Slice: implementation doesn't support dims=" << input.dims << " . Fallback to CPU" );
231
- return false ;
232
- }
244
+ const int dims = input.dims ;
233
245
234
246
size_t WSZ = 128 ;
235
247
236
- const int dims = input.dims ;
237
248
const int elemSize = (int )input.elemSize ();
238
249
String opts0 = cv::format (
239
250
" -DDIMS=%d -DELEMSIZE=%d" ,
@@ -243,10 +254,11 @@ class SliceLayerImpl : public SliceLayer
243
254
{
244
255
opts0 += cv::format (" -DSRC_STEP_%d=%d" , d, (int )input.step [dims - 1 - d]);
245
256
}
246
- String kname = cv::format (" slice_%d" , dims);
247
257
for (size_t i = 0 ; i < outputs.size (); i++)
248
258
{
249
- UMat& output = outputs[i];
259
+ OpenCLExecInfo& ocl = ocl_exec_cache[i];
260
+
261
+ const UMat& output = outputs[i];
250
262
const std::vector<Range>& range = finalSliceRanges[i];
251
263
252
264
String opts = opts0;
@@ -262,6 +274,8 @@ class SliceLayerImpl : public SliceLayer
262
274
CV_CheckEQ (range[d].size (), (int )output.size [d], " " );
263
275
}
264
276
277
+ const size_t param_LIMIT_BLOCK_SIZE_PER_WG = WSZ * 64 ;
278
+
265
279
int block_dims = 0 ;
266
280
size_t block_size = elemSize;
267
281
for (int i = dims - 1 ; i >= 0 ; --i)
@@ -270,12 +284,14 @@ class SliceLayerImpl : public SliceLayer
270
284
break ;
271
285
block_size *= output.size [i];
272
286
block_dims++;
287
+ if (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG)
288
+ break ;
273
289
}
274
290
275
291
const size_t total = output.total () * elemSize;
276
292
size_t num_blocks = total / block_size;
277
293
278
- if ((num_blocks <= 8 && block_size >= WSZ * 4 ) || (block_size >= WSZ * 64 ))
294
+ if ((num_blocks <= 8 && block_size >= WSZ * 4 ) || (block_size >= param_LIMIT_BLOCK_SIZE_PER_WG ))
279
295
{
280
296
// use 1D copy mode
281
297
opts += cv::format (" -DUSE_COPY_1D=1" );
@@ -345,23 +361,98 @@ class SliceLayerImpl : public SliceLayer
345
361
346
362
opts += cv::format (" -DWSZ=%d" , (int )WSZ);
347
363
348
- size_t local[] = { WSZ, 1 };
349
- size_t global[] = { WSZ, num_blocks };
364
+ std::ostringstream kernel_suffix;
365
+ kernel_suffix << dims << ' x' << elemSize << " _bsz" << block_size;
366
+ kernel_suffix << " __src_" ;
367
+ for (int d = 0 ; d < dims; d++)
368
+ {
369
+ kernel_suffix << input.size [dims - 1 - d] << ' _' ;
370
+ }
371
+ kernel_suffix << ' _' ;
372
+ /* for (int d = 0; d < dims; d++)
373
+ {
374
+ kernel_suffix << input.step[dims - 1 - d] << '_';
375
+ }
376
+ kernel_suffix << '_';*/
350
377
351
- ocl::Kernel kernel (kname.c_str (), ocl::dnn::slice_oclsrc, opts);
378
+ kernel_suffix << " dst_" ;
379
+ for (int d = 0 ; d < dims; d++)
380
+ {
381
+ kernel_suffix << output.size [dims - 1 - d] << ' _' ;
382
+ }
383
+ /* kernel_suffix << '_';
384
+ for (int d = 0; d < dims; d++)
385
+ {
386
+ kernel_suffix << output.step[dims - 1 - d] << '_';
387
+ }*/
388
+ kernel_suffix << " _slice_" ;
389
+ for (int d = 0 ; d < dims; d++)
390
+ {
391
+ kernel_suffix << range[dims - 1 - d].start << ' _' ;
392
+ }
393
+ for (int d = 0 ; d < dims; d++)
394
+ {
395
+ kernel_suffix << ' _' << range[dims - 1 - d].end ;
396
+ }
397
+
398
+ std::string kernel_suffix_str = kernel_suffix.str ();
399
+ opts += cv::format (" -DSLICE_KERNEL_SUFFIX=%s" , kernel_suffix_str.c_str ());
400
+
401
+ ocl.kernel_name = cv::format (" slice_%s" , kernel_suffix_str.c_str ());
402
+ ocl.build_opts = opts;
403
+ ocl.local_size [0 ] = WSZ;
404
+ ocl.local_size [1 ] = 1 ;
405
+ ocl.global_size [0 ] = WSZ;
406
+ ocl.global_size [1 ] = num_blocks;
407
+ } // for outputs.size()
408
+ } // ocl_prepare
409
+
410
+ bool forward_ocl (InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
411
+ {
412
+ CV_TRACE_FUNCTION ();
413
+
414
+ std::vector<UMat> inputs;
415
+ std::vector<UMat> outputs;
416
+
417
+ inputs_.getUMatVector (inputs);
418
+ outputs_.getUMatVector (outputs);
419
+
420
+ CV_Assert (outputs.size () == finalSliceRanges.size ());
421
+
422
+ const UMat& input = inputs[0 ];
423
+ const int dims = input.dims ;
424
+ if (dims > 5 )
425
+ {
426
+ CV_LOG_INFO (NULL , " DNN/OpenCL/Slice: implementation doesn't support dims=" << dims << " . Fallback to CPU" );
427
+ return false ;
428
+ }
429
+
430
+ if (ocl_exec_cache.empty ())
431
+ {
432
+ ocl_prepare (inputs, outputs);
433
+ }
434
+ CV_CheckEQ (ocl_exec_cache.size (), outputs.size (), " " );
435
+
436
+ for (size_t i = 0 ; i < outputs.size (); i++)
437
+ {
438
+ const OpenCLExecInfo& ocl = ocl_exec_cache[i];
439
+
440
+ UMat& output = outputs[i];
441
+
442
+ ocl::Kernel kernel (ocl.kernel_name .c_str (), ocl::dnn::slice_oclsrc, ocl.build_opts );
352
443
if (kernel.empty ())
353
444
return false ;
354
445
bool ret = kernel.args (
355
446
ocl::KernelArg::PtrReadOnly (input),
356
447
ocl::KernelArg::PtrWriteOnly (output)
357
448
)
358
- .run (2 , global, local , false );
449
+ .run (2 , ( size_t *)ocl. global_size , ( size_t *)ocl. local_size , false );
359
450
if (!ret)
360
451
return false ;
361
452
} // for outputs.size()
362
453
363
454
return true ;
364
- }
455
+ } // forward_ocl
365
456
#endif
366
457
367
458
void forward (InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
0 commit comments