Commit 0970ef5

Merge pull request opencv#17754 from YashasSamaga:cuda4dnn-slice-concat-copy-reduction

2 parents f98c97f + cbdaa93
4 files changed: +55 -15 lines changed

modules/dnn/src/cuda/concat.cu

Lines changed: 16 additions & 0 deletions

@@ -16,6 +16,8 @@
 #include "../cuda4dnn/csl/tensor.hpp"
 #include "../cuda4dnn/csl/span.hpp"
 
+#include "../cuda4dnn/kernels/fill_copy.hpp"
+
 #include <cstddef>
 #include <vector>

@@ -95,6 +97,20 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         TensorSpan<T> output, std::size_t output_axis_offset,
         TensorView<T> input, std::size_t axis)
     {
+        CV_Assert(output.rank() == input.rank());
+        CV_Assert(output_axis_offset < output.get_axis_size(axis));
+
+        /* if axes preceding the concat axis are all singleton, the concat blocks are contiguous
+         * in the output and we can copy each block directly
+         */
+        if (output.size_range(0, axis) == 1)
+        {
+            auto stride = output.size_range(axis + 1, output.rank());
+            auto sliced_output = Span<T>(output.get() + output_axis_offset * stride, input.size());
+            kernels::copy<T>(stream, sliced_output, input);
+            return;
+        }
+
         /* let's call the axis of interest as the channel axis for the purpose of the following discussion
          * even though it can be any axis
         *
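
For intuition, here is a small standalone sketch of the offset arithmetic behind this fast path (plain C++ with made-up shapes; it does not use the CSL Span/TensorSpan types). When every axis before the concat axis is singleton, each input occupies one contiguous block of the output, starting at output_axis_offset times the product of the axis sizes after the concat axis.

#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

/* hypothetical example: concatenating along axis 1 of an output with shape [1, 8, 4, 4];
 * the leading batch axis is singleton, so each input lands in one contiguous block */
int main()
{
    std::vector<std::size_t> out_shape = {1, 8, 4, 4};
    std::size_t axis = 1;
    std::size_t output_axis_offset = 3; /* channels already written by earlier inputs */

    /* product of the axis sizes after the concat axis == "stride" in the kernel code */
    std::size_t stride = std::accumulate(out_shape.begin() + axis + 1, out_shape.end(),
                                         std::size_t(1), std::multiplies<std::size_t>());

    std::cout << "input block starts at element " << output_axis_offset * stride << "\n"; /* 3 * 16 = 48 */
}

Running this prints the element offset at which the block copy begins, mirroring output_axis_offset * stride in the kernel code above.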

modules/dnn/src/cuda/slice.cu

Lines changed: 32 additions & 0 deletions

@@ -15,11 +15,14 @@
 #include "../cuda4dnn/csl/tensor.hpp"
 #include "../cuda4dnn/csl/span.hpp"
 
+#include "../cuda4dnn/kernels/fill_copy.hpp"
+
 #include <opencv2/core.hpp>
 
 #include <cstddef>
 #include <vector>
 #include <iostream>
+#include <algorithm>
 
 using namespace cv::dnn::cuda4dnn::csl;
 using namespace cv::dnn::cuda4dnn::csl::device;

@@ -79,6 +82,14 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         CV_Assert(output.rank() == input.rank());
         CV_Assert(output.rank() == offsets.size());
 
+        /* copy directly if no slicing is required */
+        if (is_shape_same(output, input))
+        {
+            CV_Assert(std::all_of(std::begin(offsets), std::end(offsets), [] (std::size_t x) { return x == 0; }));
+            kernels::copy<T>(stream, output, input);
+            return;
+        }
+
         /* squeezable axes at the beginning of both tensors can be eliminated
          *
          * Reasoning:

@@ -146,6 +157,27 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
 
         auto rank = inShape.size();
 
+        /* We can do a copy if the reduced rank is two and only the first axis is sliced.
+         * The general requirement is that only one axis is sliced and all the axes that
+         * precede the sliced axis are singleton. However, the reductions above will remove
+         * all the leading singleton axes and merge the trailing unsliced axes into one, or
+         * zero if there are no trailing unsliced axes. The latter is handled separately.
+         */
+        if (rank == 2 && offsets[0] != 0 && offsets[1] == 0)
+        {
+            auto stride = inShape[1];
+            auto sliced_input = View<T>(input.get() + offsets[0] * stride, output.size());
+            kernels::copy<T>(stream, output, sliced_input);
+            return;
+        }
+
+        if (rank == 1)
+        {
+            auto sliced_input = View<T>(input.get() + offsets[0], output.size());
+            kernels::copy<T>(stream, output, sliced_input);
+            return;
+        }
+
         std::vector<std::size_t> inStride(rank), outStride(rank);
         inStride.back() = 1;
         outStride.back() = 1;
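
A small standalone sketch of the reduced-rank fast path (plain C++ with made-up shapes; it does not use the CSL View type): when only the first axis of a rank-2 tensor is sliced, the selected region is one contiguous run of the input, so a single copy suffices.

#include <cstddef>
#include <iostream>
#include <vector>

/* hypothetical example: after the axis reductions, the input has shape [10, 256] and
 * only the first axis is sliced (offset 3), producing an output of shape [4, 256] */
int main()
{
    std::vector<std::size_t> in_shape  = {10, 256};
    std::vector<std::size_t> out_shape = {4, 256};
    std::vector<std::size_t> offsets   = {3, 0};

    std::size_t stride = in_shape[1];                 /* elements per slice of the first axis */
    std::size_t begin  = offsets[0] * stride;         /* 3 * 256 = 768  */
    std::size_t count  = out_shape[0] * out_shape[1]; /* 4 * 256 = 1024 */

    std::cout << "copy " << count << " elements starting at element " << begin << "\n";
}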

modules/dnn/src/cuda4dnn/primitives/slice.hpp

Lines changed: 0 additions & 14 deletions

@@ -47,20 +47,6 @@ namespace cv { namespace dnn { namespace cuda4dnn {
 
         CV_Assert(offsets.size() == outputs.size());
 
-        /* one output with the same shape as the input => direct copy */
-        if (outputs.size() == 1)
-        {
-            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
-            auto output = output_wrapper->getSpan();
-
-            if (is_shape_same(output, input))
-            {
-                CV_Assert(std::all_of(std::begin(offsets[0]), std::end(offsets[0]), [] (std::size_t x) { return x == 0; }));
-                kernels::copy<T>(stream, output, input);
-                return;
-            }
-        }
-
         for (int i = 0; i < outputs.size(); ++i)
         {
             auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();

modules/dnn/src/dnn.cpp

Lines changed: 7 additions & 1 deletion

@@ -2788,7 +2788,13 @@ struct Net::Impl : public detail::NetImplBase
             if (preferableBackend == DNN_BACKEND_CUDA &&
                 (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
                 (inp_i_data->layerInstance->type != "Convolution" &&
-                inp_i_data->layerInstance->type != "Pooling")))
+                inp_i_data->layerInstance->type != "Pooling" &&
+                inp_i_data->layerInstance->type != "Resize" &&
+                inp_i_data->layerInstance->type != "Flatten" &&
+                inp_i_data->layerInstance->type != "Permute" &&
+                inp_i_data->layerInstance->type != "Reorg" &&
+                inp_i_data->layerInstance->type != "Eltwise" &&
+                inp_i_data->layerInstance.dynamicCast<ActivationLayer>().empty())))
             {
                 break;
             }
