|
15 | 15 | #include "../cuda4dnn/csl/tensor.hpp"
|
16 | 16 | #include "../cuda4dnn/csl/span.hpp"
|
17 | 17 |
|
| 18 | +#include "../cuda4dnn/kernels/fill_copy.hpp" |
| 19 | + |
18 | 20 | #include <opencv2/core.hpp>
|
19 | 21 |
|
20 | 22 | #include <cstddef>
|
21 | 23 | #include <vector>
|
22 | 24 | #include <iostream>
|
| 25 | +#include <algorithm> |
23 | 26 |
|
24 | 27 | using namespace cv::dnn::cuda4dnn::csl;
|
25 | 28 | using namespace cv::dnn::cuda4dnn::csl::device;
|
@@ -79,6 +82,14 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
79 | 82 | CV_Assert(output.rank() == input.rank());
|
80 | 83 | CV_Assert(output.rank() == offsets.size());
|
81 | 84 |
|
| 85 | + /* copy directly if no slicing is required */ |
| 86 | + if (is_shape_same(output, input)) |
| 87 | + { |
| 88 | + CV_Assert(std::all_of(std::begin(offsets), std::end(offsets), [] (std::size_t x) { return x == 0; })); |
| 89 | + kernels::copy<T>(stream, output, input); |
| 90 | + return; |
| 91 | + } |
| 92 | + |
82 | 93 | /* squeezable axes at the beginning of both tensors can be eliminated
|
83 | 94 | *
|
84 | 95 | * Reasoning:
|
@@ -146,6 +157,27 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
|
146 | 157 |
|
147 | 158 | auto rank = inShape.size();
|
148 | 159 |
|
| 160 | + /* We can do a copy if the reduced rank is two and only the first axis is sliced. |
| 161 | + * The general requirement is that only one axis is sliced and all the axes that |
| 162 | + * precede the sliced axis are singleton. However, the reductions above will remove |
| 163 | + * all the leading singleton axes and merge the trailing unsliced axes into one, or |
| 164 | + * zero if there are no trailing unsliced axes. The latter is handled separately. |
| 165 | + */ |
| 166 | + if (rank == 2 && offsets[0] != 0 && offsets[1] == 0) |
| 167 | + { |
| 168 | + auto stride = inShape[1]; |
| 169 | + auto sliced_input = View<T>(input.get() + offsets[0] * stride, output.size()); |
| 170 | + kernels::copy<T>(stream, output, sliced_input); |
| 171 | + return; |
| 172 | + } |
| 173 | + |
| 174 | + if (rank == 1) |
| 175 | + { |
| 176 | + auto sliced_input = View<T>(input.get() + offsets[0], output.size()); |
| 177 | + kernels::copy<T>(stream, output, sliced_input); |
| 178 | + return; |
| 179 | + } |
| 180 | + |
149 | 181 | std::vector<std::size_t> inStride(rank), outStride(rank);
|
150 | 182 | inStride.back() = 1;
|
151 | 183 | outStride.back() = 1;
|
|
0 commit comments