
Commit d0e6d24

Merge pull request opencv#17363 from YashasSamaga:cuda4dnn-eltwise-fusion2

cuda4dnn(conv): fuse eltwise with convolutions

* fuse eltwise with convolutions
* manually rebase to avoid bad git merge
1 parent 44d473f commit d0e6d24

22 files changed (+1609, -273 lines)
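Note on the fusion pattern: in the kernels added below, the activation is applied to the convolution output first and the elementwise operand is folded in afterwards, so each element becomes eltwise_op(activation_op(x), e) in a single in-place pass, and the intermediate activation result never takes an extra round trip through global memory. A host-side reference sketch of this computation follows the first file.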
New file: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "functors.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

namespace raw {
    template <class T, class ActivationOp, class EltwiseOp, std::size_t N>
    __global__ void generic_op_eltwise_op_inplace_vec(Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params act_params, const typename EltwiseOp::Params eltwise_params) {
        using vector_type = get_vector_type_t<T, N>;

        auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());
        auto eltwise_vPtr = vector_type::get_pointer(eltwise.data());

        ActivationOp activation_op(act_params);
        EltwiseOp eltwise_op(eltwise_params);

        for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
            vector_type output_vec, eltwise_vec;
            v_load(output_vec, inplace_output_vPtr[i]);
            v_load(eltwise_vec, eltwise_vPtr[i]);
            for (int j = 0; j < output_vec.size(); j++)
                output_vec.data[j] = eltwise_op(activation_op(output_vec.data[j]), eltwise_vec.data[j]);
            v_store(inplace_output_vPtr[i], output_vec);
        }
    }
}

template <class T, class ActivationOp, class EltwiseOp, std::size_t N> static
void launch_vectorized_generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params& act_params, const typename EltwiseOp::Params& eltwise_params) {
    CV_Assert(is_fully_aligned<T>(inplace_output, N));
    CV_Assert(is_fully_aligned<T>(eltwise, N));

    auto kernel = raw::generic_op_eltwise_op_inplace_vec<T, ActivationOp, EltwiseOp, N>;
    auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
    launch_kernel(kernel, policy, inplace_output, eltwise, act_params, eltwise_params);
}

template <class T, class ActivationOp, class EltwiseOp> static
void generic_op_eltwise_op_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, const typename ActivationOp::Params& act_params = {}, const typename EltwiseOp::Params& eltwise_params = {}) {
    CV_Assert(inplace_output.size() == eltwise.size());

    if (is_fully_aligned<T>(inplace_output, 4) && is_fully_aligned<T>(eltwise, 4)) {
        launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 4>(stream, inplace_output, eltwise, act_params, eltwise_params);
    } else if (is_fully_aligned<T>(inplace_output, 2) && is_fully_aligned<T>(eltwise, 2)) {
        launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 2>(stream, inplace_output, eltwise, act_params, eltwise_params);
    } else {
        launch_vectorized_generic_op_eltwise_op_inplace<T, ActivationOp, EltwiseOp, 1>(stream, inplace_output, eltwise, act_params, eltwise_params);
    }
}

template <class T>
void relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T slope) {
    generic_op_eltwise_op_inplace<T, ReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {slope});
}

template <class T>
void clipped_relu_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T floor, T ceiling) {
    CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
    generic_op_eltwise_op_inplace<T, ClippedReLUFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {floor, ceiling});
}

template <class T>
void tanh_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
    generic_op_eltwise_op_inplace<T, TanHFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}

template <class T>
void swish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
    generic_op_eltwise_op_inplace<T, SwishFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}

template <class T>
void mish_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
    generic_op_eltwise_op_inplace<T, MishFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}

template <class T>
void sigmoid_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise) {
    generic_op_eltwise_op_inplace<T, SigmoidFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise);
}

template <class T>
void power_eltwise_sum_2_inplace(const Stream& stream, Span<T> inplace_output, View<T> eltwise, T exp, T scale, T shift) {
    generic_op_eltwise_op_inplace<T, PowerFunctor<T>, SumFunctor<T>>(stream, inplace_output, eltwise, {exp, scale, shift});
}

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half);
template void clipped_relu_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
template void tanh_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void swish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void mish_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void sigmoid_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>);
template void power_eltwise_sum_2_inplace<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
#endif

template void relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float);
template void clipped_relu_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float, float);
template void tanh_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void swish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void mish_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void sigmoid_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>);
template void power_eltwise_sum_2_inplace<float>(const Stream&, Span<float>, View<float>, float, float, float);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
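For orientation, here is a minimal host-side reference of what the relu variant above computes, assuming plain pointers in place of the csl Span/View wrappers and assuming ReLUFunctor implements the usual slope-parameterised (leaky) ReLU; the _ref name and free-standing setup are illustrative, not part of the commit:

#include <cstddef>

// Reference semantics of relu_eltwise_sum_2_inplace: activation first,
// then SumFunctor's two-operand addition, applied in place on the first operand.
template <class T>
void relu_eltwise_sum_2_inplace_ref(T* inplace_output, const T* eltwise, std::size_t n, T slope) {
    for (std::size_t i = 0; i < n; i++) {
        T x = inplace_output[i];
        T activated = x >= T(0) ? x : slope * x;    // ReLU with a configurable negative slope
        inplace_output[i] = activated + eltwise[i]; // eltwise sum with the second operand
    }
}

The CUDA version layers vectorisation on top of this: generic_op_eltwise_op_inplace picks the widest width (4, 2, or 1 elements) for which both buffers are fully aligned, and the kernel then processes one vector per grid-stride iteration.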

modules/dnn/src/cuda/activations.cu

Lines changed: 42 additions & 42 deletions
@@ -26,20 +26,20 @@ using namespace cv::dnn::cuda4dnn::csl::device;
 namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
 
 namespace raw {
-    template <class T, class Functor, std::size_t N, class ...FunctorArgs>
-    __global__ void generic_op_vec(Span<T> output, View<T> input, FunctorArgs ...functorArgs) {
+    template <class T, class ActivationOp, std::size_t N>
+    __global__ void generic_op_vec(Span<T> output, View<T> input, const typename ActivationOp::Params params) {
         using vector_type = get_vector_type_t<T, N>;
 
         auto output_vPtr = vector_type::get_pointer(output.data());
         auto input_vPtr = vector_type::get_pointer(input.data());
 
-        Functor functor(functorArgs...);
+        ActivationOp activation_op(params);
 
         for (auto i : grid_stride_range(output.size() / vector_type::size())) {
             vector_type vec;
             v_load(vec, input_vPtr[i]);
             for (int j = 0; j < vector_type::size(); j++)
-                vec.data[j] = functor(vec.data[j]);
+                vec.data[j] = activation_op(vec.data[j]);
             v_store(output_vPtr[i], vec);
         }
     }
@@ -51,9 +51,8 @@ namespace raw {
         auto output_vPtr = vector_type::get_pointer(output.data());
         auto input_vPtr = vector_type::get_pointer(input.data());
 
-        inner_size /= vector_type::size();
         for (auto i : grid_stride_range(output.size() / vector_type::size())) {
-            const index_type c = (i / inner_size) % static_cast<size_type>(slope.size());
+            const index_type c = (i / inner_size) % slope.size();
 
             vector_type vec;
             v_load(vec, input_vPtr[i]);
@@ -65,73 +64,73 @@ namespace raw {
 
 } /* namespace raw */
 
-template <class T, template <class> class Activation, std::size_t N, class ...ActivationArgs> static
-void launch_vectorized_generic_op(const Stream& stream, Span<T> output, View<T> input, ActivationArgs ...activationArgs) {
+template <class T, class ActivationOp, std::size_t N> static
+void launch_vectorized_generic_op(const Stream& stream, Span<T> output, View<T> input, const typename ActivationOp::Params& params) {
     CV_Assert(is_fully_aligned<T>(output, N));
     CV_Assert(is_fully_aligned<T>(input, N));
 
-    auto kernel = raw::generic_op_vec<T, Activation<T>, N, ActivationArgs...>;
+    auto kernel = raw::generic_op_vec<T, ActivationOp, N>;
     auto policy = make_policy(kernel, output.size() / N, 0, stream);
-    launch_kernel(kernel, policy, output, input, activationArgs...);
+    launch_kernel(kernel, policy, output, input, params);
 }
 
-template <class T, template <class> class Activation, class ...ActivationArgs> static
-void generic_op(const Stream& stream, Span<T> output, View<T> input, ActivationArgs ...activationArgs) {
+template <class T, class ActivationOp> static
+void generic_op(const Stream& stream, Span<T> output, View<T> input, const typename ActivationOp::Params& params = {}) {
     CV_Assert(input.size() == output.size());
 
     if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
-        launch_vectorized_generic_op<T, Activation, 4>(stream, output, input, activationArgs...);
+        launch_vectorized_generic_op<T, ActivationOp, 4>(stream, output, input, params);
     } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
-        launch_vectorized_generic_op<T, Activation, 2>(stream, output, input, activationArgs...);
+        launch_vectorized_generic_op<T, ActivationOp, 2>(stream, output, input, params);
     } else {
-        launch_vectorized_generic_op<T, Activation, 1>(stream, output, input, activationArgs...);
+        launch_vectorized_generic_op<T, ActivationOp, 1>(stream, output, input, params);
     }
 }
 
 template <class T>
-void abs(const Stream& stream, Span<T> output, View<T> input) {
-    generic_op<T, abs_functor>(stream, output, input);
+void relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
+    generic_op<T, ReLUFunctor<T>>(stream, output, input, {slope});
+}
+
+template <class T>
+void clipped_relu(const Stream& stream, Span<T> output, View<T> input, T floor, T ceiling) {
+    CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
+    generic_op<T, ClippedReLUFunctor<T>>(stream, output, input, {floor, ceiling});
 }
 
 template <class T>
 void tanh(const Stream& stream, Span<T> output, View<T> input) {
-    generic_op<T, tanh_functor>(stream, output, input);
+    generic_op<T, TanHFunctor<T>>(stream, output, input);
 }
 
 template <class T>
 void swish(const Stream& stream, Span<T> output, View<T> input) {
-    generic_op<T, swish_functor>(stream, output, input);
+    generic_op<T, SwishFunctor<T>>(stream, output, input);
 }
 
 template <class T>
 void mish(const Stream& stream, Span<T> output, View<T> input) {
-    generic_op<T, mish_functor>(stream, output, input);
+    generic_op<T, MishFunctor<T>>(stream, output, input);
 }
 
 template <class T>
 void sigmoid(const Stream& stream, Span<T> output, View<T> input) {
-    generic_op<T, sigmoid_functor>(stream, output, input);
-}
-
-template <class T>
-void bnll(const Stream& stream, Span<T> output, View<T> input) {
-    generic_op<T, bnll_functor>(stream, output, input);
+    generic_op<T, SigmoidFunctor<T>>(stream, output, input);
 }
 
 template <class T>
 void elu(const Stream& stream, Span<T> output, View<T> input) {
-    generic_op<T, elu_functor>(stream, output, input);
+    generic_op<T, ELUFunctor<T>>(stream, output, input);
 }
 
 template <class T>
-void relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
-    generic_op<T, relu_functor>(stream, output, input, slope);
+void bnll(const Stream& stream, Span<T> output, View<T> input) {
+    generic_op<T, BNLLFunctor<T>>(stream, output, input);
 }
 
 template <class T>
-void clipped_relu(const Stream& stream, Span<T> output, View<T> input, T floor, T ceiling) {
-    CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
-    generic_op<T, clipped_relu_functor>(stream, output, input, floor, ceiling);
+void abs(const Stream& stream, Span<T> output, View<T> input) {
+    generic_op<T, AbsFunctor<T>>(stream, output, input);
 }
 
 template <class T>
@@ -143,31 +142,32 @@ void power(const Stream& stream, Span<T> output, View<T> input, T exp, T scale,
         return;
     }
 
-    generic_op<T, power_functor>(stream, output, input, exp, scale, shift);
+    generic_op<T, PowerFunctor<T>>(stream, output, input, {exp, scale, shift});
 }
 
 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
-template void abs<__half>(const Stream& stream, Span<__half> output, View<__half> input);
+template void relu<__half>(const Stream&, Span<__half>, View<__half>, __half);
+template void clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
 template void tanh<__half>(const Stream&, Span<__half>, View<__half>);
 template void swish<__half>(const Stream&, Span<__half>, View<__half>);
 template void mish<__half>(const Stream&, Span<__half>, View<__half>);
 template void sigmoid<__half>(const Stream&, Span<__half>, View<__half>);
-template void bnll<__half>(const Stream&, Span<__half>, View<__half>);
 template void elu<__half>(const Stream&, Span<__half>, View<__half>);
-template void relu<__half>(const Stream&, Span<__half>, View<__half>, __half);
-template void clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
+template void abs<__half>(const Stream& stream, Span<__half> output, View<__half> input);
+template void bnll<__half>(const Stream&, Span<__half>, View<__half>);
 template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
 #endif
 
-template void abs<float>(const Stream& stream, Span<float> output, View<float> input);
+
+template void relu<float>(const Stream&, Span<float>, View<float>, float);
+template void clipped_relu<float>(const Stream&, Span<float>, View<float>, float, float);
 template void tanh<float>(const Stream&, Span<float>, View<float>);
 template void swish<float>(const Stream&, Span<float>, View<float>);
 template void mish<float>(const Stream&, Span<float>, View<float>);
 template void sigmoid<float>(const Stream&, Span<float>, View<float>);
-template void bnll<float>(const Stream&, Span<float>, View<float>);
 template void elu<float>(const Stream&, Span<float>, View<float>);
-template void relu<float>(const Stream&, Span<float>, View<float>, float);
-template void clipped_relu<float>(const Stream&, Span<float>, View<float>, float, float);
+template void abs<float>(const Stream& stream, Span<float> output, View<float> input);
+template void bnll<float>(const Stream&, Span<float>, View<float>);
 template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);
 
 template <class T, std::size_t N> static
@@ -178,7 +178,7 @@ void launch_vectorized_axiswise_relu(const Stream& stream, Span<T> output, View<
 
     auto kernel = raw::axiswise_relu_vec<T, N>;
     auto policy = make_policy(kernel, output.size() / N, 0, stream);
-    launch_kernel(kernel, policy, output, input, inner_size, slope);
+    launch_kernel(kernel, policy, output, input, inner_size / N, slope);
 }
 
 template <class T>
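One behavioural detail in this last hunk: axiswise_relu_vec previously divided inner_size by vector_type::size() inside the kernel, and the division now happens once at the launch site (inner_size / N), so every thread receives the inner size already expressed in vector units and the channel index (i / inner_size) % slope.size() stays correct for the vectorised iteration. This reading assumes the dispatch logic only selects a width N that divides inner_size, which the (unshown) alignment and size checks at the call site are expected to guarantee.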
