Merge pull request #3843 from vrabaud:cuda

asmorkalov · web-flow · commit 8e6285a79b3b · 2024-12-16T11:36:15.000+03:00
Get cudalegacy to compile with clang CUDA and without CUDA
diff --git a/modules/cudalegacy/include/opencv2/cudalegacy.hpp b/modules/cudalegacy/include/opencv2/cudalegacy.hpp
@@ -44,11 +44,13 @@
 #define OPENCV_CUDALEGACY_HPP
 
 #include "opencv2/core/cuda.hpp"
+#if defined (HAVE_CUDA) && !defined (CUDA_DISABLER)
 #include "opencv2/cudalegacy/NCV.hpp"
 #include "opencv2/cudalegacy/NPP_staging.hpp"
 #include "opencv2/cudalegacy/NCVPyramid.hpp"
 #include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"
 #include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp"
+#endif
 #include "opencv2/video/background_segm.hpp"
 
 /**
diff --git a/modules/cudalegacy/src/NCV.cpp b/modules/cudalegacy/src/NCV.cpp
@@ -42,6 +42,8 @@
 
 #include "precomp.hpp"
 
+#if defined (HAVE_CUDA) && !defined (CUDA_DISABLER)
+
 //==============================================================================
 //
 // Error handling helpers
@@ -886,3 +888,4 @@ NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
 {
     return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
 }
+#endif
diff --git a/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu b/modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu
@@ -695,8 +695,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
     //prepare image pyramid
     ImagePyramid pyr(desc.number_of_outer_iterations);
 
-    cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
-
     float scale = 1.0f;
 
     //cuda arrays for frames
diff --git a/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu b/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu
@@ -193,7 +193,7 @@ __global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr<Ncv32u>
             if (tbDoAtomicCompaction) bInactiveThread = true; else return;
         }
 
-        if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread)
+        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
         {
             outMaskVal = d_inMask[maskOffset];
             y_offs = outMaskVal >> 16;
@@ -210,7 +210,7 @@ __global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr<Ncv32u>
             if (tbDoAtomicCompaction) bInactiveThread = true; else return;
         }
 
-        if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread)
+        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
         {
             maskOffset = y_offs * mask2Dstride + x_offs;
 
diff --git a/modules/cudalegacy/src/cuda/NCVPixelOperations.hpp b/modules/cudalegacy/src/cuda/NCVPixelOperations.hpp
@@ -84,7 +84,7 @@ template<> struct TConvVec2Base<double1> {typedef Ncv64f TBase;};
 template<> struct TConvVec2Base<double3> {typedef Ncv64f TBase;};
 template<> struct TConvVec2Base<double4> {typedef Ncv64f TBase;};
 
-#define NC(T)       (sizeof(T) / sizeof(TConvVec2Base<T>::TBase))
+#define NC(T)       (sizeof(T) / sizeof(typename TConvVec2Base<T>::TBase))
 
 template<typename TBase, Ncv32u NC> struct TConvBase2Vec;
 template<> struct TConvBase2Vec<Ncv8u, 1>  {typedef uchar1 TVec;};
@@ -115,7 +115,7 @@ template<typename Tin> inline __host__ __device__ void _TDemoteClampNN(Tin &a, N
 template<typename Tin> inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a+0.5f, 0, UINT_MAX);}
 template<typename Tin> inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32f &out) {out = (Ncv32f)a;}
 
-template<typename Tout> inline Tout _pixMakeZero();
+template<typename Tout> inline __host__ __device__ Tout _pixMakeZero();
 template<> inline __host__ __device__ uchar1 _pixMakeZero<uchar1>() {return make_uchar1(0);}
 template<> inline __host__ __device__ uchar3 _pixMakeZero<uchar3>() {return make_uchar3(0,0,0);}
 template<> inline __host__ __device__ uchar4 _pixMakeZero<uchar4>() {return make_uchar4(0,0,0,0);}
diff --git a/modules/cudalegacy/src/cuda/NPP_staging.cu b/modules/cudalegacy/src/cuda/NPP_staging.cu
@@ -85,26 +85,24 @@ const Ncv32u NUM_SCAN_THREADS = 256;
 const Ncv32u LOG2_NUM_SCAN_THREADS = 8;
 
 
-template<class T_in, class T_out>
+template<class T_in, class T_out, bool tbDoSqr>
 struct _scanElemOp
 {
-    template<bool tbDoSqr>
-    static inline __host__ __device__ T_out scanElemOp(T_in elem)
-    {
-        return scanElemOp( elem, Int2Type<(int)tbDoSqr>() );
-    }
-
-private:
-
-    template <int v> struct Int2Type { enum { value = v }; };
+    static __host__ __device__ T_out scanElemOp(T_in elem);
+};
 
-    static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<0>)
-    {
-        return (T_out)elem;
+template<class T_in, class T_out>
+struct _scanElemOp<T_in, T_out, false>
+{
+    static inline __host__ __device__ T_out scanElemOp(T_in elem) {
+        return (T_out)(elem);
     }
+};
 
-    static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<1>)
-    {
+template<class T_in, class T_out>
+struct _scanElemOp<T_in, T_out, true>
+{
+    static inline __host__ __device__ T_out scanElemOp(T_in elem) {
         return (T_out)(elem*elem);
     }
 };
@@ -177,15 +175,15 @@ __global__ void scanRows(cv::cudev::TexturePtr<Ncv8u> tex8u, T_in *d_src, Ncv32u
         Ncv32u curElemOffs = offsetX + threadIdx.x;
         T_out curScanElem;
 
-        T_in curElem;
+        T_in curElem = 0;
         T_out curElemMod;
 
         if (curElemOffs < srcWidth)
         {
             //load elements
             curElem = readElem<T_in>(tex8u, d_src, texOffs, srcStride, curElemOffs);
         }
-        curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
+        curElemMod = _scanElemOp<T_in, T_out, tbDoSqr>::scanElemOp(curElem);
 
         //inclusive scan
         curScanElem = cv::cudev::blockScanInclusive<NUM_SCAN_THREADS>(curElemMod, shmem, threadIdx.x);
diff --git a/modules/cudalegacy/src/cuda/needle_map.cu b/modules/cudalegacy/src/cuda/needle_map.cu
@@ -76,19 +76,19 @@ namespace cv { namespace cuda { namespace device
                 // now add the column sums
                 const uint X = threadIdx.x;
 
-                if (X | 0xfe == 0xfe)  // bit 0 is 0
+                if (X | (0xfe == 0xfe))  // bit 0 is 0
                 {
                     u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 1];
                     v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1];
                 }
 
-                if (X | 0xfe == 0xfc) // bits 0 & 1 == 0
+                if (X | (0xfe == 0xfc)) // bits 0 & 1 == 0
                 {
                     u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2];
                     v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2];
                 }
 
-                if (X | 0xf8 == 0xf8)
+                if (X | (0xf8 == 0xf8))
                 {
                     u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 4];
                     v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 4];
diff --git a/modules/cudalegacy/src/precomp.hpp b/modules/cudalegacy/src/precomp.hpp
@@ -80,6 +80,8 @@
 #endif
 
 #include "opencv2/core/private.cuda.hpp"
+#if defined (HAVE_CUDA) && !defined (CUDA_DISABLER)
 #include "opencv2/cudalegacy/private.hpp"
+#endif
 
 #endif /* __OPENCV_PRECOMP_H__ */

Original file line number	Diff line number	Diff line change
`@@ -42,6 +42,8 @@`
`42`	`42`
`43`	`43`	`#include "precomp.hpp"`
`44`	`44`
	`45`	`+#if defined (HAVE_CUDA) && !defined (CUDA_DISABLER)`
	`46`	`+`
`45`	`47`	`//==============================================================================`
`46`	`48`	`//`
`47`	`49`	`// Error handling helpers`
`@@ -886,3 +888,4 @@ NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,`
`886`	`888`	`{`
`887`	`889`	`return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);`
`888`	`890`	`}`
	`891`	`+#endif`
Original file line number	Diff line number	Diff line change
`@@ -193,7 +193,7 @@ __global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr<Ncv32u>`
`193`	`193`	`if (tbDoAtomicCompaction) bInactiveThread = true; else return;`
`194`	`194`	`}`
`195`	`195`
`196`		`- if (!tbDoAtomicCompaction \|\| tbDoAtomicCompaction && !bInactiveThread)`
	`196`	`+ if (!tbDoAtomicCompaction \|\| (tbDoAtomicCompaction && !bInactiveThread))`
`197`	`197`	`{`
`198`	`198`	`outMaskVal = d_inMask[maskOffset];`
`199`	`199`	`y_offs = outMaskVal >> 16;`
`@@ -210,7 +210,7 @@ __global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr<Ncv32u>`
`210`	`210`	`if (tbDoAtomicCompaction) bInactiveThread = true; else return;`
`211`	`211`	`}`
`212`	`212`
`213`		`- if (!tbDoAtomicCompaction \|\| tbDoAtomicCompaction && !bInactiveThread)`
	`213`	`+ if (!tbDoAtomicCompaction \|\| (tbDoAtomicCompaction && !bInactiveThread))`
`214`	`214`	`{`
`215`	`215`	`maskOffset = y_offs * mask2Dstride + x_offs;`
`216`	`216`
Original file line number	Diff line number	Diff line change
`@@ -76,19 +76,19 @@ namespace cv { namespace cuda { namespace device`
`76`	`76`	`// now add the column sums`
`77`	`77`	`const uint X = threadIdx.x;`
`78`	`78`
`79`		`- if (X \| 0xfe == 0xfe) // bit 0 is 0`
	`79`	`+ if (X \| (0xfe == 0xfe)) // bit 0 is 0`
`80`	`80`	`{`
`81`	`81`	`u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 1];`
`82`	`82`	`v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1];`
`83`	`83`	`}`
`84`	`84`
`85`		`- if (X \| 0xfe == 0xfc) // bits 0 & 1 == 0`
	`85`	`+ if (X \| (0xfe == 0xfc)) // bits 0 & 1 == 0`
`86`	`86`	`{`
`87`	`87`	`u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2];`
`88`	`88`	`v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2];`
`89`	`89`	`}`
`90`	`90`
`91`		`- if (X \| 0xf8 == 0xf8)`
	`91`	`+ if (X \| (0xf8 == 0xf8))`
`92`	`92`	`{`
`93`	`93`	`u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 4];`
`94`	`94`	`v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 4];`