@@ -729,11 +729,12 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
729
729
return k.run (2 , globalsize, localsize, false );
730
730
}
731
731
732
- const int shift_bits = 8 ;
733
-
734
732
static bool ocl_sepRowFilter2D (const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
735
- int borderType, int ddepth, bool fast8uc1, bool int_arithm)
733
+ int borderType, int ddepth, bool fast8uc1,
734
+ bool int_arithm, int shift_bits)
736
735
{
736
+ CV_Assert (shift_bits == 0 || int_arithm);
737
+
737
738
int type = src.type (), cn = CV_MAT_CN (type), sdepth = CV_MAT_DEPTH (type);
738
739
bool doubleSupport = ocl::Device::getDefault ().doubleFPConfig () > 0 ;
739
740
Size bufSize = buf.size ();
@@ -801,8 +802,11 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX
801
802
return k.run (2 , globalsize, localsize, false );
802
803
}
803
804
804
- static bool ocl_sepColFilter2D (const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm)
805
+ static bool ocl_sepColFilter2D (const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor,
806
+ bool int_arithm, int shift_bits)
805
807
{
808
+ CV_Assert (shift_bits == 0 || int_arithm);
809
+
806
810
bool doubleSupport = ocl::Device::getDefault ().doubleFPConfig () > 0 ;
807
811
if (dst.depth () == CV_64F && !doubleSupport)
808
812
return false ;
@@ -821,13 +825,16 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY
821
825
globalsize[1 ] = DIVUP (sz.height , localsize[1 ]) * localsize[1 ];
822
826
globalsize[0 ] = DIVUP (sz.width , localsize[0 ]) * localsize[0 ];
823
827
824
- char cvt[40 ];
828
+ char cvt[2 ][40 ];
829
+ int floatT = std::max (CV_32F, bdepth);
825
830
cv::String build_options = cv::format (" -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
826
- " -D srcT=%s -D dstT=%s -D convertToDstT=%s"
831
+ " -D srcT=%s -D dstT=%s -D convertToFloatT=%s -D floatT=%s -D convertToDstT=%s"
827
832
" -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s" ,
828
833
anchor, (int )localsize[0 ], (int )localsize[1 ], cn,
829
834
ocl::typeToStr (buf_type), ocl::typeToStr (dtype),
830
- ocl::convertTypeStr (bdepth, ddepth, cn, cvt),
835
+ ocl::convertTypeStr (bdepth, floatT, cn, cvt[0 ]),
836
+ ocl::typeToStr (CV_MAKETYPE (floatT, cn)),
837
+ ocl::convertTypeStr (shift_bits ? floatT : bdepth, ddepth, cn, cvt[1 ]),
831
838
ocl::typeToStr (bdepth), ocl::typeToStr (ddepth),
832
839
2 *shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : " " ,
833
840
int_arithm ? " -D INTEGER_ARITHMETIC" : " " );
@@ -839,7 +846,7 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY
839
846
return false ;
840
847
841
848
k.args (ocl::KernelArg::ReadOnly (buf), ocl::KernelArg::WriteOnly (dst),
842
- static_cast <float >(delta));
849
+ static_cast <float >(delta * ( 1u << ( 2 * shift_bits)) ));
843
850
844
851
return k.run (2 , globalsize, localsize, false );
845
852
}
@@ -848,16 +855,21 @@ const int optimizedSepFilterLocalWidth = 16;
848
855
const int optimizedSepFilterLocalHeight = 8 ;
849
856
850
857
static bool ocl_sepFilter2D_SinglePass (InputArray _src, OutputArray _dst,
851
- Mat row_kernel, Mat col_kernel,
852
- double delta, int borderType, int ddepth, int bdepth, bool int_arithm)
858
+ const Mat& kernelX_, const Mat& kernelY_,
859
+ double delta, int borderType, int ddepth, int bdepth,
860
+ bool int_arithm, int shift_bits)
853
861
{
854
- Size size = _src.size (), wholeSize;
855
- Point origin;
862
+ // CV_Assert(shift_bits == 0 || int_arithm);
863
+
864
+ const ocl::Device& d = ocl::Device::getDefault ();
865
+
866
+ Size size = _src.size ();
856
867
int stype = _src.type (), sdepth = CV_MAT_DEPTH (stype), cn = CV_MAT_CN (stype),
857
868
esz = CV_ELEM_SIZE (stype), wdepth = std::max (std::max (sdepth, ddepth), bdepth),
858
869
dtype = CV_MAKE_TYPE (ddepth, cn);
859
870
size_t src_step = _src.step (), src_offset = _src.offset ();
860
- bool doubleSupport = ocl::Device::getDefault ().doubleFPConfig () > 0 ;
871
+
872
+ bool doubleSupport = d.doubleFPConfig () > 0 ;
861
873
862
874
if (esz == 0 || src_step == 0
863
875
|| (src_offset % src_step) % esz != 0
@@ -869,6 +881,13 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
869
881
|| borderType == BORDER_REFLECT_101))
870
882
return false ;
871
883
884
+ Mat kernelX, kernelY;
885
+ kernelX_.convertTo (kernelX, wdepth);
886
+ if (kernelX_.data != kernelY_.data )
887
+ kernelY_.convertTo (kernelY, wdepth);
888
+ else
889
+ kernelY = kernelX;
890
+
872
891
size_t lt2[2 ] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight };
873
892
size_t gt2[2 ] = { lt2[0 ] * (1 + (size.width - 1 ) / lt2[0 ]), lt2[1 ]};
874
893
@@ -879,9 +898,9 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
879
898
String opts = cv::format (" -D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
880
899
" -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
881
900
" -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s" ,
882
- (int )lt2[0 ], (int )lt2[1 ], row_kernel .cols / 2 , col_kernel .cols / 2 ,
883
- ocl::kernelToStr (row_kernel , wdepth, " KERNEL_MATRIX_X" ).c_str (),
884
- ocl::kernelToStr (col_kernel , wdepth, " KERNEL_MATRIX_Y" ).c_str (),
901
+ (int )lt2[0 ], (int )lt2[1 ], kernelX .cols / 2 , kernelY .cols / 2 ,
902
+ ocl::kernelToStr (kernelX , wdepth, " KERNEL_MATRIX_X" ).c_str (),
903
+ ocl::kernelToStr (kernelY , wdepth, " KERNEL_MATRIX_Y" ).c_str (),
885
904
ocl::typeToStr (stype), ocl::convertTypeStr (sdepth, wdepth, cn, cvt[0 ]),
886
905
ocl::typeToStr (CV_MAKE_TYPE (wdepth, cn)), ocl::typeToStr (dtype),
887
906
ocl::convertTypeStr (wdepth, ddepth, cn, cvt[1 ]), borderMap[borderType],
@@ -896,21 +915,30 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
896
915
_dst.create (size, dtype);
897
916
UMat dst = _dst.getUMat ();
898
917
899
- int src_offset_x = static_cast <int >((src_offset % src_step) / esz);
900
- int src_offset_y = static_cast <int >(src_offset / src_step);
918
+ // TODO Future: emit error on inplace processing
919
+ // CV_Assert(src.u != dst.u && "Inplace processing is not allowed with UMat");
920
+ if (src.u == dst.u )
921
+ {
922
+ CV_LOG_ONCE_WARNING (NULL , " sepFilter2D: inplace arguments are not allowed for non-inplace operations. Performance impact warning." );
923
+ src = src.clone ();
924
+ }
901
925
926
+ Size wholeSize;
927
+ Point origin;
902
928
src.locateROI (wholeSize, origin);
903
929
904
- k.args (ocl::KernelArg::PtrReadOnly (src), (int )src_step, src_offset_x, src_offset_y ,
930
+ k.args (ocl::KernelArg::PtrReadOnly (src), (int )src_step, origin. x , origin. y ,
905
931
wholeSize.height , wholeSize.width , ocl::KernelArg::WriteOnly (dst),
906
- static_cast <float >(delta));
932
+ static_cast <float >(delta * ( 1u << ( 2 * shift_bits)) ));
907
933
908
934
return k.run (2 , gt2, lt2, false );
909
935
}
910
936
911
- bool ocl_sepFilter2D ( InputArray _src, OutputArray _dst, int ddepth,
912
- InputArray _kernelX, InputArray _kernelY, Point anchor,
913
- double delta, int borderType )
937
+ bool ocl_sepFilter2D (
938
+ InputArray _src, OutputArray _dst, int ddepth,
939
+ InputArray _kernelX, InputArray _kernelY, Point anchor,
940
+ double delta, int borderType
941
+ )
914
942
{
915
943
const ocl::Device & d = ocl::Device::getDefault ();
916
944
Size imgSize = _src.size ();
@@ -934,59 +962,152 @@ bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
934
962
if (anchor.y < 0 )
935
963
anchor.y = kernelY.cols >> 1 ;
936
964
937
- int rtype = getKernelType (kernelX,
938
- kernelX.rows == 1 ? Point (anchor.x , 0 ) : Point (0 , anchor.x ));
939
- int ctype = getKernelType (kernelY,
940
- kernelY.rows == 1 ? Point (anchor.y , 0 ) : Point (0 , anchor.y ));
941
-
942
965
int bdepth = CV_32F;
943
966
bool int_arithm = false ;
944
- if ( sdepth == CV_8U && ddepth == CV_8U &&
945
- rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
946
- ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL )
967
+ int shift_bits = 0 ;
968
+
969
+ while (sdepth == CV_8U && ddepth == CV_8U )
947
970
{
948
- if (ocl::Device::getDefault ().isIntel ())
971
+ int bits_ = 8 ;
972
+ if (delta * 256 .0f != (float )(int )(delta * 256 ))
949
973
{
950
- for (int i=0 ; i<kernelX.cols ; i++)
951
- kernelX.at <float >(0 , i) = (float ) cvRound (kernelX.at <float >(0 , i) * (1 << shift_bits));
952
- if (kernelX.data != kernelY.data )
953
- for (int i=0 ; i<kernelX.cols ; i++)
954
- kernelY.at <float >(0 , i) = (float ) cvRound (kernelY.at <float >(0 , i) * (1 << shift_bits));
955
- } else
974
+ CV_LOG_DEBUG (NULL , " ocl_sepFilter2D: bit-exact delta can't be applied: delta=" << delta);
975
+ break ;
976
+ }
977
+ Mat kernelX_BitExact, kernelY_BitExact;
978
+ bool isValidBitExactRowKernel = createBitExactKernel_32S (kernelX, kernelX_BitExact, bits_);
979
+ bool isValidBitExactColumnKernel = createBitExactKernel_32S (kernelY, kernelY_BitExact, bits_);
980
+ if (!isValidBitExactRowKernel)
981
+ {
982
+ CV_LOG_DEBUG (NULL , " ocl_sepFilter2D: bit-exact row-kernel can't be applied: ksize=" << kernelX_BitExact.total ());
983
+ }
984
+ else if (!isValidBitExactColumnKernel)
985
+ {
986
+ CV_LOG_DEBUG (NULL , " ocl_sepFilter2D: bit-exact column-kernel can't be applied: ksize=" << kernelY_BitExact.total ());
987
+ }
988
+ else
956
989
{
957
990
bdepth = CV_32S;
958
- kernelX.convertTo ( kernelX, bdepth, 1 << shift_bits );
959
- kernelY.convertTo ( kernelY, bdepth, 1 << shift_bits );
991
+ shift_bits = bits_;
992
+ int_arithm = true ;
993
+
994
+ kernelX = kernelX_BitExact;
995
+ kernelY = kernelY_BitExact;
960
996
}
961
- int_arithm = true ;
997
+ break ;
962
998
}
963
999
964
- CV_OCL_RUN_ (kernelY.cols <= 21 && kernelX.cols <= 21 &&
965
- imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
966
- imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
967
- (!(borderType & BORDER_ISOLATED) || _src.offset () == 0 ) &&
968
- anchor == Point (kernelX.cols >> 1 , kernelY.cols >> 1 ) &&
969
- OCL_PERFORMANCE_CHECK (d.isIntel ()), // TODO FIXIT
970
- ocl_sepFilter2D_SinglePass (_src, _dst, kernelX, kernelY, delta,
971
- borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true )
1000
+ CV_OCL_RUN_ (
1001
+ kernelY.cols <= 21 && kernelX.cols <= 21 &&
1002
+ imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
1003
+ imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
1004
+ (!(borderType & BORDER_ISOLATED) || _src.offset () == 0 ) &&
1005
+ anchor == Point (kernelX.cols >> 1 , kernelY.cols >> 1 ) &&
1006
+ OCL_PERFORMANCE_CHECK (d.isIntel ()), // TODO FIXIT
1007
+ ocl_sepFilter2D_SinglePass (
1008
+ _src, _dst, kernelX, kernelY, delta,
1009
+ borderType & ~BORDER_ISOLATED, ddepth,
1010
+ CV_32F, // force FP32 mode
1011
+ false , shift_bits
1012
+ ),
1013
+ true
1014
+ );
972
1015
973
1016
UMat src = _src.getUMat ();
974
- Size srcWholeSize; Point srcOffset;
975
- src.locateROI (srcWholeSize, srcOffset);
976
1017
977
- bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
978
- src.cols % 4 == 0 && src.step % 4 == 0 ;
1018
+ bool fast8uc1 = false ;
1019
+ if (type == CV_8UC1)
1020
+ {
1021
+ Size srcWholeSize;
1022
+ Point srcOffset;
1023
+ src.locateROI (srcWholeSize, srcOffset);
1024
+ fast8uc1 = srcOffset.x % 4 == 0 &&
1025
+ src.cols % 4 == 0 && src.step % 4 == 0 ;
1026
+ }
1027
+
1028
+ Size srcSize = src.size ();
1029
+ Size bufSize (srcSize.width , srcSize.height + kernelY.cols - 1 );
1030
+ UMat buf (bufSize, CV_MAKETYPE (bdepth, cn));
1031
+ if (!ocl_sepRowFilter2D (src, buf, kernelX, anchor.x , borderType, ddepth, fast8uc1, int_arithm, shift_bits))
1032
+ return false ;
1033
+
1034
+ _dst.create (srcSize, CV_MAKETYPE (ddepth, cn));
1035
+ UMat dst = _dst.getUMat ();
1036
+
1037
+ return ocl_sepColFilter2D (buf, dst, kernelY, delta, anchor.y , int_arithm, shift_bits);
1038
+ }
1039
+
1040
// OpenCL separable filter driven by pre-quantized (fixed-point) kernel
// coefficients supplied as raw 16-bit arrays instead of InputArray kernels.
//
// Parameters:
//   _src, _dst  - source image and destination (created here as srcSize x ddepth x cn).
//   ddepth      - destination depth; a negative value means "same as the source depth".
//   ksize       - ksize.width coefficients are read from fkx, ksize.height from fky;
//                 both dimensions must be odd.
//   fkx, fky    - row and column kernel coefficients.
//   anchor      - kernel anchor; a negative component is replaced by the kernel centre.
//   delta       - offset forwarded to the single-pass kernel / the column pass.
//   borderType  - border mode, forwarded to the row pass (BORDER_ISOLATED is masked
//                 off for the single-pass variant).
//   shift_bits  - fixed-point shift forwarded to the helpers together with
//                 int_arithm == true.
//
// Returns false when the input is rejected (more than 4 channels, even kernel
// size) or when the row pass fails; otherwise the result of the column pass
// (or whatever CV_OCL_RUN_ returns when the single-pass path is taken).
+ bool ocl_sepFilter2D_BitExact (
1041
+ InputArray _src, OutputArray _dst, int ddepth,
1042
+ const Size& ksize,
1043
+ const uint16_t *fkx, const uint16_t *fky,
1044
+ Point anchor,
1045
+ double delta, int borderType,
1046
+ int shift_bits
1047
+ )
1048
+ {
1049
+ const ocl::Device & d = ocl::Device::getDefault ();
1050
+ Size imgSize = _src.size ();
1051
+
1052
+ int type = _src.type (), sdepth = CV_MAT_DEPTH (type), cn = CV_MAT_CN (type);
1053
// Images with more than 4 channels are not handled by this path.
+ if (cn > 4 )
1054
+ return false ;
1055
+
1056
// Only odd kernel extents are accepted in either direction.
+ if (ksize.width % 2 != 1 )
1057
+ return false ;
1058
+ if (ksize.height % 2 != 1 )
1059
+ return false ;
1060
+
1061
// Both kernels are built as single-row Mats directly over the caller's arrays
// (constness is cast away).
// NOTE(review): the uint16_t data is labelled as signed 16-bit (CV_16SC1);
// presumably the coefficients never exceed the int16 range - confirm with callers.
+ Mat kernelX (1 , ksize.width , CV_16SC1, (void *)fkx);
1062
+ Mat kernelY (1 , ksize.height , CV_16SC1, (void *)fky);
1063
+
1064
+ if (ddepth < 0 )
1065
+ ddepth = sdepth;
1066
+
1067
// kernelY is stored as a row as well, so its length is kernelY.cols.
+ if (anchor.x < 0 )
1068
+ anchor.x = kernelX.cols >> 1 ;
1069
+ if (anchor.y < 0 )
1070
+ anchor.y = kernelY.cols >> 1 ;
1071
+
1072
// Depth of the intermediate buffer between the row and column passes:
// 32-bit integer for 8-bit sources, 32-bit float otherwise.
+ int bdepth = sdepth == CV_8U ? CV_32S : CV_32F;
1073
+
1074
// Attempt the fused single-pass kernel first: kernels of at most 21 taps, an image
// larger than the local work-group tile plus anchor, a centred anchor, and no
// isolated-border ROI with a non-zero offset.
// NOTE(review): CV_OCL_RUN_ is defined elsewhere; presumably it returns the last
// argument (true) from this function when the condition holds and the call
// succeeds - confirm against the macro definition.
+ CV_OCL_RUN_ (
1075
+ kernelY.cols <= 21 && kernelX.cols <= 21 &&
1076
+ imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
1077
+ imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
1078
+ (!(borderType & BORDER_ISOLATED) || _src.offset () == 0 ) &&
1079
+ anchor == Point (kernelX.cols >> 1 , kernelY.cols >> 1 ) &&
1080
+ OCL_PERFORMANCE_CHECK (d.isIntel ()), // TODO FIXIT
1081
+ ocl_sepFilter2D_SinglePass (
1082
+ _src, _dst, kernelX, kernelY, delta,
1083
+ borderType & ~BORDER_ISOLATED, ddepth, bdepth,
1084
+ true , shift_bits
1085
+ ),
1086
+ true
1087
+ );
1088
+
1089
// Two-pass fallback: row filter into an intermediate buffer, then column filter.
+ UMat src = _src.getUMat ();
1090
+
1091
// fast8uc1 is set only for CV_8UC1 sources whose ROI x-offset, width and step
// are all multiples of 4; it is forwarded to the row pass.
+ bool fast8uc1 = false ;
1092
+ if (type == CV_8UC1)
1093
+ {
1094
+ Size srcWholeSize;
1095
+ Point srcOffset;
1096
+ src.locateROI (srcWholeSize, srcOffset);
1097
+ fast8uc1 = srcOffset.x % 4 == 0 &&
1098
+ src.cols % 4 == 0 && src.step % 4 == 0 ;
1099
+ }
979
1100
980
1101
Size srcSize = src.size ();
981
1102
// The intermediate buffer is kernelY.cols - 1 rows taller than the source.
Size bufSize (srcSize.width , srcSize.height + kernelY.cols - 1 );
982
1103
UMat buf (bufSize, CV_MAKETYPE (bdepth, cn));
983
- if (!ocl_sepRowFilter2D (src, buf, kernelX, anchor.x , borderType, ddepth, fast8uc1, int_arithm ))
1104
// Integer arithmetic is always requested on this path (int_arithm == true).
+ if (!ocl_sepRowFilter2D (src, buf, kernelX, anchor.x , borderType, ddepth, fast8uc1, true , shift_bits ))
984
1105
return false ;
985
1106
986
1107
_dst.create (srcSize, CV_MAKETYPE (ddepth, cn));
987
1108
UMat dst = _dst.getUMat ();
988
1109
989
- return ocl_sepColFilter2D (buf, dst, kernelY, delta, anchor.y , int_arithm );
1110
+ return ocl_sepColFilter2D (buf, dst, kernelY, delta, anchor.y , true , shift_bits );
990
1111
}
991
1112
992
1113
#endif
@@ -1444,7 +1565,7 @@ void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth,
1444
1565
CV_Assert (!_kernelX.empty ());
1445
1566
CV_Assert (!_kernelY.empty ());
1446
1567
1447
- CV_OCL_RUN (_dst.isUMat () && _src.dims () <= 2 && (size_t )_src.rows () > _kernelY.total () && (size_t )_src.cols () > _kernelX.total (),
1568
+ CV_OCL_RUN (_dst.isUMat () && _src.dims () <= 2 && (size_t )_src.rows () >= _kernelY.total () && (size_t )_src.cols () >= _kernelX.total (),
1448
1569
ocl_sepFilter2D (_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType))
1449
1570
1450
1571
Mat src = _src.getMat (), kernelX = _kernelX.getMat (), kernelY = _kernelY.getMat ();
0 commit comments