Skip to content

Commit 762a5c8

Browse files
committed
imgproc: align GaussianBlur/sepFilter2D OpenCL with CPU version
1 parent: 2fed41d; commit: 762a5c8

File tree

7 files changed: +273 -108 lines changed

modules/imgproc/src/filter.dispatch.cpp

Lines changed: 177 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -729,11 +729,12 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
729729
return k.run(2, globalsize, localsize, false);
730730
}
731731

732-
const int shift_bits = 8;
733-
734732
static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
735-
int borderType, int ddepth, bool fast8uc1, bool int_arithm)
733+
int borderType, int ddepth, bool fast8uc1,
734+
bool int_arithm, int shift_bits)
736735
{
736+
CV_Assert(shift_bits == 0 || int_arithm);
737+
737738
int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
738739
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
739740
Size bufSize = buf.size();
@@ -801,8 +802,11 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX
801802
return k.run(2, globalsize, localsize, false);
802803
}
803804

804-
static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm)
805+
static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor,
806+
bool int_arithm, int shift_bits)
805807
{
808+
CV_Assert(shift_bits == 0 || int_arithm);
809+
806810
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
807811
if (dst.depth() == CV_64F && !doubleSupport)
808812
return false;
@@ -821,13 +825,16 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY
821825
globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1];
822826
globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
823827

824-
char cvt[40];
828+
char cvt[2][40];
829+
int floatT = std::max(CV_32F, bdepth);
825830
cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
826-
" -D srcT=%s -D dstT=%s -D convertToDstT=%s"
831+
" -D srcT=%s -D dstT=%s -D convertToFloatT=%s -D floatT=%s -D convertToDstT=%s"
827832
" -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s",
828833
anchor, (int)localsize[0], (int)localsize[1], cn,
829834
ocl::typeToStr(buf_type), ocl::typeToStr(dtype),
830-
ocl::convertTypeStr(bdepth, ddepth, cn, cvt),
835+
ocl::convertTypeStr(bdepth, floatT, cn, cvt[0]),
836+
ocl::typeToStr(CV_MAKETYPE(floatT, cn)),
837+
ocl::convertTypeStr(shift_bits ? floatT : bdepth, ddepth, cn, cvt[1]),
831838
ocl::typeToStr(bdepth), ocl::typeToStr(ddepth),
832839
2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
833840
int_arithm ? " -D INTEGER_ARITHMETIC" : "");
@@ -839,7 +846,7 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY
839846
return false;
840847

841848
k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst),
842-
static_cast<float>(delta));
849+
static_cast<float>(delta * (1u << (2 * shift_bits))));
843850

844851
return k.run(2, globalsize, localsize, false);
845852
}
@@ -848,16 +855,21 @@ const int optimizedSepFilterLocalWidth = 16;
848855
const int optimizedSepFilterLocalHeight = 8;
849856

850857
static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
851-
Mat row_kernel, Mat col_kernel,
852-
double delta, int borderType, int ddepth, int bdepth, bool int_arithm)
858+
const Mat& kernelX_, const Mat& kernelY_,
859+
double delta, int borderType, int ddepth, int bdepth,
860+
bool int_arithm, int shift_bits)
853861
{
854-
Size size = _src.size(), wholeSize;
855-
Point origin;
862+
//CV_Assert(shift_bits == 0 || int_arithm);
863+
864+
const ocl::Device& d = ocl::Device::getDefault();
865+
866+
Size size = _src.size();
856867
int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
857868
esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth),
858869
dtype = CV_MAKE_TYPE(ddepth, cn);
859870
size_t src_step = _src.step(), src_offset = _src.offset();
860-
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
871+
872+
bool doubleSupport = d.doubleFPConfig() > 0;
861873

862874
if (esz == 0 || src_step == 0
863875
|| (src_offset % src_step) % esz != 0
@@ -869,6 +881,13 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
869881
|| borderType == BORDER_REFLECT_101))
870882
return false;
871883

884+
Mat kernelX, kernelY;
885+
kernelX_.convertTo(kernelX, wdepth);
886+
if (kernelX_.data != kernelY_.data)
887+
kernelY_.convertTo(kernelY, wdepth);
888+
else
889+
kernelY = kernelX;
890+
872891
size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight };
873892
size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]};
874893

@@ -879,9 +898,9 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
879898
String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
880899
" -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
881900
" -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s",
882-
(int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2,
883-
ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(),
884-
ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(),
901+
(int)lt2[0], (int)lt2[1], kernelX.cols / 2, kernelY.cols / 2,
902+
ocl::kernelToStr(kernelX, wdepth, "KERNEL_MATRIX_X").c_str(),
903+
ocl::kernelToStr(kernelY, wdepth, "KERNEL_MATRIX_Y").c_str(),
885904
ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
886905
ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
887906
ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
@@ -896,21 +915,30 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
896915
_dst.create(size, dtype);
897916
UMat dst = _dst.getUMat();
898917

899-
int src_offset_x = static_cast<int>((src_offset % src_step) / esz);
900-
int src_offset_y = static_cast<int>(src_offset / src_step);
918+
// TODO Future: emit error on inplace processing
919+
//CV_Assert(src.u != dst.u && "Inplace processing is not allowed with UMat");
920+
if (src.u == dst.u)
921+
{
922+
CV_LOG_ONCE_WARNING(NULL, "sepFilter2D: inplace arguments are not allowed for non-inplace operations. Performance impact warning.");
923+
src = src.clone();
924+
}
901925

926+
Size wholeSize;
927+
Point origin;
902928
src.locateROI(wholeSize, origin);
903929

904-
k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y,
930+
k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, origin.x, origin.y,
905931
wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst),
906-
static_cast<float>(delta));
932+
static_cast<float>(delta * (1u << (2 * shift_bits))));
907933

908934
return k.run(2, gt2, lt2, false);
909935
}
910936

911-
bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
912-
InputArray _kernelX, InputArray _kernelY, Point anchor,
913-
double delta, int borderType )
937+
bool ocl_sepFilter2D(
938+
InputArray _src, OutputArray _dst, int ddepth,
939+
InputArray _kernelX, InputArray _kernelY, Point anchor,
940+
double delta, int borderType
941+
)
914942
{
915943
const ocl::Device & d = ocl::Device::getDefault();
916944
Size imgSize = _src.size();
@@ -934,59 +962,152 @@ bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
934962
if (anchor.y < 0)
935963
anchor.y = kernelY.cols >> 1;
936964

937-
int rtype = getKernelType(kernelX,
938-
kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x));
939-
int ctype = getKernelType(kernelY,
940-
kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y));
941-
942965
int bdepth = CV_32F;
943966
bool int_arithm = false;
944-
if( sdepth == CV_8U && ddepth == CV_8U &&
945-
rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
946-
ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL)
967+
int shift_bits = 0;
968+
969+
while (sdepth == CV_8U && ddepth == CV_8U)
947970
{
948-
if (ocl::Device::getDefault().isIntel())
971+
int bits_ = 8;
972+
if (delta * 256.0f != (float)(int)(delta * 256))
949973
{
950-
for (int i=0; i<kernelX.cols; i++)
951-
kernelX.at<float>(0, i) = (float) cvRound(kernelX.at<float>(0, i) * (1 << shift_bits));
952-
if (kernelX.data != kernelY.data)
953-
for (int i=0; i<kernelX.cols; i++)
954-
kernelY.at<float>(0, i) = (float) cvRound(kernelY.at<float>(0, i) * (1 << shift_bits));
955-
} else
974+
CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact delta can't be applied: delta=" << delta);
975+
break;
976+
}
977+
Mat kernelX_BitExact, kernelY_BitExact;
978+
bool isValidBitExactRowKernel = createBitExactKernel_32S(kernelX, kernelX_BitExact, bits_);
979+
bool isValidBitExactColumnKernel = createBitExactKernel_32S(kernelY, kernelY_BitExact, bits_);
980+
if (!isValidBitExactRowKernel)
981+
{
982+
CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact row-kernel can't be applied: ksize=" << kernelX_BitExact.total());
983+
}
984+
else if (!isValidBitExactColumnKernel)
985+
{
986+
CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact column-kernel can't be applied: ksize=" << kernelY_BitExact.total());
987+
}
988+
else
956989
{
957990
bdepth = CV_32S;
958-
kernelX.convertTo( kernelX, bdepth, 1 << shift_bits );
959-
kernelY.convertTo( kernelY, bdepth, 1 << shift_bits );
991+
shift_bits = bits_;
992+
int_arithm = true;
993+
994+
kernelX = kernelX_BitExact;
995+
kernelY = kernelY_BitExact;
960996
}
961-
int_arithm = true;
997+
break;
962998
}
963999

964-
CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
965-
imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
966-
imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
967-
(!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
968-
anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
969-
OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT
970-
ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
971-
borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true)
1000+
CV_OCL_RUN_(
1001+
kernelY.cols <= 21 && kernelX.cols <= 21 &&
1002+
imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
1003+
imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
1004+
(!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
1005+
anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
1006+
OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT
1007+
ocl_sepFilter2D_SinglePass(
1008+
_src, _dst, kernelX, kernelY, delta,
1009+
borderType & ~BORDER_ISOLATED, ddepth,
1010+
CV_32F, // force FP32 mode
1011+
false, shift_bits
1012+
),
1013+
true
1014+
);
9721015

9731016
UMat src = _src.getUMat();
974-
Size srcWholeSize; Point srcOffset;
975-
src.locateROI(srcWholeSize, srcOffset);
9761017

977-
bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
978-
src.cols % 4 == 0 && src.step % 4 == 0;
1018+
bool fast8uc1 = false;
1019+
if (type == CV_8UC1)
1020+
{
1021+
Size srcWholeSize;
1022+
Point srcOffset;
1023+
src.locateROI(srcWholeSize, srcOffset);
1024+
fast8uc1 = srcOffset.x % 4 == 0 &&
1025+
src.cols % 4 == 0 && src.step % 4 == 0;
1026+
}
1027+
1028+
Size srcSize = src.size();
1029+
Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
1030+
UMat buf(bufSize, CV_MAKETYPE(bdepth, cn));
1031+
if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm, shift_bits))
1032+
return false;
1033+
1034+
_dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
1035+
UMat dst = _dst.getUMat();
1036+
1037+
return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm, shift_bits);
1038+
}
1039+
1040+
bool ocl_sepFilter2D_BitExact(
1041+
InputArray _src, OutputArray _dst, int ddepth,
1042+
const Size& ksize,
1043+
const uint16_t *fkx, const uint16_t *fky,
1044+
Point anchor,
1045+
double delta, int borderType,
1046+
int shift_bits
1047+
)
1048+
{
1049+
const ocl::Device & d = ocl::Device::getDefault();
1050+
Size imgSize = _src.size();
1051+
1052+
int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
1053+
if (cn > 4)
1054+
return false;
1055+
1056+
if (ksize.width % 2 != 1)
1057+
return false;
1058+
if (ksize.height % 2 != 1)
1059+
return false;
1060+
1061+
Mat kernelX(1, ksize.width, CV_16SC1, (void*)fkx);
1062+
Mat kernelY(1, ksize.height, CV_16SC1, (void*)fky);
1063+
1064+
if (ddepth < 0)
1065+
ddepth = sdepth;
1066+
1067+
if (anchor.x < 0)
1068+
anchor.x = kernelX.cols >> 1;
1069+
if (anchor.y < 0)
1070+
anchor.y = kernelY.cols >> 1;
1071+
1072+
int bdepth = sdepth == CV_8U ? CV_32S : CV_32F;
1073+
1074+
CV_OCL_RUN_(
1075+
kernelY.cols <= 21 && kernelX.cols <= 21 &&
1076+
imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
1077+
imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
1078+
(!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
1079+
anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
1080+
OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT
1081+
ocl_sepFilter2D_SinglePass(
1082+
_src, _dst, kernelX, kernelY, delta,
1083+
borderType & ~BORDER_ISOLATED, ddepth, bdepth,
1084+
true, shift_bits
1085+
),
1086+
true
1087+
);
1088+
1089+
UMat src = _src.getUMat();
1090+
1091+
bool fast8uc1 = false;
1092+
if (type == CV_8UC1)
1093+
{
1094+
Size srcWholeSize;
1095+
Point srcOffset;
1096+
src.locateROI(srcWholeSize, srcOffset);
1097+
fast8uc1 = srcOffset.x % 4 == 0 &&
1098+
src.cols % 4 == 0 && src.step % 4 == 0;
1099+
}
9791100

9801101
Size srcSize = src.size();
9811102
Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
9821103
UMat buf(bufSize, CV_MAKETYPE(bdepth, cn));
983-
if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm))
1104+
if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, true, shift_bits))
9841105
return false;
9851106

9861107
_dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
9871108
UMat dst = _dst.getUMat();
9881109

989-
return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm);
1110+
return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, true, shift_bits);
9901111
}
9911112

9921113
#endif
@@ -1444,7 +1565,7 @@ void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth,
14441565
CV_Assert(!_kernelX.empty());
14451566
CV_Assert(!_kernelY.empty());
14461567

1447-
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > _kernelY.total() && (size_t)_src.cols() > _kernelX.total(),
1568+
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() >= _kernelY.total() && (size_t)_src.cols() >= _kernelX.total(),
14481569
ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType))
14491570

14501571
Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat();

modules/imgproc/src/filter.hpp

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -46,13 +46,25 @@
4646
namespace cv
4747
{
4848
#ifdef HAVE_OPENCL
49-
bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
50-
InputArray _kernelX, InputArray _kernelY, Point anchor,
51-
double delta, int borderType );
49+
bool ocl_sepFilter2D(
50+
InputArray _src, OutputArray _dst, int ddepth,
51+
InputArray _kernelX, InputArray _kernelY, Point anchor,
52+
double delta, int borderType
53+
);
54+
55+
bool ocl_sepFilter2D_BitExact(
56+
InputArray _src, OutputArray _dst, int ddepth,
57+
const Size& ksize,
58+
const uint16_t *fkx, const uint16_t *fky,
59+
Point anchor,
60+
double delta, int borderType,
61+
int shift_bits
62+
);
5263
#endif
5364

54-
void preprocess2DKernel(const Mat& kernel, std::vector<Point>& coords, std::vector<uchar>& coeffs);
55-
}
65+
void preprocess2DKernel(const Mat& kernel, std::vector<Point>& coords, std::vector<uchar>& coeffs);
66+
67+
} // namespace
5668

5769
#endif
5870

0 commit comments

Comments
 (0)