diff --git a/perf/perf_skeleton.cpp b/perf/perf_skeleton.cpp index 8f144b8..5c3f1f2 100644 --- a/perf/perf_skeleton.cpp +++ b/perf/perf_skeleton.cpp @@ -77,6 +77,7 @@ PERF_TEST_P(Size_Only, ImageResize, testing::Values(MAT_SIZES)) // Test(s) for the Thinning function // + PERF_TEST_P(Size_Only, Thinning, testing::Values(MAT_SIZES)) { Size sz = GetParam(); @@ -190,3 +191,4 @@ TEST(CompleteColorSpace, ConvertColor_fpt) ASSERT_LT(cv::countNonZero(diff), 7565); // ASSERT_EQ(0, cv::countNonZero(diff)); } + diff --git a/src/convertcolor.cpp b/src/convertcolor.cpp index 6787e23..cb659df 100644 --- a/src/convertcolor.cpp +++ b/src/convertcolor.cpp @@ -84,12 +84,12 @@ void ConvertColor_BGR2GRAY_BT709_simd(const cv::Mat& src, cv::Mat& dst) dst.create(sz, CV_8UC1); #ifdef HAVE_SSE - // __m128i ssse3_blue_indices_0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0); - // __m128i ssse3_blue_indices_1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1); - // __m128i ssse3_blue_indices_2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); - // __m128i ssse3_green_indices_0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1); - // __m128i ssse3_green_indices_1 = _mm_set_epi8(-1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1); - // __m128i ssse3_green_indices_2 = _mm_set_epi8(14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i ssse3_blue_indices_0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0); + __m128i ssse3_blue_indices_1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1); + __m128i ssse3_blue_indices_2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m128i ssse3_green_indices_0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1); + __m128i ssse3_green_indices_1 = _mm_set_epi8(-1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1); + __m128i ssse3_green_indices_2 = _mm_set_epi8(14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i ssse3_red_indices_0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11, 8, 5, 2); __m128i ssse3_red_indices_1 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1, -1, -1, -1, -1, -1); __m128i ssse3_red_indices_2 = _mm_set_epi8(15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); @@ -100,7 +100,7 @@ void ConvertColor_BGR2GRAY_BT709_simd(const cv::Mat& src, cv::Mat& dst) __m128i bias = _mm_set1_epi16(128); __m128i zero = _mm_setzero_si128(); #endif - + int k; for (int y = 0; y < sz.height; y++) { const uchar *psrc = src.ptr(y); @@ -120,23 +120,61 @@ void ConvertColor_BGR2GRAY_BT709_simd(const cv::Mat& src, cv::Mat& dst) _mm_shuffle_epi8(chunk1, ssse3_red_indices_1)), _mm_shuffle_epi8(chunk2, ssse3_red_indices_2)); - /* ??? */ - - __m128i gray_packed; // Initialize it properly + __m128i green = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, ssse3_green_indices_0), + _mm_shuffle_epi8(chunk1, ssse3_green_indices_1)), + _mm_shuffle_epi8(chunk2, ssse3_green_indices_2)); + + __m128i blue = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, ssse3_blue_indices_0), + _mm_shuffle_epi8(chunk1, ssse3_blue_indices_1)), + _mm_shuffle_epi8(chunk2, ssse3_blue_indices_2)); + + __m128i big_red_1 = _mm_unpacklo_epi8(red, zero); + __m128i big_red_2 = _mm_unpackhi_epi8(red, zero); + __m128i big_green_1 = _mm_unpacklo_epi8(green, zero); + __m128i big_green_2 = _mm_unpackhi_epi8(green, zero); + __m128i big_blue_1 = _mm_unpacklo_epi8(blue, zero); + __m128i big_blue_2 = _mm_unpackhi_epi8(blue, zero); + + __m128i red_summand_1 = _mm_mullo_epi16(red_coeff, big_red_1); + __m128i red_summand_2 = _mm_mullo_epi16(red_coeff, big_red_2); + __m128i green_summand_1 = _mm_mullo_epi16(green_coeff, big_green_1); + __m128i green_summand_2 = _mm_mullo_epi16(green_coeff, big_green_2); + __m128i blue_summand_1 = _mm_mullo_epi16(blue_coeff, big_blue_1); + __m128i blue_summand_2 = _mm_mullo_epi16(blue_coeff, big_blue_2); + + __m128i first_oper_1 = _mm_add_epi16(zero, red_summand_1); + __m128i second_oper_1 = _mm_add_epi16(first_oper_1, green_summand_1); + __m128i third_oper_1 = _mm_add_epi16(second_oper_1, blue_summand_1); + __m128i first_oper_2 = _mm_add_epi16(zero, red_summand_2); + __m128i second_oper_2 = _mm_add_epi16(first_oper_2, green_summand_2); + __m128i third_oper_2 = _mm_add_epi16(second_oper_2, blue_summand_2); + + __m128i gray_packed_1 = _mm_add_epi16(third_oper_1, bias); + __m128i gray_packed_2 = _mm_add_epi16(third_oper_2, bias); + + __m128i gray_packed_shift_1 = _mm_srli_epi16(gray_packed_1, 8); + __m128i gray_packed_shift_2 = _mm_srli_epi16(gray_packed_2, 8); + + __m128i gray_packed = _mm_packus_epi16(gray_packed_shift_1, gray_packed_shift_2); _mm_storeu_si128((__m128i*)(pdst + x), gray_packed); } #endif // Process leftover pixels - for (; x < sz.width; x++) + int shift = 16; + + short rw = (short)(0.2126 * (1 << shift) + 0.5); + short gw = (short)(0.7152 * (1 << shift) + 0.5); + short bw = (short)(0.0722 * (1 << shift) + 0.5); + + for (; x < sz.width; x++) { - float color = 0.2126 * psrc[3 * x + 2] + 0.7152 * psrc[3 * x + 1] + 0.0722 * psrc[3 * x]; - pdst[x] = (int)(color + 0.5); + pdst[x] = (rw * psrc[3 * x + 2] + gw * psrc[3 * x + 1] + bw * psrc[3 * x] + (1<<(shift-1))) >> shift; } } // ! Remove this before writing your optimizations ! - ConvertColor_BGR2GRAY_BT709_fpt(src, dst); + // ConvertColor_BGR2GRAY_BT709_fpt(src, dst); // ! Remove this before writing your optimizations ! } diff --git a/src/resize.cpp b/src/resize.cpp index 67283a0..830f68e 100644 --- a/src/resize.cpp +++ b/src/resize.cpp @@ -39,7 +39,8 @@ void ImageResize(const cv::Mat &src, cv::Mat &dst, const cv::Size sz) ( (x1 == x2) ? (int)(q11 * (y2 - y) + q22 * (y - y1)) : ( (y1 == y2) ? (int)(q11 * (x2 - x) + q22 * (x - x1)) : (int)(q11 * (x2 - x) * (y2 - y) + q21 * (x - x1) * (y2 - y) + q12 * (x2 - x) * (y - y1) + q22 * (x - x1) * (y - y1)))); - ptr_dst[col] = (temp < 0) ? 0 : ((temp > 255) ? 255 : (uchar)temp); + + ptr_dst[col] = (temp < 0) ? 0 : ((temp > 255) ? 255 : (uchar)temp); } } } @@ -50,11 +51,13 @@ void ImageResize_optimized(const cv::Mat &src, cv::Mat &dst, const cv::Size sz) cv::Size sz_src = src.size(); dst.create(sz, src.type()); - const int src_rows = src.rows; - const int src_cols = src.cols; + int src_rows = src.rows; + int src_cols = src.cols; - const int dst_rows = sz.height; - const int dst_cols = sz.width; + int dst_rows = sz.height; + int dst_cols = sz.width; + float xscale = (float)sz_src.width / sz.width; + float yscale = (float)sz_src.height / sz.height; for (int row = 0; row < dst_rows; row++) { @@ -62,27 +65,28 @@ void ImageResize_optimized(const cv::Mat &src, cv::Mat &dst, const cv::Size sz) for (int col = 0; col < dst_cols; col++) { - const float x = (((float)col) + .5f) * sz_src.width / sz.width - .5f; - const float y = (((float)row) + .5f) * sz_src.height / sz.height - .5f; + float x = (((float)col) + .5f) * xscale - .5f; + float y = (((float)row) + .5f) * yscale - .5f; - const int ix = (int)floor(x); - const int iy = (int)floor(y); + int ix = (x > 0) ? (int)x : (int)floor(x); + int iy = (y > 0) ? (int)y : (int)floor(y); - const int x1 = (ix < 0) ? 0 : ((ix >= src_cols) ? src_cols - 1 : ix); - const int x2 = (ix < 0) ? 0 : ((ix >= src_cols - 1) ? src_cols - 1 : ix + 1); - const int y1 = (iy < 0) ? 0 : ((iy >= src_rows) ? src_rows - 1 : iy); - const int y2 = (iy < 0) ? 0 : ((iy >= src_rows - 1) ? src_rows - 1 : iy + 1); + int x1 = ix; + int x2 = (ix >= src_cols - 1) ? src_cols - 1 : ix + 1; + int y1 = iy; + int y2 = (iy >= src_rows - 1) ? src_rows - 1 : iy + 1; - const uchar q11 = src.at(y1, x1); - const uchar q12 = src.at(y2, x1); - const uchar q21 = src.at(y1, x2); - const uchar q22 = src.at(y2, x2); + int q11 = src.at(y1, x1); + int q12 = src.at(y2, x1); + int q21 = src.at(y1, x2); + int q22 = src.at(y2, x2); - const int temp = ((x1 == x2) && (y1 == y2)) ? (int)q11 : - ( (x1 == x2) ? (int)(q11 * (y2 - y) + q22 * (y - y1)) : - ( (y1 == y2) ? (int)(q11 * (x2 - x) + q22 * (x - x1)) : - (int)(q11 * (x2 - x) * (y2 - y) + q21 * (x - x1) * (y2 - y) + q12 * (x2 - x) * (y - y1) + q22 * (x - x1) * (y - y1)))); - ptr_dst[col] = (temp < 0) ? 0 : ((temp > 255) ? 255 : (uchar)temp); + int temp = ((x1 == x2) && (y1 == y2)) ? q11 : + ( (x1 == x2) ? (q11 * (y2 - y) + q22 * (y - y1)) : + ( (y1 == y2) ? (q11 * (x2 - x) + q22 * (x - x1)) : + (q11 * (x2 - x) * (y2 - y) + q21 * (x - x1) * (y2 - y) + q12 * (x2 - x) * (y - y1) + q22 * (x - x1) * (y - y1)))); + + ptr_dst[col] = (uchar)temp; } } } diff --git a/src/thinning.cpp b/src/thinning.cpp index f4552f5..7ab5cf7 100644 --- a/src/thinning.cpp +++ b/src/thinning.cpp @@ -60,7 +60,7 @@ void GuoHallThinning(const cv::Mat& src, cv::Mat& dst) static void GuoHallIteration_optimized(cv::Mat& im, int iter) { - cv::Mat marker = cv::Mat::zeros(im.size(), CV_8UC1); + cv::Mat marker = cv::Mat::zeros(im.size(), CV_8UC1); for (int i = 1; i < im.rows-1; i++) { @@ -99,6 +99,7 @@ void GuoHallThinning_optimized(const cv::Mat& src, cv::Mat& dst) cv::Mat prev = cv::Mat::zeros(src.size(), CV_8UC1); cv::Mat diff; + do { GuoHallIteration_optimized(dst, 0);