Commit fd0ac96

core: replace raw intrinsics with universal intrinsics in copy.cpp
- use universal intrinsic instead of raw intrinsic
- add performance check for Mat::copyTo/setTo with mask
1 parent 1bc1f3d commit fd0ac96
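
For readers less familiar with OpenCV's universal intrinsics (opencv2/core/hal/intrin.hpp), the sketch below shows the masked-copy pattern this commit switches to: load 16 bytes of source, destination and mask, build an inverted mask by comparing against zero, blend with v_select, and finish with a scalar tail, just as the copyMask_ specializations in the diff below do. This is a minimal, self-contained illustration assuming OpenCV 3.3-era headers; the function name copyWithMask8u is made up for this example and is not part of the library.

#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// Illustrative only: mirrors the uchar copyMask_ pattern from the diff below.
static void copyWithMask8u(const uchar* src, const uchar* mask, uchar* dst, int width)
{
    int x = 0;
#if CV_SIMD128
    if( cv::hasSIMD128() )
    {
        cv::v_uint8x16 v_zero = cv::v_setzero_u8();
        for( ; x <= width - 16; x += 16 )
        {
            cv::v_uint8x16 v_src   = cv::v_load(src + x),
                           v_dst   = cv::v_load(dst + x),
                           v_nmask = cv::v_load(mask + x) == v_zero; // all-ones where mask == 0
            // keep dst where the mask is zero, take src everywhere else
            cv::v_store(dst + x, cv::v_select(v_nmask, v_dst, v_src));
        }
    }
#endif
    for( ; x < width; x++ )   // scalar tail for the remaining pixels
        if( mask[x] )
            dst[x] = src[x];
}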

File tree: 2 files changed, +89 -47 lines

modules/core/perf/perf_mat.cpp (42 additions, 1 deletion)

@@ -57,7 +57,7 @@ PERF_TEST_P(Size_MatType, Mat_Clone,
     Size size = get<0>(GetParam());
     int type = get<1>(GetParam());
     Mat source(size.height, size.width, type);
-    Mat destination(size.height, size.width, type);;
+    Mat destination(size.height, size.width, type);

     declare.in(source, WARMUP_RNG).out(destination);

@@ -97,6 +97,47 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi,
     SANITY_CHECK(destination, 1);
 }

+PERF_TEST_P(Size_MatType, Mat_CopyToWithMask,
+            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
+                             testing::Values(CV_8UC1, CV_8UC2))
+            )
+{
+    const Size_MatType_t params = GetParam();
+    const Size size = get<0>(params);
+    const int type = get<1>(params);
+
+    Mat src(size, type), dst(size, type), mask(size, CV_8UC1);
+    declare.in(src, mask, WARMUP_RNG).out(dst);
+
+    TEST_CYCLE()
+    {
+        src.copyTo(dst, mask);
+    }
+
+    SANITY_CHECK(dst);
+}
+
+PERF_TEST_P(Size_MatType, Mat_SetToWithMask,
+            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
+                             testing::Values(CV_8UC1, CV_8UC2))
+            )
+{
+    const Size_MatType_t params = GetParam();
+    const Size size = get<0>(params);
+    const int type = get<1>(params);
+    const Scalar sc = Scalar::all(27);
+
+    Mat src(size, type), mask(size, CV_8UC1);
+    declare.in(src, mask, WARMUP_RNG).out(src);
+
+    TEST_CYCLE()
+    {
+        src.setTo(sc, mask);
+    }
+
+    SANITY_CHECK(src);
+}
+
 ///////////// Transform ////////////////////////

 PERF_TEST_P(Size_MatType, Mat_Transform,
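
The two new cases above exercise the masked Mat::copyTo and Mat::setTo paths that copy.cpp implements, over TYPICAL_MAT_SIZES for CV_8UC1 and CV_8UC2. As with the other perf tests, they should be runnable through the usual Google Test filter on the core perf binary, e.g. ./opencv_perf_core --gtest_filter=*Mat_CopyToWithMask*:*Mat_SetToWithMask* (binary name and filter syntax assumed from the standard OpenCV perf setup, not stated in this commit).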

modules/core/src/copy.cpp (47 additions, 46 deletions)

@@ -90,28 +90,27 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
         const uchar* src = (const uchar*)_src;
         uchar* dst = (uchar*)_dst;
         int x = 0;
-        #if CV_SSE4_2
-        if(USE_SSE4_2)//
-        {
-            __m128i zero = _mm_setzero_si128 ();
-
-            for( ; x <= size.width - 16; x += 16 )
-            {
-                const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x));
-                __m128i _mask = _mm_lddqu_si128((const __m128i*)(mask+x));
-                __m128i rDst = _mm_lddqu_si128((__m128i*)(dst+x));
-                __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
-                rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
-                _mm_storeu_si128((__m128i*)(dst + x), rDst);
-            }
-        }
-        #elif CV_NEON
-        uint8x16_t v_one = vdupq_n_u8(1);
-        for( ; x <= size.width - 16; x += 16 )
-        {
-            uint8x16_t v_mask = vcgeq_u8(vld1q_u8(mask + x), v_one);
-            uint8x16_t v_dst = vld1q_u8(dst + x), v_src = vld1q_u8(src + x);
-            vst1q_u8(dst + x, vbslq_u8(v_mask, v_src, v_dst));
+        #if CV_SIMD128
+        if( hasSIMD128()
+        #if CV_SSE4_2
+            && USE_SSE4_2
+        #endif
+            ) {
+            v_uint8x16 v_zero = v_setzero_u8();
+
+            for( ; x <= size.width - 16; x += 16 )
+            {
+                v_uint8x16 v_src = v_load(src + x),
+                           v_dst = v_load(dst + x),
+                           v_nmask = v_load(mask + x) == v_zero;
+
+        #if CV_SSE4_2
+                v_dst = v_uint8x16(_mm_blendv_epi8(v_src.val, v_dst.val, v_nmask.val));
+        #else
+                v_dst = v_select(v_nmask, v_dst, v_src);
+        #endif
+                v_store(dst + x, v_dst);
+            }
         }
         #endif
         for( ; x < size.width; x++ )
@@ -130,31 +129,33 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
         const ushort* src = (const ushort*)_src;
         ushort* dst = (ushort*)_dst;
         int x = 0;
-        #if CV_SSE4_2
-        if(USE_SSE4_2)//
-        {
-            __m128i zero = _mm_setzero_si128 ();
-            for( ; x <= size.width - 8; x += 8 )
+        #if CV_SIMD128
+        if( hasSIMD128()
+        #if CV_SSE4_2
+            && USE_SSE4_2
+        #endif
+            ) {
+            v_uint8x16 v_zero = v_setzero_u8();
+
+            for( ; x <= size.width - 16; x += 16 )
             {
-                const __m128i rSrc =_mm_lddqu_si128((const __m128i*)(src+x));
-                __m128i _mask = _mm_loadl_epi64((const __m128i*)(mask+x));
-                _mask = _mm_unpacklo_epi8(_mask, _mask);
-                __m128i rDst = _mm_lddqu_si128((const __m128i*)(dst+x));
-                __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
-                rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
-                _mm_storeu_si128((__m128i*)(dst + x), rDst);
-            }
-        }
-        #elif CV_NEON
-        uint8x8_t v_one = vdup_n_u8(1);
-        for( ; x <= size.width - 8; x += 8 )
-        {
-            uint8x8_t v_mask = vcge_u8(vld1_u8(mask + x), v_one);
-            uint8x8x2_t v_mask2 = vzip_u8(v_mask, v_mask);
-            uint16x8_t v_mask_res = vreinterpretq_u16_u8(vcombine_u8(v_mask2.val[0], v_mask2.val[1]));
-
-            uint16x8_t v_src = vld1q_u16(src + x), v_dst = vld1q_u16(dst + x);
-            vst1q_u16(dst + x, vbslq_u16(v_mask_res, v_src, v_dst));
+                v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + 8),
+                           v_dst1 = v_load(dst + x), v_dst2 = v_load(dst + x + 8);
+
+                v_uint8x16 v_nmask1, v_nmask2;
+                v_uint8x16 v_nmask = v_load(mask + x) == v_zero;
+                v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);
+
+        #if CV_SSE4_2
+                v_dst1 = v_uint16x8(_mm_blendv_epi8(v_src1.val, v_dst1.val, v_nmask1.val));
+                v_dst2 = v_uint16x8(_mm_blendv_epi8(v_src2.val, v_dst2.val, v_nmask2.val));
+        #else
+                v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
+                v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
+        #endif
+                v_store(dst + x, v_dst1);
+                v_store(dst + x + 8, v_dst2);
+            }
         }
         #endif
         for( ; x < size.width; x++ )
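
A note on the structure of the new code: the generic path blends with v_select on the inverted mask (v_nmask is all-ones where mask == 0, so v_select keeps dst there and takes src elsewhere), while the #if CV_SSE4_2 branches keep a raw _mm_blendv_epi8 on the underlying .val registers, presumably because the universal v_select is emulated with bitwise operations on the plain SSE backend. In the ushort specialization the byte mask is widened with v_zip, which duplicates each mask byte into both bytes of a 16-bit lane, replacing the earlier _mm_unpacklo_epi8 / vzip_u8 tricks.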
