@@ -90,28 +90,27 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
     const uchar* src = (const uchar*)_src;
     uchar* dst = (uchar*)_dst;
     int x = 0;
-#if CV_SSE4_2
-    if(USE_SSE4_2)//
-    {
-        __m128i zero = _mm_setzero_si128();
-
-        for( ; x <= size.width - 16; x += 16 )
-        {
-            const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x));
-            __m128i _mask = _mm_lddqu_si128((const __m128i*)(mask+x));
-            __m128i rDst = _mm_lddqu_si128((__m128i*)(dst+x));
-            __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
-            rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
-            _mm_storeu_si128((__m128i*)(dst + x), rDst);
-        }
-    }
-#elif CV_NEON
-    uint8x16_t v_one = vdupq_n_u8(1);
-    for( ; x <= size.width - 16; x += 16 )
-    {
-        uint8x16_t v_mask = vcgeq_u8(vld1q_u8(mask + x), v_one);
-        uint8x16_t v_dst = vld1q_u8(dst + x), v_src = vld1q_u8(src + x);
-        vst1q_u8(dst + x, vbslq_u8(v_mask, v_src, v_dst));
+#if CV_SIMD128
+    if( hasSIMD128()
+#if CV_SSE4_2
+        && USE_SSE4_2
+#endif
+        ) {
+        v_uint8x16 v_zero = v_setzero_u8();
+
+        for( ; x <= size.width - 16; x += 16 )
+        {
+            v_uint8x16 v_src = v_load(src + x),
+                       v_dst = v_load(dst + x),
+                       v_nmask = v_load(mask + x) == v_zero;
+
+#if CV_SSE4_2
+            v_dst = v_uint8x16(_mm_blendv_epi8(v_src.val, v_dst.val, v_nmask.val));
+#else
+            v_dst = v_select(v_nmask, v_dst, v_src);
+#endif
+            v_store(dst + x, v_dst);
+        }
     }
 #endif
     for( ; x < size.width; x++ )
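
For reference, here is a minimal standalone sketch of the same 8-bit masked copy using the universal intrinsics targeted by this hunk. It is not part of the patch; `copy_masked_u8` is a hypothetical helper name, and it assumes the OpenCV 3.x `hal/intrin.hpp` API (`v_load`, `v_select`, vector `operator==`):

```cpp
// Minimal sketch of the 8-bit masked copy shown in the hunk above.
// Not part of the patch; copy_masked_u8 is a hypothetical helper name.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

static void copy_masked_u8(const uchar* src, const uchar* mask, uchar* dst, int width)
{
    int x = 0;
#if CV_SIMD128
    v_uint8x16 v_zero = v_setzero_u8();
    for( ; x <= width - 16; x += 16 )
    {
        v_uint8x16 v_src   = v_load(src + x),
                   v_dst   = v_load(dst + x),
                   v_nmask = v_load(mask + x) == v_zero; // all-ones where mask == 0
        // keep dst where the mask is zero, copy src everywhere else
        v_store(dst + x, v_select(v_nmask, v_dst, v_src));
    }
#endif
    for( ; x < width; x++ )  // scalar tail for the remaining elements
        if( mask[x] )
            dst[x] = src[x];
}
```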
@@ -130,31 +129,33 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
     const ushort* src = (const ushort*)_src;
     ushort* dst = (ushort*)_dst;
     int x = 0;
-#if CV_SSE4_2
-    if(USE_SSE4_2)//
-    {
-        __m128i zero = _mm_setzero_si128();
-        for( ; x <= size.width - 8; x += 8 )
+#if CV_SIMD128
+    if( hasSIMD128()
+#if CV_SSE4_2
+        && USE_SSE4_2
+#endif
+        ) {
+        v_uint8x16 v_zero = v_setzero_u8();
+
+        for( ; x <= size.width - 16; x += 16 )
         {
-            const __m128i rSrc =_mm_lddqu_si128((const __m128i*)(src+x));
-            __m128i _mask = _mm_loadl_epi64((const __m128i*)(mask+x));
-            _mask = _mm_unpacklo_epi8(_mask, _mask);
-            __m128i rDst = _mm_lddqu_si128((const __m128i*)(dst+x));
-            __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
-            rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
-            _mm_storeu_si128((__m128i*)(dst + x), rDst);
-        }
-    }
-#elif CV_NEON
-    uint8x8_t v_one = vdup_n_u8(1);
-    for( ; x <= size.width - 8; x += 8 )
-    {
-        uint8x8_t v_mask = vcge_u8(vld1_u8(mask + x), v_one);
-        uint8x8x2_t v_mask2 = vzip_u8(v_mask, v_mask);
-        uint16x8_t v_mask_res = vreinterpretq_u16_u8(vcombine_u8(v_mask2.val[0], v_mask2.val[1]));
-
-        uint16x8_t v_src = vld1q_u16(src + x), v_dst = vld1q_u16(dst + x);
-        vst1q_u16(dst + x, vbslq_u16(v_mask_res, v_src, v_dst));
+            v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + 8),
+                       v_dst1 = v_load(dst + x), v_dst2 = v_load(dst + x + 8);
+
+            v_uint8x16 v_nmask1, v_nmask2;
+            v_uint8x16 v_nmask = v_load(mask + x) == v_zero;
+            v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);
+
+#if CV_SSE4_2
+            v_dst1 = v_uint16x8(_mm_blendv_epi8(v_src1.val, v_dst1.val, v_nmask1.val));
+            v_dst2 = v_uint16x8(_mm_blendv_epi8(v_src2.val, v_dst2.val, v_nmask2.val));
+#else
+            v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
+            v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
+#endif
+            v_store(dst + x, v_dst1);
+            v_store(dst + x + 8, v_dst2);
+        }
     }
 #endif
     for( ; x < size.width; x++ )
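
Likewise, a sketch (not from the patch; `copy_masked_u16` is a hypothetical name) of the 16-bit variant, which widens the 16 mask bytes into two 8-lane 16-bit masks with `v_zip` before the per-lane select:

```cpp
// Minimal sketch of the 16-bit masked copy shown in the hunk above.
// Not part of the patch; copy_masked_u16 is a hypothetical helper name.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

static void copy_masked_u16(const ushort* src, const uchar* mask, ushort* dst, int width)
{
    int x = 0;
#if CV_SIMD128
    v_uint8x16 v_zero = v_setzero_u8();
    for( ; x <= width - 16; x += 16 )
    {
        v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + 8),
                   v_dst1 = v_load(dst + x), v_dst2 = v_load(dst + x + 8);

        // 16 mask bytes -> two vectors of 8 x 16-bit lanes by duplicating each byte
        v_uint8x16 v_nmask = v_load(mask + x) == v_zero, v_nmask1, v_nmask2;
        v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);

        v_store(dst + x,     v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1));
        v_store(dst + x + 8, v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2));
    }
#endif
    for( ; x < width; x++ )  // scalar tail
        if( mask[x] )
            dst[x] = src[x];
}
```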