Commit 8931c68

Merge pull request opencv#17707 from Yosshi999:gsoc_sift-universal-intrinsic
2 parents: f78f908 + 920c180


modules/features2d/src/sift.simd.hpp

Lines changed: 122 additions & 158 deletions
@@ -167,9 +167,23 @@ float calcOrientationHist(
     int i, j, k, len = (radius*2+1)*(radius*2+1);
 
     float expf_scale = -1.f/(2.f * sigma * sigma);
+#if CV_SIMD
+    AutoBuffer<float> bufX(len + v_float32::nlanes);
+    AutoBuffer<float> bufY(len + v_float32::nlanes);
+    AutoBuffer<float> bufO(len + v_float32::nlanes);
+    AutoBuffer<float> bufW(len + v_float32::nlanes);
+    AutoBuffer<float> bufT(n+4 + v_float32::nlanes);
+    float *X = alignPtr(bufX.data(), CV_SIMD_WIDTH);
+    float *Y = alignPtr(bufY.data(), CV_SIMD_WIDTH);
+    float *Mag = X;
+    float *Ori = alignPtr(bufO.data(), CV_SIMD_WIDTH);
+    float *W = alignPtr(bufW.data(), CV_SIMD_WIDTH);
+    float *temphist = alignPtr(bufT.data(), CV_SIMD_WIDTH)+2;
+#else
     AutoBuffer<float> buf(len*4 + n+4);
     float *X = buf.data(), *Y = X + len, *Mag = X, *Ori = Y + len, *W = Ori + len;
     float* temphist = W + len + 2;
+#endif
 
     for( i = 0; i < n; i++ )
         temphist[i] = 0.f;
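
The new CV_SIMD branch above over-allocates each scratch array by one vector's worth of floats and then rounds the pointer up to a CV_SIMD_WIDTH boundary, which is what makes the vx_load_aligned/v_store_aligned calls in the loops below safe. A minimal sketch of that pattern, assuming only OpenCV's core headers (the helper name makeAlignedScratch is illustrative, not part of the patch):

    #include <opencv2/core.hpp>
    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    // Returns a pointer into buf that is aligned to the SIMD register width.
    float* makeAlignedScratch(AutoBuffer<float>& buf, int len)
    {
    #if CV_SIMD
        // Reserve v_float32::nlanes extra floats so the pointer can be rounded
        // up to a CV_SIMD_WIDTH boundary without running past the allocation.
        buf.allocate(len + v_float32::nlanes);
        return alignPtr(buf.data(), CV_SIMD_WIDTH);
    #else
        buf.allocate(len);
        return buf.data();
    #endif
    }

The extra +2 applied to temphist keeps two padding bins on each side of the histogram, matching the original scalar layout used by the circular smoothing pass further down.
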
@@ -201,32 +215,29 @@ float calcOrientationHist(
     cv::hal::magnitude32f(X, Y, Mag, len);
 
     k = 0;
-#if CV_AVX2
+#if CV_SIMD
+    const int vecsize = v_float32::nlanes;
+    v_float32 nd360 = vx_setall_f32(n/360.f);
+    v_int32 __n = vx_setall_s32(n);
+    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[vecsize];
+    float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[vecsize];
+
+    for( ; k <= len - vecsize; k += vecsize )
     {
-        __m256 __nd360 = _mm256_set1_ps(n/360.f);
-        __m256i __n = _mm256_set1_epi32(n);
-        int CV_DECL_ALIGNED(32) bin_buf[8];
-        float CV_DECL_ALIGNED(32) w_mul_mag_buf[8];
-        for ( ; k <= len - 8; k+=8 )
+        v_float32 w = vx_load_aligned( W + k );
+        v_float32 mag = vx_load_aligned( Mag + k );
+        v_float32 ori = vx_load_aligned( Ori + k );
+        v_int32 bin = v_round( nd360 * ori );
+
+        bin = v_select(bin >= __n, bin - __n, bin);
+        bin = v_select(bin < vx_setzero_s32(), bin + __n, bin);
+
+        w = w * mag;
+        v_store_aligned(bin_buf, bin);
+        v_store_aligned(w_mul_mag_buf, w);
+        for(int vi = 0; vi < vecsize; vi++)
         {
-            __m256i __bin = _mm256_cvtps_epi32(_mm256_mul_ps(__nd360, _mm256_loadu_ps(&Ori[k])));
-
-            __bin = _mm256_sub_epi32(__bin, _mm256_andnot_si256(_mm256_cmpgt_epi32(__n, __bin), __n));
-            __bin = _mm256_add_epi32(__bin, _mm256_and_si256(__n, _mm256_cmpgt_epi32(_mm256_setzero_si256(), __bin)));
-
-            __m256 __w_mul_mag = _mm256_mul_ps(_mm256_loadu_ps(&W[k]), _mm256_loadu_ps(&Mag[k]));
-
-            _mm256_store_si256((__m256i *) bin_buf, __bin);
-            _mm256_store_ps(w_mul_mag_buf, __w_mul_mag);
-
-            temphist[bin_buf[0]] += w_mul_mag_buf[0];
-            temphist[bin_buf[1]] += w_mul_mag_buf[1];
-            temphist[bin_buf[2]] += w_mul_mag_buf[2];
-            temphist[bin_buf[3]] += w_mul_mag_buf[3];
-            temphist[bin_buf[4]] += w_mul_mag_buf[4];
-            temphist[bin_buf[5]] += w_mul_mag_buf[5];
-            temphist[bin_buf[6]] += w_mul_mag_buf[6];
-            temphist[bin_buf[7]] += w_mul_mag_buf[7];
+            temphist[bin_buf[vi]] += w_mul_mag_buf[vi];
         }
     }
 #endif
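
The loop above is the universal-intrinsics rewrite of the old AVX2 histogram accumulation: orientations are quantized to bins in vector registers, wrapped into [0, n) with v_select, and then spilled to small aligned arrays so the indexed adds into temphist can be done with a plain scalar loop (the portable API has no scatter). A self-contained sketch of the same pattern, assuming OpenCV's universal intrinsics; the function name is illustrative and the scalar tail mirrors the existing non-SIMD path of calcOrientationHist:

    #include <opencv2/core.hpp>
    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    void accumulateOrientationHist(const float* Ori, const float* W, const float* Mag,
                                   int len, int n, float* temphist)
    {
        int k = 0;
    #if CV_SIMD
        const int vecsize = v_float32::nlanes;
        v_float32 nd360 = vx_setall_f32(n/360.f);
        v_int32 __n = vx_setall_s32(n);
        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[vecsize];
        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) wm_buf[vecsize];
        for( ; k <= len - vecsize; k += vecsize )
        {
            // quantize each orientation to a bin and wrap it into [0, n)
            v_int32 bin = v_round(nd360 * vx_load(Ori + k));
            bin = v_select(bin >= __n, bin - __n, bin);
            bin = v_select(bin < vx_setzero_s32(), bin + __n, bin);
            // no scatter in the portable API: spill to aligned arrays,
            // then accumulate with a scalar loop
            v_store_aligned(bin_buf, bin);
            v_store_aligned(wm_buf, vx_load(W + k) * vx_load(Mag + k));
            for( int vi = 0; vi < vecsize; vi++ )
                temphist[bin_buf[vi]] += wm_buf[vi];
        }
    #endif
        for( ; k < len; k++ )  // scalar tail, also the non-SIMD path
        {
            int bin = cvRound((n/360.f)*Ori[k]);
            if( bin >= n ) bin -= n;
            if( bin < 0 )  bin += n;
            temphist[bin] += W[k]*Mag[k];
        }
    }
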
@@ -247,34 +258,20 @@ float calcOrientationHist(
     temphist[n+1] = temphist[1];
 
     i = 0;
-#if CV_AVX2
+#if CV_SIMD
+    v_float32 d_1_16 = vx_setall_f32(1.f/16.f);
+    v_float32 d_4_16 = vx_setall_f32(4.f/16.f);
+    v_float32 d_6_16 = vx_setall_f32(6.f/16.f);
+    for( ; i <= n - v_float32::nlanes; i += v_float32::nlanes )
     {
-        __m256 __d_1_16 = _mm256_set1_ps(1.f/16.f);
-        __m256 __d_4_16 = _mm256_set1_ps(4.f/16.f);
-        __m256 __d_6_16 = _mm256_set1_ps(6.f/16.f);
-        for( ; i <= n - 8; i+=8 )
-        {
-#if CV_FMA3
-            __m256 __hist = _mm256_fmadd_ps(
-                _mm256_add_ps(_mm256_loadu_ps(&temphist[i-2]), _mm256_loadu_ps(&temphist[i+2])),
-                __d_1_16,
-                _mm256_fmadd_ps(
-                    _mm256_add_ps(_mm256_loadu_ps(&temphist[i-1]), _mm256_loadu_ps(&temphist[i+1])),
-                    __d_4_16,
-                    _mm256_mul_ps(_mm256_loadu_ps(&temphist[i]), __d_6_16)));
-#else
-            __m256 __hist = _mm256_add_ps(
-                _mm256_mul_ps(
-                    _mm256_add_ps(_mm256_loadu_ps(&temphist[i-2]), _mm256_loadu_ps(&temphist[i+2])),
-                    __d_1_16),
-                _mm256_add_ps(
-                    _mm256_mul_ps(
-                        _mm256_add_ps(_mm256_loadu_ps(&temphist[i-1]), _mm256_loadu_ps(&temphist[i+1])),
-                        __d_4_16),
-                    _mm256_mul_ps(_mm256_loadu_ps(&temphist[i]), __d_6_16)));
-#endif
-            _mm256_storeu_ps(&hist[i], __hist);
-        }
+        v_float32 tn2 = vx_load_aligned(temphist + i-2);
+        v_float32 tn1 = vx_load(temphist + i-1);
+        v_float32 t0 = vx_load(temphist + i);
+        v_float32 t1 = vx_load(temphist + i+1);
+        v_float32 t2 = vx_load(temphist + i+2);
+        v_float32 _hist = v_fma(tn2 + t2, d_1_16,
+            v_fma(tn1 + t1, d_4_16, t0 * d_6_16));
+        v_store(hist + i, _hist);
     }
 #endif
     for( ; i < n; i++ )
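
The block above collapses the two AVX2/FMA variants into a single v_fma expression that applies the circular [1 4 6 4 1]/16 smoothing kernel to the raw orientation histogram. A standalone sketch of that computation, assuming temphist is padded by two entries on each side as in the patch (smoothHist is an illustrative name; the scalar tail matches the existing non-SIMD loop):

    #include <opencv2/core.hpp>
    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    void smoothHist(const float* temphist, float* hist, int n)
    {
        int i = 0;
    #if CV_SIMD
        v_float32 d_1_16 = vx_setall_f32(1.f/16.f);
        v_float32 d_4_16 = vx_setall_f32(4.f/16.f);
        v_float32 d_6_16 = vx_setall_f32(6.f/16.f);
        for( ; i <= n - v_float32::nlanes; i += v_float32::nlanes )
        {
            // hist[i] = (t[i-2]+t[i+2])*1/16 + (t[i-1]+t[i+1])*4/16 + t[i]*6/16
            v_float32 t = v_fma(vx_load(temphist + i-2) + vx_load(temphist + i+2), d_1_16,
                          v_fma(vx_load(temphist + i-1) + vx_load(temphist + i+1), d_4_16,
                                vx_load(temphist + i) * d_6_16));
            v_store(hist + i, t);
        }
    #endif
        for( ; i < n; i++ )  // scalar tail, also the non-SIMD path
            hist[i] = (temphist[i-2] + temphist[i+2])*(1.f/16.f) +
                      (temphist[i-1] + temphist[i+1])*(4.f/16.f) +
                      temphist[i]*(6.f/16.f);
    }
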
@@ -623,91 +620,65 @@ void calcSIFTDescriptor(
     cv::hal::exp32f(W, W, len);
 
     k = 0;
-#if CV_AVX2
+#if CV_SIMD
     {
-        int CV_DECL_ALIGNED(32) idx_buf[8];
-        float CV_DECL_ALIGNED(32) rco_buf[64];
-        const __m256 __ori = _mm256_set1_ps(ori);
-        const __m256 __bins_per_rad = _mm256_set1_ps(bins_per_rad);
-        const __m256i __n = _mm256_set1_epi32(n);
-        for( ; k <= len - 8; k+=8 )
+        const int vecsize = v_float32::nlanes;
+        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[vecsize];
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*vecsize];
+        const v_float32 __ori = vx_setall_f32(ori);
+        const v_float32 __bins_per_rad = vx_setall_f32(bins_per_rad);
+        const v_int32 __n = vx_setall_s32(n);
+        const v_int32 __1 = vx_setall_s32(1);
+        const v_int32 __d_plus_2 = vx_setall_s32(d+2);
+        const v_int32 __n_plus_2 = vx_setall_s32(n+2);
+        for( ; k <= len - vecsize; k += vecsize )
         {
-            __m256 __rbin = _mm256_loadu_ps(&RBin[k]);
-            __m256 __cbin = _mm256_loadu_ps(&CBin[k]);
-            __m256 __obin = _mm256_mul_ps(_mm256_sub_ps(_mm256_loadu_ps(&Ori[k]), __ori), __bins_per_rad);
-            __m256 __mag = _mm256_mul_ps(_mm256_loadu_ps(&Mag[k]), _mm256_loadu_ps(&W[k]));
-
-            __m256 __r0 = _mm256_floor_ps(__rbin);
-            __rbin = _mm256_sub_ps(__rbin, __r0);
-            __m256 __c0 = _mm256_floor_ps(__cbin);
-            __cbin = _mm256_sub_ps(__cbin, __c0);
-            __m256 __o0 = _mm256_floor_ps(__obin);
-            __obin = _mm256_sub_ps(__obin, __o0);
-
-            __m256i __o0i = _mm256_cvtps_epi32(__o0);
-            __o0i = _mm256_add_epi32(__o0i, _mm256_and_si256(__n, _mm256_cmpgt_epi32(_mm256_setzero_si256(), __o0i)));
-            __o0i = _mm256_sub_epi32(__o0i, _mm256_andnot_si256(_mm256_cmpgt_epi32(__n, __o0i), __n));
-
-            __m256 __v_r1 = _mm256_mul_ps(__mag, __rbin);
-            __m256 __v_r0 = _mm256_sub_ps(__mag, __v_r1);
-
-            __m256 __v_rc11 = _mm256_mul_ps(__v_r1, __cbin);
-            __m256 __v_rc10 = _mm256_sub_ps(__v_r1, __v_rc11);
-
-            __m256 __v_rc01 = _mm256_mul_ps(__v_r0, __cbin);
-            __m256 __v_rc00 = _mm256_sub_ps(__v_r0, __v_rc01);
-
-            __m256 __v_rco111 = _mm256_mul_ps(__v_rc11, __obin);
-            __m256 __v_rco110 = _mm256_sub_ps(__v_rc11, __v_rco111);
-
-            __m256 __v_rco101 = _mm256_mul_ps(__v_rc10, __obin);
-            __m256 __v_rco100 = _mm256_sub_ps(__v_rc10, __v_rco101);
-
-            __m256 __v_rco011 = _mm256_mul_ps(__v_rc01, __obin);
-            __m256 __v_rco010 = _mm256_sub_ps(__v_rc01, __v_rco011);
-
-            __m256 __v_rco001 = _mm256_mul_ps(__v_rc00, __obin);
-            __m256 __v_rco000 = _mm256_sub_ps(__v_rc00, __v_rco001);
-
-            __m256i __one = _mm256_set1_epi32(1);
-            __m256i __idx = _mm256_add_epi32(
-                _mm256_mullo_epi32(
-                    _mm256_add_epi32(
-                        _mm256_mullo_epi32(_mm256_add_epi32(_mm256_cvtps_epi32(__r0), __one), _mm256_set1_epi32(d + 2)),
-                        _mm256_add_epi32(_mm256_cvtps_epi32(__c0), __one)),
-                    _mm256_set1_epi32(n + 2)),
-                __o0i);
-
-            _mm256_store_si256((__m256i *)idx_buf, __idx);
-
-            _mm256_store_ps(&(rco_buf[0]), __v_rco000);
-            _mm256_store_ps(&(rco_buf[8]), __v_rco001);
-            _mm256_store_ps(&(rco_buf[16]), __v_rco010);
-            _mm256_store_ps(&(rco_buf[24]), __v_rco011);
-            _mm256_store_ps(&(rco_buf[32]), __v_rco100);
-            _mm256_store_ps(&(rco_buf[40]), __v_rco101);
-            _mm256_store_ps(&(rco_buf[48]), __v_rco110);
-            _mm256_store_ps(&(rco_buf[56]), __v_rco111);
-#define HIST_SUM_HELPER(id) \
-            hist[idx_buf[(id)]] += rco_buf[(id)]; \
-            hist[idx_buf[(id)]+1] += rco_buf[8 + (id)]; \
-            hist[idx_buf[(id)]+(n+2)] += rco_buf[16 + (id)]; \
-            hist[idx_buf[(id)]+(n+3)] += rco_buf[24 + (id)]; \
-            hist[idx_buf[(id)]+(d+2)*(n+2)] += rco_buf[32 + (id)]; \
-            hist[idx_buf[(id)]+(d+2)*(n+2)+1] += rco_buf[40 + (id)]; \
-            hist[idx_buf[(id)]+(d+3)*(n+2)] += rco_buf[48 + (id)]; \
-            hist[idx_buf[(id)]+(d+3)*(n+2)+1] += rco_buf[56 + (id)];
-
-            HIST_SUM_HELPER(0);
-            HIST_SUM_HELPER(1);
-            HIST_SUM_HELPER(2);
-            HIST_SUM_HELPER(3);
-            HIST_SUM_HELPER(4);
-            HIST_SUM_HELPER(5);
-            HIST_SUM_HELPER(6);
-            HIST_SUM_HELPER(7);
-
-#undef HIST_SUM_HELPER
+            v_float32 rbin = vx_load(RBin + k);
+            v_float32 cbin = vx_load(CBin + k);
+            v_float32 obin = (vx_load(Ori + k) - __ori) * __bins_per_rad;
+            v_float32 mag = vx_load(Mag + k) * vx_load(W + k);
+
+            v_int32 r0 = v_floor(rbin);
+            v_int32 c0 = v_floor(cbin);
+            v_int32 o0 = v_floor(obin);
+            rbin -= v_cvt_f32(r0);
+            cbin -= v_cvt_f32(c0);
+            obin -= v_cvt_f32(o0);
+
+            o0 = v_select(o0 < vx_setzero_s32(), o0 + __n, o0);
+            o0 = v_select(o0 >= __n, o0 - __n, o0);
+
+            v_float32 v_r1 = mag*rbin, v_r0 = mag - v_r1;
+            v_float32 v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
+            v_float32 v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
+            v_float32 v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
+            v_float32 v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
+            v_float32 v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
+            v_float32 v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
+
+            v_int32 idx = v_fma(v_fma(r0+__1, __d_plus_2, c0+__1), __n_plus_2, o0);
+            v_store_aligned(idx_buf, idx);
+
+            v_store_aligned(rco_buf, v_rco000);
+            v_store_aligned(rco_buf+vecsize, v_rco001);
+            v_store_aligned(rco_buf+vecsize*2, v_rco010);
+            v_store_aligned(rco_buf+vecsize*3, v_rco011);
+            v_store_aligned(rco_buf+vecsize*4, v_rco100);
+            v_store_aligned(rco_buf+vecsize*5, v_rco101);
+            v_store_aligned(rco_buf+vecsize*6, v_rco110);
+            v_store_aligned(rco_buf+vecsize*7, v_rco111);
+
+            for(int id = 0; id < vecsize; id++)
+            {
+                hist[idx_buf[id]] += rco_buf[id];
+                hist[idx_buf[id]+1] += rco_buf[vecsize + id];
+                hist[idx_buf[id]+(n+2)] += rco_buf[2*vecsize + id];
+                hist[idx_buf[id]+(n+3)] += rco_buf[3*vecsize + id];
+                hist[idx_buf[id]+(d+2)*(n+2)] += rco_buf[4*vecsize + id];
+                hist[idx_buf[id]+(d+2)*(n+2)+1] += rco_buf[5*vecsize + id];
+                hist[idx_buf[id]+(d+3)*(n+2)] += rco_buf[6*vecsize + id];
+                hist[idx_buf[id]+(d+3)*(n+2)+1] += rco_buf[7*vecsize + id];
+            }
         }
     }
 #endif
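
The hunk above is the descriptor-building counterpart of the orientation loop: each sample's Gaussian-weighted magnitude is split trilinearly over the eight neighbouring (row, column, orientation) histogram cells, and the per-lane results are then scattered with a small scalar loop. As a scalar illustration of what the eight v_rco lanes hold for a single sample (the helper and its name are illustrative, not part of the patch):

    // mag              : Gaussian-weighted gradient magnitude of the sample
    // rbin, cbin, obin : fractional parts left after v_floor in the patch
    // w[0..7]          : the eight corner weights rco000 .. rco111
    void trilinearSplit(float mag, float rbin, float cbin, float obin, float w[8])
    {
        float v_r1   = mag*rbin,    v_r0   = mag  - v_r1;    // split along rows
        float v_rc11 = v_r1*cbin,   v_rc10 = v_r1 - v_rc11;  // then along columns
        float v_rc01 = v_r0*cbin,   v_rc00 = v_r0 - v_rc01;
        w[1] = v_rc00*obin;  w[0] = v_rc00 - w[1];           // finally along orientation
        w[3] = v_rc01*obin;  w[2] = v_rc01 - w[3];
        w[5] = v_rc10*obin;  w[4] = v_rc10 - w[5];
        w[7] = v_rc11*obin;  w[6] = v_rc11 - w[7];
        // the eight weights sum back to mag
    }

The nested v_fma that builds idx evaluates ((r0+1)*(d+2) + (c0+1))*(n+2) + o0, i.e. the flattened offset of the sample's base cell in the padded (d+2) x (d+2) x (n+2) histogram.
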
@@ -766,23 +737,16 @@ void calcSIFTDescriptor(
     float nrm2 = 0;
     len = d*d*n;
     k = 0;
-#if CV_AVX2
+#if CV_SIMD
     {
-        float CV_DECL_ALIGNED(32) nrm2_buf[8];
-        __m256 __nrm2 = _mm256_setzero_ps();
-        __m256 __dst;
-        for( ; k <= len - 8; k += 8 )
+        v_float32 __nrm2 = vx_setzero_f32();
+        v_float32 __dst;
+        for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
         {
-            __dst = _mm256_loadu_ps(&dst[k]);
-#if CV_FMA3
-            __nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2);
-#else
-            __nrm2 = _mm256_add_ps(__nrm2, _mm256_mul_ps(__dst, __dst));
-#endif
+            __dst = vx_load(dst + k);
+            __nrm2 = v_fma(__dst, __dst, __nrm2);
         }
-        _mm256_store_ps(nrm2_buf, __nrm2);
-        nrm2 = nrm2_buf[0] + nrm2_buf[1] + nrm2_buf[2] + nrm2_buf[3] +
-               nrm2_buf[4] + nrm2_buf[5] + nrm2_buf[6] + nrm2_buf[7];
+        nrm2 = (float)v_reduce_sum(__nrm2);
     }
 #endif
     for( ; k < len; k++ )
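
Above, a per-lane FMA accumulation followed by v_reduce_sum replaces the manual eight-element horizontal add of the AVX2 version. A minimal sketch of the pattern, assuming OpenCV's universal intrinsics (sumOfSquares is an illustrative name):

    #include <opencv2/core.hpp>
    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    float sumOfSquares(const float* dst, int len)
    {
        float nrm2 = 0.f;
        int k = 0;
    #if CV_SIMD
        v_float32 acc = vx_setzero_f32();
        for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
        {
            v_float32 v = vx_load(dst + k);
            acc = v_fma(v, v, acc);       // acc += v*v, lane-wise
        }
        nrm2 = v_reduce_sum(acc);         // horizontal sum across lanes
    #endif
        for( ; k < len; k++ )             // scalar tail
            nrm2 += dst[k]*dst[k];
        return nrm2;
    }

Accumulating lane-wise and reducing at the end changes the floating-point summation order relative to a strictly sequential scalar sum; the comment in the disabled block just below calls out exactly this kind of ordering difference.
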
@@ -795,7 +759,7 @@ void calcSIFTDescriptor(
     // This code cannot be enabled because it sums nrm2 in a different order,
     // thus producing slightly different results
     {
-        float CV_DECL_ALIGNED(32) nrm2_buf[8];
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) nrm2_buf[8];
         __m256 __dst;
         __m256 __nrm2 = _mm256_setzero_ps();
         __m256 __thr = _mm256_set1_ps(thr);
@@ -825,17 +789,17 @@ void calcSIFTDescriptor(
 
 #if 1
     k = 0;
-#if CV_AVX2
+#if CV_SIMD
     {
-        __m256 __dst;
-        __m256 __min = _mm256_setzero_ps();
-        __m256 __max = _mm256_set1_ps(255.0f); // max of uchar
-        __m256 __nrm2 = _mm256_set1_ps(nrm2);
-        for( k = 0; k <= len - 8; k+=8 )
+        v_float32 __dst;
+        v_float32 __min = vx_setzero_f32();
+        v_float32 __max = vx_setall_f32(255.0f); // max of uchar
+        v_float32 __nrm2 = vx_setall_f32(nrm2);
+        for( k = 0; k <= len - v_float32::nlanes; k += v_float32::nlanes )
         {
-            __dst = _mm256_loadu_ps(&dst[k]);
-            __dst = _mm256_min_ps(_mm256_max_ps(_mm256_round_ps(_mm256_mul_ps(__dst, __nrm2), _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC), __min), __max);
-            _mm256_storeu_ps(&dst[k], __dst);
+            __dst = vx_load(dst + k);
+            __dst = v_min(v_max(v_cvt_f32(v_round(__dst * __nrm2)), __min), __max);
+            v_store(dst + k, __dst);
        }
     }
 #endif
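
This last hunk converts the final scale-round-clamp step: each descriptor value is multiplied by the normalization factor, rounded to the nearest integer, and clamped to the uchar range before the unchanged conversion to 8-bit output. A standalone sketch of that step, assuming OpenCV's universal intrinsics (scaleRoundClamp is an illustrative name):

    #include <algorithm>
    #include <opencv2/core.hpp>
    #include <opencv2/core/hal/intrin.hpp>

    using namespace cv;

    void scaleRoundClamp(float* dst, int len, float scale)
    {
        int k = 0;
    #if CV_SIMD
        v_float32 __min = vx_setzero_f32();
        v_float32 __max = vx_setall_f32(255.f);   // max of uchar
        v_float32 __scale = vx_setall_f32(scale);
        for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
        {
            // round to nearest int, convert back to float, clamp to [0, 255]
            v_float32 v = vx_load(dst + k) * __scale;
            v = v_min(v_max(v_cvt_f32(v_round(v)), __min), __max);
            v_store(dst + k, v);
        }
    #endif
        for( ; k < len; k++ )                     // scalar tail
            dst[k] = std::min(std::max((float)cvRound(dst[k]*scale), 0.f), 255.f);
    }
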
