@@ -167,9 +167,23 @@ float calcOrientationHist(
     int i, j, k, len = (radius*2+1)*(radius*2+1);

     float expf_scale = -1.f/(2.f * sigma * sigma);
+#if CV_SIMD
+    AutoBuffer<float> bufX(len + v_float32::nlanes);
+    AutoBuffer<float> bufY(len + v_float32::nlanes);
+    AutoBuffer<float> bufO(len + v_float32::nlanes);
+    AutoBuffer<float> bufW(len + v_float32::nlanes);
+    AutoBuffer<float> bufT(n+4 + v_float32::nlanes);
+    float *X = alignPtr(bufX.data(), CV_SIMD_WIDTH);
+    float *Y = alignPtr(bufY.data(), CV_SIMD_WIDTH);
+    float *Mag = X;
+    float *Ori = alignPtr(bufO.data(), CV_SIMD_WIDTH);
+    float *W = alignPtr(bufW.data(), CV_SIMD_WIDTH);
+    float *temphist = alignPtr(bufT.data(), CV_SIMD_WIDTH)+2;
+#else
     AutoBuffer<float> buf(len*4 + n+4);
     float *X = buf.data(), *Y = X + len, *Mag = X, *Ori = Y + len, *W = Ori + len;
     float* temphist = W + len + 2;
+#endif

     for( i = 0; i < n; i++ )
         temphist[i] = 0.f;
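The `CV_SIMD` branch allocates each scratch buffer with `v_float32::nlanes` extra floats and then aligns the returned pointer to `CV_SIMD_WIDTH`, so the `vx_load_aligned`/`v_store_aligned` calls used later stay inside the allocation. A hedged sketch of what the alignment step amounts to (the real helper is `cv::alignPtr`; the standalone name below is mine and only illustrative):

```cpp
#include <cstdint>

// Hedged sketch: round a pointer up to the next 'align'-byte boundary
// (align must be a power of two). Aligning can skip up to align-1 bytes,
// which is why each buffer above reserves nlanes extra floats.
static inline float* align_ptr_sketch(float* p, std::size_t align)
{
    return reinterpret_cast<float*>(
        (reinterpret_cast<std::uintptr_t>(p) + align - 1) & ~(align - 1));
}
```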
@@ -201,32 +215,29 @@ float calcOrientationHist(
     cv::hal::magnitude32f(X, Y, Mag, len);

     k = 0;
-#if CV_AVX2
+#if CV_SIMD
+    const int vecsize = v_float32::nlanes;
+    v_float32 nd360 = vx_setall_f32(n/360.f);
+    v_int32 __n = vx_setall_s32(n);
+    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[vecsize];
+    float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[vecsize];
+
+    for( ; k <= len - vecsize; k += vecsize )
     {
-        __m256 __nd360 = _mm256_set1_ps(n/360.f);
-        __m256i __n = _mm256_set1_epi32(n);
-        int CV_DECL_ALIGNED(32) bin_buf[8];
-        float CV_DECL_ALIGNED(32) w_mul_mag_buf[8];
-        for( ; k <= len - 8; k+=8 )
+        v_float32 w = vx_load_aligned( W + k );
+        v_float32 mag = vx_load_aligned( Mag + k );
+        v_float32 ori = vx_load_aligned( Ori + k );
+        v_int32 bin = v_round( nd360 * ori );
+
+        bin = v_select(bin >= __n, bin - __n, bin);
+        bin = v_select(bin < vx_setzero_s32(), bin + __n, bin);
+
+        w = w * mag;
+        v_store_aligned(bin_buf, bin);
+        v_store_aligned(w_mul_mag_buf, w);
+        for(int vi = 0; vi < vecsize; vi++)
         {
-            __m256i __bin = _mm256_cvtps_epi32(_mm256_mul_ps(__nd360, _mm256_loadu_ps(&Ori[k])));
-
-            __bin = _mm256_sub_epi32(__bin, _mm256_andnot_si256(_mm256_cmpgt_epi32(__n, __bin), __n));
-            __bin = _mm256_add_epi32(__bin, _mm256_and_si256(__n, _mm256_cmpgt_epi32(_mm256_setzero_si256(), __bin)));
-
-            __m256 __w_mul_mag = _mm256_mul_ps(_mm256_loadu_ps(&W[k]), _mm256_loadu_ps(&Mag[k]));
-
-            _mm256_store_si256((__m256i *) bin_buf, __bin);
-            _mm256_store_ps(w_mul_mag_buf, __w_mul_mag);
-
-            temphist[bin_buf[0]] += w_mul_mag_buf[0];
-            temphist[bin_buf[1]] += w_mul_mag_buf[1];
-            temphist[bin_buf[2]] += w_mul_mag_buf[2];
-            temphist[bin_buf[3]] += w_mul_mag_buf[3];
-            temphist[bin_buf[4]] += w_mul_mag_buf[4];
-            temphist[bin_buf[5]] += w_mul_mag_buf[5];
-            temphist[bin_buf[6]] += w_mul_mag_buf[6];
-            temphist[bin_buf[7]] += w_mul_mag_buf[7];
+            temphist[bin_buf[vi]] += w_mul_mag_buf[vi];
         }
     }
 #endif
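Per lane, the new loop does the same work as the scalar code that handles the remaining samples: quantize the gradient direction into one of the `n` histogram bins, wrap the index into `[0, n)`, and add the Gaussian-weighted gradient magnitude. A hedged scalar sketch of one sample (the helper name is mine, not part of the PR; `cvRound` is OpenCV's rounding helper):

```cpp
// Illustrative scalar form of what each vector lane computes above.
static inline void accumulate_orientation_bin(float* temphist, int n,
                                              float ori_deg, float w, float mag)
{
    int bin = cvRound((n/360.f) * ori_deg);  // quantize the angle to one of n bins
    if (bin >= n) bin -= n;                  // wrap into [0, n)
    if (bin < 0)  bin += n;
    temphist[bin] += w * mag;                // Gaussian weight times gradient magnitude
}
```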
@@ -247,34 +258,20 @@ float calcOrientationHist(
     temphist[n+1] = temphist[1];

     i = 0;
-#if CV_AVX2
+#if CV_SIMD
+    v_float32 d_1_16 = vx_setall_f32(1.f/16.f);
+    v_float32 d_4_16 = vx_setall_f32(4.f/16.f);
+    v_float32 d_6_16 = vx_setall_f32(6.f/16.f);
+    for( ; i <= n - v_float32::nlanes; i += v_float32::nlanes )
     {
-        __m256 __d_1_16 = _mm256_set1_ps(1.f/16.f);
-        __m256 __d_4_16 = _mm256_set1_ps(4.f/16.f);
-        __m256 __d_6_16 = _mm256_set1_ps(6.f/16.f);
-        for( ; i <= n - 8; i+=8 )
-        {
-#if CV_FMA3
-            __m256 __hist = _mm256_fmadd_ps(
-                _mm256_add_ps(_mm256_loadu_ps(&temphist[i-2]), _mm256_loadu_ps(&temphist[i+2])),
-                __d_1_16,
-                _mm256_fmadd_ps(
-                    _mm256_add_ps(_mm256_loadu_ps(&temphist[i-1]), _mm256_loadu_ps(&temphist[i+1])),
-                    __d_4_16,
-                    _mm256_mul_ps(_mm256_loadu_ps(&temphist[i]), __d_6_16)));
-#else
-            __m256 __hist = _mm256_add_ps(
-                _mm256_mul_ps(
-                    _mm256_add_ps(_mm256_loadu_ps(&temphist[i-2]), _mm256_loadu_ps(&temphist[i+2])),
-                    __d_1_16),
-                _mm256_add_ps(
-                    _mm256_mul_ps(
-                        _mm256_add_ps(_mm256_loadu_ps(&temphist[i-1]), _mm256_loadu_ps(&temphist[i+1])),
-                        __d_4_16),
-                    _mm256_mul_ps(_mm256_loadu_ps(&temphist[i]), __d_6_16)));
-#endif
-            _mm256_storeu_ps(&hist[i], __hist);
-        }
+        v_float32 tn2 = vx_load_aligned(temphist + i-2);
+        v_float32 tn1 = vx_load(temphist + i-1);
+        v_float32 t0 = vx_load(temphist + i);
+        v_float32 t1 = vx_load(temphist + i+1);
+        v_float32 t2 = vx_load(temphist + i+2);
+        v_float32 _hist = v_fma(tn2 + t2, d_1_16,
+                          v_fma(tn1 + t1, d_4_16, t0 * d_6_16));
+        v_store(hist + i, _hist);
     }
 #endif
     for( ; i < n; i++ )
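This loop vectorizes the circular smoothing of the raw orientation histogram with the 5-tap kernel [1 4 6 4 1]/16; the padded entries on both sides of `temphist` (one of those assignments is visible as context above) make the wrap-around free, and the scalar `for( ; i < n; i++ )` tail finishes whatever bins are left. A hedged scalar sketch of the per-bin update (the helper name is mine):

```cpp
// Illustrative scalar form of the [1 4 6 4 1]/16 smoothing for one bin.
// 'temphist' must be padded so that indices i-2 .. i+2 are valid.
static inline float smooth_bin(const float* temphist, int i)
{
    return (temphist[i-2] + temphist[i+2]) * (1.f/16.f)
         + (temphist[i-1] + temphist[i+1]) * (4.f/16.f)
         +  temphist[i] * (6.f/16.f);
}
```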
@@ -623,91 +620,65 @@ void calcSIFTDescriptor(
     cv::hal::exp32f(W, W, len);

     k = 0;
-#if CV_AVX2
+#if CV_SIMD
     {
-        int CV_DECL_ALIGNED(32) idx_buf[8];
-        float CV_DECL_ALIGNED(32) rco_buf[64];
-        const __m256 __ori = _mm256_set1_ps(ori);
-        const __m256 __bins_per_rad = _mm256_set1_ps(bins_per_rad);
-        const __m256i __n = _mm256_set1_epi32(n);
-        for( ; k <= len - 8; k+=8 )
+        const int vecsize = v_float32::nlanes;
+        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[vecsize];
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*vecsize];
+        const v_float32 __ori = vx_setall_f32(ori);
+        const v_float32 __bins_per_rad = vx_setall_f32(bins_per_rad);
+        const v_int32 __n = vx_setall_s32(n);
+        const v_int32 __1 = vx_setall_s32(1);
+        const v_int32 __d_plus_2 = vx_setall_s32(d+2);
+        const v_int32 __n_plus_2 = vx_setall_s32(n+2);
+        for( ; k <= len - vecsize; k += vecsize )
         {
-            __m256 __rbin = _mm256_loadu_ps(&RBin[k]);
-            __m256 __cbin = _mm256_loadu_ps(&CBin[k]);
-            __m256 __obin = _mm256_mul_ps(_mm256_sub_ps(_mm256_loadu_ps(&Ori[k]), __ori), __bins_per_rad);
-            __m256 __mag = _mm256_mul_ps(_mm256_loadu_ps(&Mag[k]), _mm256_loadu_ps(&W[k]));
-
-            __m256 __r0 = _mm256_floor_ps(__rbin);
-            __rbin = _mm256_sub_ps(__rbin, __r0);
-            __m256 __c0 = _mm256_floor_ps(__cbin);
-            __cbin = _mm256_sub_ps(__cbin, __c0);
-            __m256 __o0 = _mm256_floor_ps(__obin);
-            __obin = _mm256_sub_ps(__obin, __o0);
-
-            __m256i __o0i = _mm256_cvtps_epi32(__o0);
-            __o0i = _mm256_add_epi32(__o0i, _mm256_and_si256(__n, _mm256_cmpgt_epi32(_mm256_setzero_si256(), __o0i)));
-            __o0i = _mm256_sub_epi32(__o0i, _mm256_andnot_si256(_mm256_cmpgt_epi32(__n, __o0i), __n));
-
-            __m256 __v_r1 = _mm256_mul_ps(__mag, __rbin);
-            __m256 __v_r0 = _mm256_sub_ps(__mag, __v_r1);
-
-            __m256 __v_rc11 = _mm256_mul_ps(__v_r1, __cbin);
-            __m256 __v_rc10 = _mm256_sub_ps(__v_r1, __v_rc11);
-
-            __m256 __v_rc01 = _mm256_mul_ps(__v_r0, __cbin);
-            __m256 __v_rc00 = _mm256_sub_ps(__v_r0, __v_rc01);
-
-            __m256 __v_rco111 = _mm256_mul_ps(__v_rc11, __obin);
-            __m256 __v_rco110 = _mm256_sub_ps(__v_rc11, __v_rco111);
-
-            __m256 __v_rco101 = _mm256_mul_ps(__v_rc10, __obin);
-            __m256 __v_rco100 = _mm256_sub_ps(__v_rc10, __v_rco101);
-
-            __m256 __v_rco011 = _mm256_mul_ps(__v_rc01, __obin);
-            __m256 __v_rco010 = _mm256_sub_ps(__v_rc01, __v_rco011);
-
-            __m256 __v_rco001 = _mm256_mul_ps(__v_rc00, __obin);
-            __m256 __v_rco000 = _mm256_sub_ps(__v_rc00, __v_rco001);
-
-            __m256i __one = _mm256_set1_epi32(1);
-            __m256i __idx = _mm256_add_epi32(
-                _mm256_mullo_epi32(
-                    _mm256_add_epi32(
-                        _mm256_mullo_epi32(_mm256_add_epi32(_mm256_cvtps_epi32(__r0), __one), _mm256_set1_epi32(d + 2)),
-                        _mm256_add_epi32(_mm256_cvtps_epi32(__c0), __one)),
-                    _mm256_set1_epi32(n + 2)),
-                __o0i);
-
-            _mm256_store_si256((__m256i *)idx_buf, __idx);
-
-            _mm256_store_ps(&(rco_buf[0]), __v_rco000);
-            _mm256_store_ps(&(rco_buf[8]), __v_rco001);
-            _mm256_store_ps(&(rco_buf[16]), __v_rco010);
-            _mm256_store_ps(&(rco_buf[24]), __v_rco011);
-            _mm256_store_ps(&(rco_buf[32]), __v_rco100);
-            _mm256_store_ps(&(rco_buf[40]), __v_rco101);
-            _mm256_store_ps(&(rco_buf[48]), __v_rco110);
-            _mm256_store_ps(&(rco_buf[56]), __v_rco111);
-#define HIST_SUM_HELPER(id)                                      \
-            hist[idx_buf[(id)]] += rco_buf[(id)];                \
-            hist[idx_buf[(id)]+1] += rco_buf[8 + (id)];          \
-            hist[idx_buf[(id)]+(n+2)] += rco_buf[16 + (id)];     \
-            hist[idx_buf[(id)]+(n+3)] += rco_buf[24 + (id)];     \
-            hist[idx_buf[(id)]+(d+2)*(n+2)] += rco_buf[32 + (id)];   \
-            hist[idx_buf[(id)]+(d+2)*(n+2)+1] += rco_buf[40 + (id)]; \
-            hist[idx_buf[(id)]+(d+3)*(n+2)] += rco_buf[48 + (id)];   \
-            hist[idx_buf[(id)]+(d+3)*(n+2)+1] += rco_buf[56 + (id)];
-
-            HIST_SUM_HELPER(0);
-            HIST_SUM_HELPER(1);
-            HIST_SUM_HELPER(2);
-            HIST_SUM_HELPER(3);
-            HIST_SUM_HELPER(4);
-            HIST_SUM_HELPER(5);
-            HIST_SUM_HELPER(6);
-            HIST_SUM_HELPER(7);
-
-            #undef HIST_SUM_HELPER
+            v_float32 rbin = vx_load(RBin + k);
+            v_float32 cbin = vx_load(CBin + k);
+            v_float32 obin = (vx_load(Ori + k) - __ori) * __bins_per_rad;
+            v_float32 mag = vx_load(Mag + k) * vx_load(W + k);
+
+            v_int32 r0 = v_floor(rbin);
+            v_int32 c0 = v_floor(cbin);
+            v_int32 o0 = v_floor(obin);
+            rbin -= v_cvt_f32(r0);
+            cbin -= v_cvt_f32(c0);
+            obin -= v_cvt_f32(o0);
+
+            o0 = v_select(o0 < vx_setzero_s32(), o0 + __n, o0);
+            o0 = v_select(o0 >= __n, o0 - __n, o0);
+
+            v_float32 v_r1 = mag*rbin, v_r0 = mag - v_r1;
+            v_float32 v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
+            v_float32 v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
+            v_float32 v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
+            v_float32 v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
+            v_float32 v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
+            v_float32 v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
+
+            v_int32 idx = v_fma(v_fma(r0+__1, __d_plus_2, c0+__1), __n_plus_2, o0);
+            v_store_aligned(idx_buf, idx);
+
+            v_store_aligned(rco_buf, v_rco000);
+            v_store_aligned(rco_buf+vecsize, v_rco001);
+            v_store_aligned(rco_buf+vecsize*2, v_rco010);
+            v_store_aligned(rco_buf+vecsize*3, v_rco011);
+            v_store_aligned(rco_buf+vecsize*4, v_rco100);
+            v_store_aligned(rco_buf+vecsize*5, v_rco101);
+            v_store_aligned(rco_buf+vecsize*6, v_rco110);
+            v_store_aligned(rco_buf+vecsize*7, v_rco111);
+
+            for(int id = 0; id < vecsize; id++)
+            {
+                hist[idx_buf[id]] += rco_buf[id];
+                hist[idx_buf[id]+1] += rco_buf[vecsize + id];
+                hist[idx_buf[id]+(n+2)] += rco_buf[2*vecsize + id];
+                hist[idx_buf[id]+(n+3)] += rco_buf[3*vecsize + id];
+                hist[idx_buf[id]+(d+2)*(n+2)] += rco_buf[4*vecsize + id];
+                hist[idx_buf[id]+(d+2)*(n+2)+1] += rco_buf[5*vecsize + id];
+                hist[idx_buf[id]+(d+3)*(n+2)] += rco_buf[6*vecsize + id];
+                hist[idx_buf[id]+(d+3)*(n+2)+1] += rco_buf[7*vecsize + id];
+            }
         }
     }
 #endif
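Each lane of this loop spreads one sample's Gaussian-weighted gradient magnitude over the eight neighbouring (row, column, orientation) cells of the padded (d+2)x(d+2)x(n+2) histogram, which is the trilinear interpolation step of the SIFT descriptor. A hedged scalar sketch of what one lane computes (it mirrors the per-sample scalar code that handles the loop tail; `rbin`, `cbin`, `obin`, `mag`, `hist`, `d`, `n` are the surrounding function's variables, and `cvFloor` is OpenCV's floor helper):

```cpp
// Illustrative scalar equivalent of one vector lane: trilinear interpolation
// of the weighted magnitude 'mag' into the padded 3-D histogram.
int r0 = cvFloor(rbin), c0 = cvFloor(cbin), o0 = cvFloor(obin);
rbin -= r0; cbin -= c0; obin -= o0;
if (o0 < 0)  o0 += n;   // wrap the orientation bin into [0, n)
if (o0 >= n) o0 -= n;

// Split 'mag' along the row, column and orientation axes.
float v_r1 = mag*rbin,        v_r0 = mag - v_r1;
float v_rc11 = v_r1*cbin,     v_rc10 = v_r1 - v_rc11;
float v_rc01 = v_r0*cbin,     v_rc00 = v_r0 - v_rc01;
float v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
float v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
float v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
float v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;

int idx = ((r0+1)*(d+2) + c0+1)*(n+2) + o0;  // the +1 offsets skip the padding
hist[idx]               += v_rco000;
hist[idx+1]             += v_rco001;
hist[idx+(n+2)]         += v_rco010;
hist[idx+(n+3)]         += v_rco011;
hist[idx+(d+2)*(n+2)]   += v_rco100;
hist[idx+(d+2)*(n+2)+1] += v_rco101;
hist[idx+(d+3)*(n+2)]   += v_rco110;
hist[idx+(d+3)*(n+2)+1] += v_rco111;
```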
@@ -766,23 +737,16 @@ void calcSIFTDescriptor(
     float nrm2 = 0;
     len = d*d*n;
     k = 0;
-#if CV_AVX2
+#if CV_SIMD
     {
-        float CV_DECL_ALIGNED(32) nrm2_buf[8];
-        __m256 __nrm2 = _mm256_setzero_ps();
-        __m256 __dst;
-        for( ; k <= len - 8; k += 8 )
+        v_float32 __nrm2 = vx_setzero_f32();
+        v_float32 __dst;
+        for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
         {
-            __dst = _mm256_loadu_ps(&dst[k]);
-#if CV_FMA3
-            __nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2);
-#else
-            __nrm2 = _mm256_add_ps(__nrm2, _mm256_mul_ps(__dst, __dst));
-#endif
+            __dst = vx_load(dst + k);
+            __nrm2 = v_fma(__dst, __dst, __nrm2);
         }
-        _mm256_store_ps(nrm2_buf, __nrm2);
-        nrm2 = nrm2_buf[0] + nrm2_buf[1] + nrm2_buf[2] + nrm2_buf[3] +
-               nrm2_buf[4] + nrm2_buf[5] + nrm2_buf[6] + nrm2_buf[7];
+        nrm2 = (float)v_reduce_sum(__nrm2);
     }
 #endif
     for( ; k < len; k++ )
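`v_reduce_sum` takes over the horizontal add that the AVX2 version spelled out through `nrm2_buf`, and it adapts to whatever lane count the target provides; the scalar `for( ; k < len; k++ )` tail then accumulates the leftover elements. Semantically it is just a sum over lanes; a hedged sketch (the helper name is mine, and the real intrinsic reduces in registers and may associate the additions differently):

```cpp
// Illustrative semantics of v_reduce_sum for a register whose lanes have
// been spilled to an array.
static inline float reduce_sum_sketch(const float* lanes, int nlanes)
{
    float s = 0.f;
    for (int i = 0; i < nlanes; i++)
        s += lanes[i];
    return s;
}
```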
@@ -795,7 +759,7 @@ void calcSIFTDescriptor(
     // This code cannot be enabled because it sums nrm2 in a different order,
     // thus producing slightly different results
     {
-        float CV_DECL_ALIGNED(32) nrm2_buf[8];
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) nrm2_buf[8];
         __m256 __dst;
         __m256 __nrm2 = _mm256_setzero_ps();
         __m256 __thr = _mm256_set1_ps(thr);
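The comment kept above is about floating-point addition not being associative: accumulating the same values in a different (vectorized) order can change the low bits of `nrm2`, and therefore the final descriptor, which is why this alternative block stays disabled. A tiny standalone illustration of that effect:

```cpp
#include <cstdio>

// Summing the same three floats in two different orders gives two different
// answers, because float addition is not associative.
int main()
{
    float a = 1e8f, b = -1e8f, c = 1.f;
    std::printf("%g vs %g\n", (a + b) + c, a + (b + c));  // prints "1 vs 0"
    return 0;
}
```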
@@ -825,17 +789,17 @@ void calcSIFTDescriptor(

 #if 1
     k = 0;
-#if CV_AVX2
+#if CV_SIMD
     {
-        __m256 __dst;
-        __m256 __min = _mm256_setzero_ps();
-        __m256 __max = _mm256_set1_ps(255.0f);  // max of uchar
-        __m256 __nrm2 = _mm256_set1_ps(nrm2);
-        for( k = 0; k <= len - 8; k+=8 )
+        v_float32 __dst;
+        v_float32 __min = vx_setzero_f32();
+        v_float32 __max = vx_setall_f32(255.0f);  // max of uchar
+        v_float32 __nrm2 = vx_setall_f32(nrm2);
+        for( k = 0; k <= len - v_float32::nlanes; k += v_float32::nlanes )
         {
-            __dst = _mm256_loadu_ps(&dst[k]);
-            __dst = _mm256_min_ps(_mm256_max_ps(_mm256_round_ps(_mm256_mul_ps(__dst, __nrm2), _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC), __min), __max);
-            _mm256_storeu_ps(&dst[k], __dst);
+            __dst = vx_load(dst + k);
+            __dst = v_min(v_max(v_cvt_f32(v_round(__dst * __nrm2)), __min), __max);
+            v_store(dst + k, __dst);
         }
     }
 #endif
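This last block scales the descriptor by `nrm2`, rounds to nearest, and clamps each value to the `uchar` range [0, 255] while the data is still `float`; per element it matches what the scalar tail computes with `saturate_cast<uchar>(dst[k]*nrm2)`. A hedged one-line scalar sketch of the same update (assuming `<algorithm>` for `std::min`/`std::max`; `dst`, `k`, `nrm2` come from the surrounding code):

```cpp
// Illustrative scalar equivalent of one element of the SIMD block above:
// scale, round to nearest, clamp to [0, 255], keep the result as float.
dst[k] = (float)std::min(std::max(cvRound(dst[k] * nrm2), 0), 255);
```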