@@ -648,6 +648,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
bool useAVX;
bool useAVX2;
bool useAVX512;
+ int blk_size_cn;

ParallelConv()
: input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
@@ -704,12 +705,17 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D;
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;

- int ncn = std::min(inpCn, (int)BLK_SIZE_CN);
-
int kernel_d = !isConv2D? kernel_size[0] : 1;
int kernel_h = kernel_size[kernel_size.size() - 2];
int kernel_w = kernel_size.back();

+ int blk_size_cn0 = cvCeil(800./(kernel_w*kernel_h));
+ int ncn = 16;
+ while (ncn*2 < blk_size_cn0 && ncn < inpCn)
+ ncn *= 2;
+ ncn = std::min(ncn, inpCn);
+ p.blk_size_cn = ncn;
+
int dil_d = !isConv2D? dilations[0] : 1;
int dil_h = dilations[dilations.size() - 2];
int dil_w = dilations.back();
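Note on the hunk above: the channel blocking is no longer the fixed BLK_SIZE_CN. The patch instead targets roughly 800 input values per im2row row (blk_size_cn0 = ceil(800/(kernel_w*kernel_h))), starts ncn at 16, doubles it while twice its value still fits under that bound and under the channel count, then clamps it to inpCn. A minimal standalone sketch of the same arithmetic, assuming it lives in the same translation unit (the helper name is hypothetical, not part of the patch):

// Sketch of the channel-block heuristic above (illustrative only).
// For a 3x3 kernel: blk_size_cn0 = cvCeil(800./9) = 89, so ncn grows 16 -> 32 -> 64
// and is then clamped to the number of input channels.
static int chooseChannelBlock(int kernel_w, int kernel_h, int inpCn)
{
    int blk_size_cn0 = cvCeil(800./(kernel_w*kernel_h));
    int ncn = 16;
    while (ncn*2 < blk_size_cn0 && ncn < inpCn)
        ncn *= 2;
    return std::min(ncn, inpCn);
}

With this choice, karea*blk_size_cn stays at a few hundred floats for any kernel size, which is what bounds the im2row buffer allocated further down.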
@@ -777,18 +783,26 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
int dilation_w = dilations.back();

int i, j, k, d;
- size_t inpPlaneSize = input_->total(2);
- size_t outPlaneSize = output_->total(2);
+ int inpPlaneSize = (int)input_->total(2);
+ int outPlaneSize = (int)output_->total(2);
bool is1x1 = is1x1_;

int stripesPerSample;
- size_t stripeSize;
+ int stripeSize;
Range r = r0;
-
- if ( nstripes >= batchSize*2 )
+ bool depthWiseConvolution = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 &&
+ outCn == 1 && kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 &&
+ width >= 16 + dilation_w*(kernel_w - 1);
+ // for now only 3x3 depth-wise convolutions are supported
+ depthWiseConvolution = depthWiseConvolution && kernel_w == 3 && kernel_h == 3 &&
+ // computing at most 1 pixel from each side can involve padding
+ max(stride_w, dilation_w) >= pad_l && max(stride_h, dilation_h) >= pad_t &&
+ pad_l <= 1 && pad_t <= 1;
+
+ if ( !depthWiseConvolution && nstripes >= batchSize*2 )
{
stripesPerSample = nstripes/batchSize;
- stripeSize = alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
+ stripeSize = (int)alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
stripeSize = std::min(stripeSize, outPlaneSize);
}
else
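As a reading aid, the depth-wise fast-path test added above can be restated as a standalone predicate. This is a hedged sketch, not part of the patch: the function name is an assumption, std::max replaces the bare max used in the file, and it is assumed to sit in the same translation unit, which already has the needed headers.

// Hypothetical restatement of the depth-wise eligibility test above (sketch only).
static bool canUseDepthwiseFastPath(bool is1x1, bool isConv2D, int ngroups,
                                    int inpCn, int outCn,
                                    int kernel_w, int kernel_h, int kernel_d,
                                    int stride_w, int stride_h, int stride_d,
                                    int dilation_w, int dilation_h, int dilation_d,
                                    int pad_l, int pad_t, int pad_d, int width)
{
    return !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 && outCn == 1 &&
           // the 3D terms must be at their 2D defaults
           kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 &&
           // only 3x3 kernels, with rows wide enough for the vector inner loop
           kernel_w == 3 && kernel_h == 3 &&
           width >= 16 + dilation_w*(kernel_w - 1) &&
           // at most one pixel on the left/top may need padding
           std::max(stride_w, dilation_w) >= pad_l &&
           std::max(stride_h, dilation_h) >= pad_t &&
           pad_l <= 1 && pad_t <= 1;
}

When the test passes, the next hunk sets blk_size to the whole output plane and skips the im2row buffer, so each block covers a complete depth-wise output plane, which is what the CV_Assert in the depth-wise branch checks.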
@@ -807,20 +821,29 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
const float* biasptr_ = &biasvec_->at(0);
const float* reluptr_ = reluslope_->empty() ? 0 : &reluslope_->at(0);
float* data_out0_ = output_->ptr<float>();
- size_t rowbufsz = (size_t)karea*BLK_SIZE_CN*BLK_SIZE;
- AutoBuffer<float> rowbuf0_(rowbufsz + valign);
- float* rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float)));
-
- // we clear the buffer once; ultimately, it lets us to avoid
- // tail processing after running the unrolled/vectorized loop.
- // the main idea is to make sure that the tail (a.k.a. padding) of each row
- // (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
- // does not contain NaNs or Infs. Because the padding in the weights
- // matrix is explicitly initialized with 0's, we handle all other
- // cases nicely, i.e. we can skip expliciting re-initialization
- // of the padding - we just retain elements from the previous iteration
- // of the loop over channels (cn0).
- memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]));
+ AutoBuffer<float> rowbuf0_;
+ float* rowbuf0 = 0;
+ bool use_rowbuf = !depthWiseConvolution;
+ int blk_size = depthWiseConvolution ? outPlaneSize : min((int)BLK_SIZE, stripeSize);
+
+ // im2row buffer is not used for depth-wise convolution
+ if (use_rowbuf)
+ {
+ size_t rowbufsz = alignSize(karea*blk_size_cn, valign)*min((int)BLK_SIZE, blk_size);
+ // printf("karea=%d, blk_size_cn=%d, rowbufsz=%d, stripeSize=%d\n", karea, blk_size_cn, (int)rowbufsz, stripeSize);
+ rowbuf0_.allocate(rowbufsz + valign);
+ rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float)));
+ // we clear the buffer once; ultimately, it lets us to avoid
+ // tail processing after running the unrolled/vectorized loop.
+ // the main idea is to make sure that the tail (a.k.a. padding) of each row
+ // (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
+ // does not contain NaNs or Infs. Because the padding in the weights
+ // matrix is explicitly initialized with 0's, we handle all other
+ // cases nicely, i.e. we can skip expliciting re-initialization
+ // of the padding - we just retain elements from the previous iteration
+ // of the loop over channels (cn0).
+ memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]));
+ }

for ( int stripe = r.start; stripe < r.end; stripe++ )
{
@@ -835,28 +858,213 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
const float* wptr_orig = wptr_orig_ + wstep*startOutCn;
const float* biasptr = biasptr_ + startOutCn;

- for ( int cn0 = 0; cn0 < inpCn; cn0 += BLK_SIZE_CN )
+ for ( int cn0 = 0; cn0 < inpCn; cn0 += blk_size_cn )
{
- int cn1 = std::min(cn0 + BLK_SIZE_CN, inpCn);
+ int cn1 = std::min(cn0 + blk_size_cn, inpCn);
int ncn = cn1 - cn0, vsz = karea*ncn;
int vsz_a = (int)alignSize(vsz, valign);
const float* wptr = wptr_orig + cn0*karea;
// we apply [Channels][P]ReLU (if any) during the final pass only.
const float* relu = cn1 == inpCn && reluptr_ ? reluptr_ + startOutCn : 0;

- for ( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += BLK_SIZE )
+ for ( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += blk_size )
{
- int ofs, ofs1 = std::min(ofs0 + BLK_SIZE, stripeEnd);
+ int ofs, ofs1 = std::min(ofs0 + blk_size, stripeEnd);
+ int bsz = ofs1 - ofs0;

int out_d = ofs0 / (outH * outW);
int out_i = (ofs0 - out_d * outH * outW) / outW;
int out_j = ofs0 % outW;

+ if (depthWiseConvolution)
+ {
+ CV_Assert(out_i == 0 && out_j == 0);
+ int in_d = out_d * stride_d - pad_d;
+ const float* inptr_ = data_inp0 + (cn0*depth*height + in_d*height)*width;
+ float* outptr_ = data_out0 + ofs0;
+
+ #if CV_TRY_AVX2
+ if (useAVX2)
+ opt_AVX2::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+ stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+ biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
+ else
+ #endif
+ #if CV_TRY_AVX
+ if (useAVX)
+ opt_AVX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+ stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+ biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
+ else
+ #endif
+ {
+ const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
+ w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
+ w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
+ int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
+ float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
+
+ for (int out_i = 0; out_i < outH; out_i++)
+ {
+ int in_i = out_i * stride_h - pad_t, out_j = 0;
+ const float* imgptr0 = inptr_ + in_i*width;
+ const float* imgptr1 = imgptr0 + dilation_h*width;
+ const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
+ float out, w00 = w00_, w01 = w01_, w02 = w02_;
+ float w20 = w20_, w21 = w21_, w22 = w22_;
+ if (in_i < 0)
+ {
+ w00 = w01 = w02 = 0.f;
+ imgptr0 = imgptr1;
+ }
+ else if (in_i + dilation_h*(kernel_h-1) >= height)
+ {
+ w20 = w21 = w22 = 0.f;
+ imgptr2 = imgptr1;
+ }
+ float* outptr = outptr_ + out_i*outW;
+ if (pad_l > 0)
+ {
+ out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
+ imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
+ imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
+ if (relu)
+ out = out > 0.f ? out : out*relu_coeff;
+ outptr[0] = out;
+ out_j = 1;
+ }
+
+ #if CV_SIMD
+ // maybe with AVX or AVX512 strided depthwise convolution
+ // can be accelerated with vector code, but with 4xfloat vectors
+ // it's hardly the case
+ if ( stride_w == 1 )
+ {
+ const int VECSZ = v_float32::nlanes;
+ const int out_delta = VECSZ/stride_w;
+ v_float32 vw00 = vx_setall_f32(w00), vw01 = vx_setall_f32(w01), vw02 = vx_setall_f32(w02),
+ vw10 = vx_setall_f32(w10), vw11 = vx_setall_f32(w11), vw12 = vx_setall_f32(w12),
+ vw20 = vx_setall_f32(w20), vw21 = vx_setall_f32(w21), vw22 = vx_setall_f32(w22);
+ v_float32 z = vx_setzero_f32(), vbias = vx_setall_f32(bias), vrc = vx_setall_f32(relu_coeff);
+ for ( ; out_j < outW1; out_j += out_delta )
+ {
+ if (out_j + out_delta > outW1)
+ {
+ if (out_j <= pad_l)
+ break;
+ out_j = outW1 - out_delta;
+ }
+ int in_j = out_j * stride_w - pad_l;
+ v_float32 v00 = vx_load(imgptr0 + in_j),
+ v01 = vx_load(imgptr0 + in_j + dilation_w),
+ v02 = vx_load(imgptr0 + in_j + dilation_w*2),
+ v10 = vx_load(imgptr1 + in_j),
+ v11 = vx_load(imgptr1 + in_j + dilation_w),
+ v12 = vx_load(imgptr1 + in_j + dilation_w*2),
+ v20 = vx_load(imgptr2 + in_j),
+ v21 = vx_load(imgptr2 + in_j + dilation_w),
+ v22 = vx_load(imgptr2 + in_j + dilation_w*2);
+
+ v_float32 vout = v00*vw00 + v01*vw01 + v02*vw02 +
+ v10*vw10 + v11*vw11 + v12*vw12 +
+ v20*vw20 + v21*vw21 + v22*vw22 + vbias;
+ if (relu)
+ vout = v_select(vout > z, vout, vout*vrc);
+ vx_store(outptr + out_j, vout);
+ }
+ }
+ #endif
+ for (; out_j < outW1; out_j++)
+ {
+ int in_j = out_j * stride_w - pad_l;
+ out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
+ imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
+ imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
+ if (relu)
+ out = out > 0.f ? out : out*relu_coeff;
+ outptr[out_j] = out;
+ }
+
+ for (; out_j < outW; out_j++ )
+ {
+ int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
+ float s0 = 1.f, s1 = 1.f, s2 = 1.f;
+ if (in_j0 >= width)
+ {
+ in_j0 = 0;
+ s0 = 0.f;
+ }
+ if (in_j1 >= width)
+ {
+ in_j1 = 0;
+ s1 = 0.f;
+ }
+ if (in_j2 >= width)
+ {
+ in_j2 = 0;
+ s2 = 0.f;
+ }
+ out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
+ imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
+ imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
+ if (relu)
+ out = out > 0.f ? out : out*relu_coeff;
+ outptr[out_j] = out;
+ }
+ }
+ }
+ continue;
+ }
+
// do im2row for a part of input tensor
float* rowbuf = rowbuf0;

if (isConv2D)
{
+ if ( is1x1 && stride_w == 1 && stride_h == 1 )
+ {
+ const float* imgptr = data_inp0 + (cn0*height + out_i)*width + out_j;
+ for ( int j = 0; j < bsz; j++, rowbuf += vsz_a )
+ {
+ if ( j + 4 <= bsz )
+ {
+ k = 0;
+ #if CV_SIMD128
+ for ( ; k <= vsz - 4; k += 4 )
+ {
+ const float* inp = imgptr + j + k*inpPlaneSize;
+ v_float32x4 p0 = v_load(inp), p1 = v_load(inp + inpPlaneSize);
+ v_float32x4 p2 = v_load(inp + inpPlaneSize*2), p3 = v_load(inp + inpPlaneSize*3);
+ v_float32x4 r0, r1, r2, r3;
+ v_transpose4x4(p0, p1, p2, p3, r0, r1, r2, r3);
+ v_store(rowbuf + k, r0);
+ v_store(rowbuf + k + vsz_a, r1);
+ v_store(rowbuf + k + vsz_a*2, r2);
+ v_store(rowbuf + k + vsz_a*3, r3);
+ }
+ #endif
+ for ( ; k < vsz; k++ )
+ {
+ const float* inp = imgptr + j + k*inpPlaneSize;
+ float v0 = inp[0], v1 = inp[1], v2 = inp[2], v3 = inp[3];
+ rowbuf[k] = v0;
+ rowbuf[k + vsz_a] = v1;
+ rowbuf[k + vsz_a*2] = v2;
+ rowbuf[k + vsz_a*3] = v3;
+ }
+ j += 3;
+ rowbuf += vsz_a*3;
+ }
+ else
+ {
+ for ( k = 0; k < vsz; k++ )
+ {
+ rowbuf[k] = imgptr[j + k*inpPlaneSize];
+ }
+ }
+ }
+ }
+ else
for ( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
{
int delta = std::min(ofs1 - ofs, outW - out_j);
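For reference, the v_transpose4x4 step in the new 1x1, stride-1 im2row path gathers four consecutive output pixels from four consecutive input channels and writes them into four consecutive im2row rows. A scalar sketch of the same gather follows; the helper name and signature are illustrative, not part of the patch.

// Scalar equivalent of the 4x4 SIMD transpose in the 1x1/stride-1 path above
// (illustrative only). rowbuf points at the im2row row of output pixel j,
// vsz_a is the aligned row length, inpPlaneSize the size of one input plane.
static void gather4x4(const float* imgptr, int inpPlaneSize,
                      float* rowbuf, int vsz_a, int j, int k)
{
    for (int jj = 0; jj < 4; jj++)        // 4 consecutive output pixels
        for (int kk = 0; kk < 4; kk++)    // 4 consecutive input channels
            rowbuf[jj*vsz_a + k + kk] = imgptr[(j + jj) + (k + kk)*inpPlaneSize];
}

Keeping each im2row row contiguous over channels is what lets the dot-product kernels further down consume the buffer unchanged.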
@@ -976,7 +1184,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl

// now compute dot product of the weights
// and im2row-transformed part of the tensor
- int bsz = ofs1 - ofs0;
#if CV_TRY_AVX512_SKX
/* AVX512 convolution requires an alignment of 16, and ROI is only there for larger vector sizes */
if (useAVX512)