
Commit 1067cd0
Merge pull request opencv#18036 from alalek:backport_17858
2 parents 5b5c42d + 1537ecd
2 files changed (+499, -36 lines)

modules/dnn/src/layers/convolution_layer.cpp

Lines changed: 234 additions & 27 deletions
@@ -648,6 +648,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         bool useAVX;
         bool useAVX2;
         bool useAVX512;
+        int blk_size_cn;
 
         ParallelConv()
             : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
@@ -704,12 +705,17 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D;
         p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;
 
-        int ncn = std::min(inpCn, (int)BLK_SIZE_CN);
-
         int kernel_d = !isConv2D? kernel_size[0] : 1;
         int kernel_h = kernel_size[kernel_size.size() - 2];
         int kernel_w = kernel_size.back();
 
+        int blk_size_cn0 = cvCeil(800./(kernel_w*kernel_h));
+        int ncn = 16;
+        while (ncn*2 < blk_size_cn0 && ncn < inpCn)
+            ncn *= 2;
+        ncn = std::min(ncn, inpCn);
+        p.blk_size_cn = ncn;
+
         int dil_d = !isConv2D? dilations[0] : 1;
         int dil_h = dilations[dilations.size() - 2];
         int dil_w = dilations.back();
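
The hunk above replaces the fixed BLK_SIZE_CN channel block with a per-layer value: starting from 16 channels, the block is doubled while one im2row row (kernel_w*kernel_h*ncn floats) stays well below roughly 800 elements, then clamped to the real channel count. A minimal standalone sketch of the same arithmetic (the helper name chooseBlockCn is ours; std::ceil stands in for cvCeil):

    // Sketch of the channel-block heuristic above; chooseBlockCn is a
    // hypothetical name, the arithmetic mirrors the patch.
    #include <algorithm>
    #include <cmath>

    static int chooseBlockCn(int kernel_w, int kernel_h, int inpCn)
    {
        // target: roughly 800 floats per im2row row (karea*ncn)
        int blk_size_cn0 = (int)std::ceil(800./(kernel_w*kernel_h));
        int ncn = 16;
        while (ncn*2 < blk_size_cn0 && ncn < inpCn)
            ncn *= 2;                  // grow in powers of two
        return std::min(ncn, inpCn);   // never exceed the input channel count
    }

For a 3x3 kernel, blk_size_cn0 is 89 and the block stops at 64 channels; for a 1x1 kernel it is 800, so the block can grow to 512 channels before the clamp.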
@@ -777,18 +783,26 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         int dilation_w = dilations.back();
 
         int i, j, k, d;
-        size_t inpPlaneSize = input_->total(2);
-        size_t outPlaneSize = output_->total(2);
+        int inpPlaneSize = (int)input_->total(2);
+        int outPlaneSize = (int)output_->total(2);
         bool is1x1 = is1x1_;
 
         int stripesPerSample;
-        size_t stripeSize;
+        int stripeSize;
         Range r = r0;
-
-        if( nstripes >= batchSize*2 )
+        bool depthWiseConvolution = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 &&
+            outCn == 1 && kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 &&
+            width >= 16 + dilation_w*(kernel_w - 1);
+        // for now only 3x3 depth-wise convolutions are supported
+        depthWiseConvolution = depthWiseConvolution && kernel_w == 3 && kernel_h == 3 &&
+            // computing at most 1 pixel from each side can involve padding
+            max(stride_w, dilation_w) >= pad_l && max(stride_h, dilation_h) >= pad_t &&
+            pad_l <= 1 && pad_t <= 1;
+
+        if( !depthWiseConvolution && nstripes >= batchSize*2 )
         {
             stripesPerSample = nstripes/batchSize;
-            stripeSize = alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
+            stripeSize = (int)alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
             stripeSize = std::min(stripeSize, outPlaneSize);
         }
         else
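
This hunk teaches the stripe scheduler to recognize depth-wise layers before partitioning work: ngroups > 1 with exactly one input and one output channel per group, a 2D 3x3 kernel, rows wide enough for the vectorized body, and at most one padded pixel per side, so only the first and last output in a row or column can touch the border. When the test passes, striping is disabled and each output plane is handled as one block. A condensed restatement of the predicate, wrapped in a hypothetical helper (the conditions are copied from the patch; std::max replaces the unqualified max):

    #include <algorithm>

    // Hedged restatement of the eligibility test above; isDepthWise3x3 is ours.
    static bool isDepthWise3x3(bool is1x1, bool isConv2D, int ngroups, int inpCn, int outCn,
                               int kernel_w, int kernel_h, int kernel_d,
                               int stride_w, int stride_h, int stride_d,
                               int dilation_w, int dilation_h, int dilation_d,
                               int pad_l, int pad_t, int pad_d, int width)
    {
        bool dw = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 && outCn == 1 &&
                  kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 &&
                  width >= 16 + dilation_w*(kernel_w - 1);   // room for the SIMD body
        // for now only 3x3 kernels; at most one border pixel per side may be padded
        return dw && kernel_w == 3 && kernel_h == 3 &&
               std::max(stride_w, dilation_w) >= pad_l &&
               std::max(stride_h, dilation_h) >= pad_t &&
               pad_l <= 1 && pad_t <= 1;
    }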
@@ -807,20 +821,29 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         const float* biasptr_ = &biasvec_->at(0);
         const float* reluptr_ = reluslope_->empty() ? 0 : &reluslope_->at(0);
         float* data_out0_ = output_->ptr<float>();
-        size_t rowbufsz = (size_t)karea*BLK_SIZE_CN*BLK_SIZE;
-        AutoBuffer<float> rowbuf0_(rowbufsz + valign);
-        float* rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float)));
-
-        // we clear the buffer once; ultimately, it lets us to avoid
-        // tail processing after running the unrolled/vectorized loop.
-        // the main idea is to make sure that the tail (a.k.a. padding) of each row
-        // (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
-        // does not contain NaNs or Infs. Because the padding in the weights
-        // matrix is explicitly initialized with 0's, we handle all other
-        // cases nicely, i.e. we can skip expliciting re-initialization
-        // of the padding - we just retain elements from the previous iteration
-        // of the loop over channels (cn0).
-        memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) );
+        AutoBuffer<float> rowbuf0_;
+        float* rowbuf0 = 0;
+        bool use_rowbuf = !depthWiseConvolution;
+        int blk_size = depthWiseConvolution ? outPlaneSize : min((int)BLK_SIZE, stripeSize);
+
+        // im2row buffer is not used for depth-wise convolution
+        if(use_rowbuf)
+        {
+            size_t rowbufsz = alignSize(karea*blk_size_cn, valign)*min((int)BLK_SIZE, blk_size);
+            //printf("karea=%d, blk_size_cn=%d, rowbufsz=%d, stripeSize=%d\n", karea, blk_size_cn, (int)rowbufsz, stripeSize);
+            rowbuf0_.allocate(rowbufsz + valign);
+            rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float)));
+            // we clear the buffer once; ultimately, it lets us to avoid
+            // tail processing after running the unrolled/vectorized loop.
+            // the main idea is to make sure that the tail (a.k.a. padding) of each row
+            // (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
+            // does not contain NaNs or Infs. Because the padding in the weights
+            // matrix is explicitly initialized with 0's, we handle all other
+            // cases nicely, i.e. we can skip expliciting re-initialization
+            // of the padding - we just retain elements from the previous iteration
+            // of the loop over channels (cn0).
+            memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) );
+        }
 
         for( int stripe = r.start; stripe < r.end; stripe++ )
         {
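
Two things change here: the im2row buffer is only allocated when the layer actually uses it, and its size follows the per-layer blk_size_cn instead of the worst-case karea*BLK_SIZE_CN*BLK_SIZE. The alignment-and-memset dance is unchanged. A self-contained sketch of that dance (a sketch only; the helper name is ours and the values passed in are up to the caller):

    #include <cstring>
    #include <opencv2/core/utility.hpp>  // cv::AutoBuffer, cv::alignSize, cv::alignPtr

    static void rowbufSketch(int karea, int blk_size_cn, int blk_size, int valign)
    {
        // one padded row per output location in the block; every row starts on
        // a valign*sizeof(float)-byte boundary
        size_t rowbufsz = cv::alignSize(karea*blk_size_cn, valign) * (size_t)blk_size;
        cv::AutoBuffer<float> buf(rowbufsz + valign);
        float* rowbuf = cv::alignPtr(buf.data(), (int)(valign*sizeof(float)));
        // zeroed once: the tail of each row (indices between vsz and vsz_a) then
        // never holds NaN/Inf garbage, so the unrolled dot product needs no tail pass
        memset(rowbuf, 0, rowbufsz*sizeof(rowbuf[0]));
    }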
@@ -835,28 +858,213 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
             const float* wptr_orig = wptr_orig_ + wstep*startOutCn;
             const float* biasptr = biasptr_ + startOutCn;
 
-            for( int cn0 = 0; cn0 < inpCn; cn0 += BLK_SIZE_CN )
+            for( int cn0 = 0; cn0 < inpCn; cn0 += blk_size_cn )
             {
-                int cn1 = std::min(cn0 + BLK_SIZE_CN, inpCn);
+                int cn1 = std::min(cn0 + blk_size_cn, inpCn);
                 int ncn = cn1 - cn0, vsz = karea*ncn;
                 int vsz_a = (int)alignSize(vsz, valign);
                 const float* wptr = wptr_orig + cn0*karea;
                 // we apply [Channels][P]ReLU (if any) during the final pass only.
                 const float* relu = cn1 == inpCn && reluptr_ ? reluptr_ + startOutCn : 0;
 
-                for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += BLK_SIZE )
+                for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += blk_size )
                 {
-                    int ofs, ofs1 = std::min(ofs0 + BLK_SIZE, stripeEnd);
+                    int ofs, ofs1 = std::min(ofs0 + blk_size, stripeEnd);
+                    int bsz = ofs1 - ofs0;
 
                     int out_d = ofs0 / (outH * outW);
                     int out_i = (ofs0 - out_d * outH * outW) / outW;
                     int out_j = ofs0 % outW;
 
+                    if (depthWiseConvolution)
+                    {
+                        CV_Assert(out_i == 0 && out_j == 0);
+                        int in_d = out_d * stride_d - pad_d;
+                        const float* inptr_ = data_inp0 + (cn0*depth*height + in_d*height)*width;
+                        float* outptr_ = data_out0 + ofs0;
+
+                    #if CV_TRY_AVX2
+                        if(useAVX2)
+                            opt_AVX2::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                                stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                                biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
+                        else
+                    #endif
+                    #if CV_TRY_AVX
+                        if(useAVX)
+                            opt_AVX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                                stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                                biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
+                        else
+                    #endif
+                        {
+                            const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
+                                        w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
+                                        w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
+                            int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
+                            float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
+
+                            for (int out_i = 0; out_i < outH; out_i++)
+                            {
+                                int in_i = out_i * stride_h - pad_t, out_j = 0;
+                                const float* imgptr0 = inptr_ + in_i*width;
+                                const float* imgptr1 = imgptr0 + dilation_h*width;
+                                const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
+                                float out, w00 = w00_, w01 = w01_, w02 = w02_;
+                                float w20 = w20_, w21 = w21_, w22 = w22_;
+                                if (in_i < 0)
+                                {
+                                    w00 = w01 = w02 = 0.f;
+                                    imgptr0 = imgptr1;
+                                }
+                                else if (in_i + dilation_h*(kernel_h-1) >= height)
+                                {
+                                    w20 = w21 = w22 = 0.f;
+                                    imgptr2 = imgptr1;
+                                }
+                                float* outptr = outptr_ + out_i*outW;
+                                if (pad_l > 0)
+                                {
+                                    out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
+                                          imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
+                                          imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
+                                    if (relu)
+                                        out = out > 0.f ? out : out*relu_coeff;
+                                    outptr[0] = out;
+                                    out_j = 1;
+                                }
+
+                            #if CV_SIMD
+                                // maybe with AVX or AVX512 strided depthwise convolution
+                                // can be accelerated with vector code, but with 4xfloat vectors
+                                // it's hardly the case
+                                if( stride_w == 1 )
+                                {
+                                    const int VECSZ = v_float32::nlanes;
+                                    const int out_delta = VECSZ/stride_w;
+                                    v_float32 vw00 = vx_setall_f32(w00), vw01 = vx_setall_f32(w01), vw02 = vx_setall_f32(w02),
+                                              vw10 = vx_setall_f32(w10), vw11 = vx_setall_f32(w11), vw12 = vx_setall_f32(w12),
+                                              vw20 = vx_setall_f32(w20), vw21 = vx_setall_f32(w21), vw22 = vx_setall_f32(w22);
+                                    v_float32 z = vx_setzero_f32(), vbias = vx_setall_f32(bias), vrc = vx_setall_f32(relu_coeff);
+                                    for( ; out_j < outW1; out_j += out_delta )
+                                    {
+                                        if (out_j + out_delta > outW1)
+                                        {
+                                            if (out_j <= pad_l)
+                                                break;
+                                            out_j = outW1 - out_delta;
+                                        }
+                                        int in_j = out_j * stride_w - pad_l;
+                                        v_float32 v00 = vx_load(imgptr0 + in_j),
+                                                  v01 = vx_load(imgptr0 + in_j + dilation_w),
+                                                  v02 = vx_load(imgptr0 + in_j + dilation_w*2),
+                                                  v10 = vx_load(imgptr1 + in_j),
+                                                  v11 = vx_load(imgptr1 + in_j + dilation_w),
+                                                  v12 = vx_load(imgptr1 + in_j + dilation_w*2),
+                                                  v20 = vx_load(imgptr2 + in_j),
+                                                  v21 = vx_load(imgptr2 + in_j + dilation_w),
+                                                  v22 = vx_load(imgptr2 + in_j + dilation_w*2);
+
+                                        v_float32 vout = v00*vw00 + v01*vw01 + v02*vw02 +
+                                                         v10*vw10 + v11*vw11 + v12*vw12 +
+                                                         v20*vw20 + v21*vw21 + v22*vw22 + vbias;
+                                        if (relu)
+                                            vout = v_select(vout > z, vout, vout*vrc);
+                                        vx_store(outptr + out_j, vout);
+                                    }
+                                }
+                            #endif
+                                for (; out_j < outW1; out_j++)
+                                {
+                                    int in_j = out_j * stride_w - pad_l;
+                                    out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
+                                          imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
+                                          imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
+                                    if (relu)
+                                        out = out > 0.f ? out : out*relu_coeff;
+                                    outptr[out_j] = out;
+                                }
+
+                                for (; out_j < outW; out_j++ )
+                                {
+                                    int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
+                                    float s0 = 1.f, s1 = 1.f, s2 = 1.f;
+                                    if (in_j0 >= width)
+                                    {
+                                        in_j0 = 0;
+                                        s0 = 0.f;
+                                    }
+                                    if (in_j1 >= width)
+                                    {
+                                        in_j1 = 0;
+                                        s1 = 0.f;
+                                    }
+                                    if (in_j2 >= width)
+                                    {
+                                        in_j2 = 0;
+                                        s2 = 0.f;
+                                    }
+                                    out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
+                                          imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
+                                          imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
+                                    if (relu)
+                                        out = out > 0.f ? out : out*relu_coeff;
+                                    outptr[out_j] = out;
+                                }
+                            }
+                        }
+                        continue;
+                    }
+
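The scalar fallback in the branch above handles every border without per-pixel branching: a kernel row that falls wholly into top or bottom padding is disabled by zeroing its three weights and aliasing its row pointer to a valid row, the one possibly-padded left pixel gets a prologue, and right-border pixels multiply out-of-range taps by s0/s1/s2 masks of 0. The vertical trick in isolation (a sketch with names of our choosing):

    // A padded kernel row contributes exactly 0 and its loads stay in bounds.
    static void clampKernelRows(int in_i, int dilation_h, int kernel_h, int height,
                                const float*& imgptr0, const float* imgptr1,
                                const float*& imgptr2,
                                float& w00, float& w01, float& w02,
                                float& w20, float& w21, float& w22)
    {
        if (in_i < 0)                                        // top tap above the image
        {
            w00 = w01 = w02 = 0.f;
            imgptr0 = imgptr1;
        }
        else if (in_i + dilation_h*(kernel_h - 1) >= height) // bottom tap below it
        {
            w20 = w21 = w22 = 0.f;
            imgptr2 = imgptr1;
        }
    }

The remainder of the hunk, the new im2row fast path, continues below.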
                     // do im2row for a part of input tensor
                     float* rowbuf = rowbuf0;
 
                     if (isConv2D)
                     {
+                        if( is1x1 && stride_w == 1 && stride_h == 1 )
+                        {
+                            const float* imgptr = data_inp0 + (cn0*height + out_i)*width + out_j;
+                            for( int j = 0; j < bsz; j++, rowbuf += vsz_a )
+                            {
+                                if( j + 4 <= bsz )
+                                {
+                                    k = 0;
+                                #if CV_SIMD128
+                                    for( ; k <= vsz - 4; k += 4 )
+                                    {
+                                        const float* inp = imgptr + j + k*inpPlaneSize;
+                                        v_float32x4 p0 = v_load(inp), p1 = v_load(inp + inpPlaneSize);
+                                        v_float32x4 p2 = v_load(inp + inpPlaneSize*2), p3 = v_load(inp + inpPlaneSize*3);
+                                        v_float32x4 r0, r1, r2, r3;
+                                        v_transpose4x4(p0, p1, p2, p3, r0, r1, r2, r3);
+                                        v_store(rowbuf + k, r0);
+                                        v_store(rowbuf + k + vsz_a, r1);
+                                        v_store(rowbuf + k + vsz_a*2, r2);
+                                        v_store(rowbuf + k + vsz_a*3, r3);
+                                    }
+                                #endif
+                                    for( ; k < vsz; k++ )
+                                    {
+                                        const float* inp = imgptr + j + k*inpPlaneSize;
+                                        float v0 = inp[0], v1 = inp[1], v2 = inp[2], v3 = inp[3];
+                                        rowbuf[k] = v0;
+                                        rowbuf[k + vsz_a] = v1;
+                                        rowbuf[k + vsz_a*2] = v2;
+                                        rowbuf[k + vsz_a*3] = v3;
+                                    }
+                                    j += 3;
+                                    rowbuf += vsz_a*3;
+                                }
+                                else
+                                {
+                                    for( k = 0; k < vsz; k++ )
+                                    {
+                                        rowbuf[k] = imgptr[j + k*inpPlaneSize];
+                                    }
+                                }
+                            }
+                        }
+                        else
                         for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
                         {
                             int delta = std::min(ofs1 - ofs, outW - out_j);
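
The other addition in this hunk is a fast im2row path for 1x1 stride-1 kernels, where im2row degenerates into a (channels x pixels) to (pixels x channels) transpose; the committed code handles four pixels per step with v_transpose4x4 so that both loads and stores stay contiguous. A scalar model of what the path writes (function name ours):

    // Scalar model of the 1x1/stride-1 fast path above: gather each output
    // pixel's value across vsz channels into one padded rowbuf row.
    static void im2row1x1(const float* imgptr, int inpPlaneSize,
                          int bsz /* pixels */, int vsz /* channels */,
                          int vsz_a /* aligned row stride */, float* rowbuf)
    {
        for (int j = 0; j < bsz; j++, rowbuf += vsz_a)
            for (int k = 0; k < vsz; k++)
                rowbuf[k] = imgptr[j + k*inpPlaneSize];
    }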
@@ -976,7 +1184,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
 
                     // now compute dot product of the weights
                     // and im2row-transformed part of the tensor
-                    int bsz = ofs1 - ofs0;
                 #if CV_TRY_AVX512_SKX
                     /* AVX512 convolution requires an alignment of 16, and ROI is only there for larger vector sizes */
                     if(useAVX512)
