
Commit e551d15

enabled convolution & activation fusion (#1245)
* Enabled convolution & activation fusion.
* A few more optimizations:
  + Optimized the common case when the indices of a max pooling layer are not used; in this case the more efficient branch that computes just the maximums over the aperture is used.
  + Optimized the convolution + activation fusion when the activation is ReLU, which is another common case.
  + Convolution can now be fused with batch norm. This is a zero-cost fusion, and if the batch norm is followed by ReLU, all three (conv + batchnorm + relu) are fused together. This modification significantly improved ENet performance.
* Hopefully fixed warnings on Windows.
1 parent 62ba5d7 commit e551d15
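
For context on the "zero-cost" convolution + batch-norm fusion mentioned above: at inference time batch normalization reduces to a per-channel scale and shift, which can be folded directly into the convolution's weights and bias ahead of time, so the fused network does no extra per-pixel work. The following standalone C++ sketch only illustrates the folding; it is not code from this commit, and all names (foldBatchNorm, weights, bias, scale, shift) are hypothetical.

#include <cstddef>
#include <vector>

// Hypothetical illustration: fold a per-channel affine transform
//   y_c = scale[c] * x_c + shift[c]   (what batch norm reduces to at inference time)
// into convolution weights and biases, so no extra work is done per output pixel.
static void foldBatchNorm(std::vector<float>& weights,        // [outCh * weightsPerOutCh]
                          std::vector<float>& bias,           // [outCh]
                          const std::vector<float>& scale,
                          const std::vector<float>& shift,
                          size_t outCh, size_t weightsPerOutCh)
{
    for (size_t c = 0; c < outCh; c++)
    {
        // scaling the convolution output of channel c is the same as
        // scaling every weight that contributes to that channel
        for (size_t i = 0; i < weightsPerOutCh; i++)
            weights[c * weightsPerOutCh + i] *= scale[c];
        // the shift (plus the scaled original bias) becomes the new bias
        bias[c] = bias[c] * scale[c] + shift[c];
    }
}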

8 files changed: +365, -449 lines

modules/dnn/include/opencv2/dnn/all_layers.hpp

Lines changed: 4 additions & 0 deletions
@@ -202,11 +202,13 @@ namespace dnn
     };

     class CV_EXPORTS ActivationLayer;
+    class CV_EXPORTS BatchNormLayer;

     class CV_EXPORTS ConvolutionLayer : public BaseConvolutionLayer
     {
     public:
         virtual bool setActivation(const Ptr<ActivationLayer>& layer) = 0;
+        virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer) = 0;

         static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
     };
@@ -247,6 +249,7 @@ namespace dnn
         int type;
         Size kernel, stride, pad;
         bool globalPooling;
+        bool computeMaxIdx;
         String padMode;

         static Ptr<PoolingLayer> create(const LayerParams& params);
@@ -414,6 +417,7 @@ namespace dnn
         bool hasWeights, hasBias;
         float epsilon;

+        virtual void getScaleShift(Mat& scale, Mat& shift) const = 0;
         static Ptr<BatchNormLayer> create(const LayerParams &params);
     };
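
The new hooks above (setBatchNorm, setActivation, getScaleShift, computeMaxIdx) let a convolution implementation run the batch-norm scale/shift and the activation as a small epilogue over its own output instead of as separate layers. A rough C++ sketch of such an epilogue for the common conv + batchnorm + ReLU case follows; it is an illustration only (the function name fusedReluEpilogue and its signature are hypothetical, not OpenCV API).

#include <algorithm>
#include <cstddef>

// Hypothetical epilogue over one output channel of a convolution:
//   out = relu(scale * conv_out + shift)
// In a fused path this runs right after the channel is computed,
// while the data is still hot in cache.
static void fusedReluEpilogue(float* channelData, size_t planeSize,
                              float scale, float shift)
{
    for (size_t i = 0; i < planeSize; i++)
        channelData[i] = std::max(channelData[i] * scale + shift, 0.f);
}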

modules/dnn/src/dnn.cpp

Lines changed: 69 additions & 1 deletion
@@ -324,6 +324,7 @@ struct LayerData
         //add logging info
         params.name = name;
         params.type = type;
+        skip = false;
     }

     int id;
@@ -334,6 +335,7 @@ struct LayerData
     std::vector<LayerPin> inputBlobsId;
     std::set<int> inputLayersId;
     std::set<int> requiredOutputs;
+    std::vector<LayerPin> consumers;

     Ptr<Layer> layerInstance;
     std::vector<Mat> outputBlobs;
@@ -345,6 +347,7 @@ struct LayerData
     std::map<int, bool> skipFlags;

     int flag;
+    bool skip;

     Ptr<Layer> getLayerInstance()
     {
@@ -835,6 +838,7 @@ struct Net::Impl

         addLayerInput(ldInp, inNum, LayerPin(outLayerId, outNum));
         ldOut.requiredOutputs.insert(outNum);
+        ldOut.consumers.push_back(LayerPin(inLayerId, outNum));
     }

     void computeNetOutputLayers()
@@ -1034,15 +1038,79 @@ struct Net::Impl
             int lid = it->first;
             allocateLayer(lid, layersShapes);
         }
+
+        // scan through all the layers: if there is a convolution layer followed by an activation layer,
+        // we try to embed the activation into the convolution and disable separate execution of the activation
+        std::vector<String> outnames;
+        for (it = layers.begin(); it != layers.end(); it++)
+        {
+            int lid = it->first;
+            LayerData& ld = layers[lid];
+            if( ld.skip )
+            {
+                //printf("skipping %s\n", ld.layerInstance->name.c_str());
+                continue;
+            }
+            //printf("analyzing %s\n", ld.layerInstance->name.c_str());
+            if( ld.consumers.size() == 0 )
+                outnames.push_back(ld.layerInstance->name);
+            Ptr<ConvolutionLayer> convLayer = ld.layerInstance.dynamicCast<ConvolutionLayer>();
+            if( !convLayer.empty() && ld.consumers.size() == 1 )
+            {
+                LayerData* nextData = &layers[ld.consumers[0].lid];
+                Ptr<BatchNormLayer> nextBNormLayer =
+                    nextData->layerInstance.dynamicCast<BatchNormLayer>();
+                if( !nextBNormLayer.empty() )
+                {
+                    LayerData* bnormData = nextData;
+                    nextData = 0;
+                    if( convLayer->setBatchNorm(nextBNormLayer) )
+                    {
+                        //printf("fused convolution (%s) and batch norm (%s)\n", convLayer->name.c_str(), nextBNormLayer->name.c_str());
+                        bnormData->skip = true;
+                        if( bnormData->consumers.size() == 1 )
+                            nextData = &layers[bnormData->consumers[0].lid];
+                    }
+                }
+
+                Ptr<ActivationLayer> nextActivLayer;
+                if( nextData )
+                    nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
+
+                if( !nextActivLayer.empty() && convLayer->setActivation(nextActivLayer) )
+                {
+                    //printf("fused convolution (%s) and activation (%s)\n", convLayer->name.c_str(), nextActivLayer->name.c_str());
+                    nextData->skip = true;
+                }
+            }
+            Ptr<PoolingLayer> poolingLayer = ld.layerInstance.dynamicCast<PoolingLayer>();
+            if( !poolingLayer.empty() && !ld.consumers.empty() )
+            {
+                size_t i = 0, nconsumers = ld.consumers.size();
+                for( ; i < nconsumers; i++ )
+                    if( ld.consumers[i].oid > 0 )
+                        break;
+                // if there is no layer that takes the second output pin of the pooling layer
+                // as input, then we don't need to compute the indices
+                if( i >= nconsumers )
+                    poolingLayer->computeMaxIdx = false;
+            }
+        }
+        /*printf("outputs: ");
+        for( size_t j = 0; j < outnames.size(); j++ )
+            printf("%s ", outnames[j].c_str());
+        printf("\n");*/
     }

     void forwardLayer(LayerData &ld)
     {
         Ptr<Layer> layer = ld.layerInstance;
+
         if (preferableBackend == DNN_BACKEND_DEFAULT ||
             !layer->supportBackend(preferableBackend))
         {
-            layer->forward(ld.inputBlobs, ld.outputBlobs, ld.internals);
+            if( !ld.skip )
+                layer->forward(ld.inputBlobs, ld.outputBlobs, ld.internals);
         }
         else if (!ld.skipFlags[preferableBackend])
         {
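
The computeMaxIdx flag cleared in the scan above selects the cheaper pooling branch described in the commit message: when no consumer reads the pooling layer's second output (the argmax indices), only the maxima over each aperture need to be tracked. Below is a standalone C++ sketch of the two branches, with hypothetical names (poolMax, aperture); it is not the actual OpenCV pooling kernel.

#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical max pooling over one aperture: when the caller never reads the
// index of the maximum, the extra bookkeeping can be skipped entirely.
static float poolMax(const float* src, const std::vector<size_t>& aperture,
                     bool computeMaxIdx, size_t* maxIdx /* may be null */)
{
    float best = src[aperture[0]];
    if (!computeMaxIdx)
    {
        // common case: indices are never consumed, so just take the maximum
        for (size_t i = 1; i < aperture.size(); i++)
            best = std::max(best, src[aperture[i]]);
        return best;
    }
    // slower branch: also remember where the maximum was found
    size_t bestIdx = aperture[0];
    for (size_t i = 1; i < aperture.size(); i++)
    {
        if (src[aperture[i]] > best)
        {
            best = src[aperture[i]];
            bestIdx = aperture[i];
        }
    }
    if (maxIdx)
        *maxIdx = bestIdx;
    return best;
}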

modules/dnn/src/layers/batch_norm_layer.cpp

Lines changed: 59 additions & 25 deletions
@@ -21,6 +21,8 @@ namespace dnn
 class BatchNormLayerImpl : public BatchNormLayer
 {
 public:
+    Mat weights_, bias_;
+
     BatchNormLayerImpl(const LayerParams& params)
     {
         setParamsFrom(params);
@@ -29,6 +31,60 @@ class BatchNormLayerImpl : public BatchNormLayer
         hasWeights = params.get<bool>("has_weight", false);
         hasBias = params.get<bool>("has_bias", false);
         epsilon = params.get<float>("eps", 1E-5);
+
+        size_t n = blobs[0].total();
+        CV_Assert(blobs[1].total() == n &&
+                  blobs[0].isContinuous() && blobs[1].isContinuous() &&
+                  blobs[0].type() == CV_32F && blobs[1].type() == CV_32F);
+
+        float varMeanScale = 1.f;
+        if (!hasWeights && !hasBias) {
+            CV_Assert(blobs[2].type() == CV_32F);
+            varMeanScale = blobs[2].at<float>(0);
+            if (varMeanScale != 0)
+                varMeanScale = 1/varMeanScale;
+        }
+
+        const int weightsBlobIndex = 2;
+        const int biasBlobIndex = weightsBlobIndex + hasWeights;
+
+        if( hasWeights )
+        {
+            CV_Assert((size_t)weightsBlobIndex < blobs.size());
+            const Mat& w = blobs[weightsBlobIndex];
+            CV_Assert(w.isContinuous() && w.type() == CV_32F && w.total() == (size_t)n);
+        }
+
+        if( hasBias )
+        {
+            CV_Assert((size_t)biasBlobIndex < blobs.size());
+            const Mat& b = blobs[biasBlobIndex];
+            CV_Assert(b.isContinuous() && b.type() == CV_32F && b.total() == (size_t)n);
+        }
+
+        const float* meanData = blobs[0].ptr<float>();
+        const float* stdData = blobs[1].ptr<float>();
+        const float* weightsData = hasWeights ? blobs[weightsBlobIndex].ptr<float>() : 0;
+        const float* biasData = hasBias ? blobs[biasBlobIndex].ptr<float>() : 0;
+
+        weights_.create(1, (int)n, CV_32F);
+        bias_.create(1, (int)n, CV_32F);
+
+        float* dstWeightsData = weights_.ptr<float>();
+        float* dstBiasData = bias_.ptr<float>();
+
+        for (size_t i = 0; i < n; ++i)
+        {
+            float w = (hasWeights ? weightsData[i] : 1.0f) / sqrt(stdData[i] * varMeanScale + epsilon);
+            dstWeightsData[i] = w;
+            dstBiasData[i] = (hasBias ? biasData[i] : 0.0f) - w * meanData[i] * varMeanScale;
+        }
+    }
+
+    void getScaleShift(Mat& scale, Mat& shift) const
+    {
+        scale = weights_;
+        shift = bias_;
     }

     bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -51,45 +107,23 @@ class BatchNormLayerImpl : public BatchNormLayer
         CV_Assert(blobs.size() >= 2);
         CV_Assert(inputs.size() == 1);

-        float varMeanScale = 1.f;
-        if (!hasWeights && !hasBias) {
-            varMeanScale = *blobs[2].ptr<float>();
-            if (varMeanScale != 0)
-                varMeanScale = 1/varMeanScale;
-        }
-
-        Mat invStdMat;
-        cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
-
         Mat &inpBlob = *inputs[0];
-
-        int weightsBlobIndex = 2;
-        int biasBlobIndex = weightsBlobIndex + hasWeights;
-
         int rows = inpBlob.size[2];
         int cols = inpBlob.size[3];

         for (size_t ii = 0; ii < outputs.size(); ii++)
         {
             Mat &outBlob = outputs[ii];

-            if (hasWeights)
-                CV_Assert(inpBlob.size[1] == blobs[weightsBlobIndex].total());
-
-            if (hasBias)
-                CV_Assert(inpBlob.size[1] == blobs[biasBlobIndex].total());
-
             for(int num = 0; num < outBlob.size[0]; num++)
             {
                 for (int n = 0; n < outBlob.size[1]; n++)
                 {
-                    float mean = blobs[0].at<float>(n)*varMeanScale;
-                    double invstd = invStdMat.at<float>(n);
-                    float w = hasWeights ? blobs[weightsBlobIndex].at<float>(n) : 1;
-                    float b = hasBias ? blobs[biasBlobIndex].at<float>(n) : 0;
+                    float w = weights_.at<float>(n);
+                    float b = bias_.at<float>(n);
                     Mat inpBlobPlane(rows, cols, CV_32F, inpBlob.ptr<float>(num, n));
                     Mat outBlobPlane(rows, cols, CV_32F, outBlob.ptr<float>(num, n));
-                    inpBlobPlane.convertTo(outBlobPlane, CV_32F, w*invstd, b - mean*w*invstd);
+                    inpBlobPlane.convertTo(outBlobPlane, CV_32F, w, b);
                 }
             }
         }
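
The constructor added above folds the stored mean, variance and optional affine parameters into a per-channel scale w = gamma / sqrt(var * varMeanScale + eps) and shift b = beta - w * mean * varMeanScale, so forward() degenerates to one convertTo per plane. The small standalone C++ check below compares that algebra with the old per-element formula; the values are hypothetical and the snippet is not part of the commit.

#include <cmath>
#include <cstdio>

int main()
{
    // hypothetical per-channel statistics and affine parameters
    float mean = 0.3f, var = 2.0f, gamma = 1.5f, beta = -0.2f;
    float varMeanScale = 1.0f, eps = 1e-5f, x = 0.7f;

    // original formulation: normalize, then scale and shift
    float invStd = 1.0f / std::sqrt(var * varMeanScale + eps);
    float yOld = (x - mean * varMeanScale) * invStd * gamma + beta;

    // precomputed formulation used by the new code path
    float w = gamma * invStd;
    float b = beta - w * mean * varMeanScale;
    float yNew = w * x + b;

    std::printf("%g vs %g\n", yOld, yNew);  // the two agree up to rounding
    return 0;
}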
