
Commit e551d15

enabled convolution & activation fusion (#1245)
* Enabled convolution & activation fusion.
* A few more optimizations:
  + Optimized the common case when the indices of a max pooling layer are not used; in this case the more efficient branch that computes just the maximums over the aperture is used.
  + Optimized the convolution + activation fusion when the activation is ReLU, which is another common case.
  + Convolution can now be fused with batch norm. This is a zero-cost fusion, and if the batch norm is followed by ReLU, all three (conv + batchnorm + relu) are fused together. This modification significantly improved ENet performance.
* Hopefully fixed warnings on Windows.
1 parent 62ba5d7 commit e551d15
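
For context on the "zero-cost" convolution + batch-norm fusion mentioned above: at inference time batch normalization reduces to a per-channel scale and shift, which can be folded directly into the convolution's weights and bias ahead of time, so the fused network does no extra per-pixel work. The following standalone C++ sketch only illustrates the folding; it is not code from this commit, and all names (foldBatchNorm, weights, bias, scale, shift) are hypothetical.

#include <cstddef>
#include <vector>

// Hypothetical illustration: fold a per-channel affine transform
//   y_c = scale[c] * x_c + shift[c]   (what batch norm reduces to at inference time)
// into convolution weights and biases, so no extra work is done per output pixel.
static void foldBatchNorm(std::vector<float>& weights,        // [outCh * weightsPerOutCh]
                          std::vector<float>& bias,           // [outCh]
                          const std::vector<float>& scale,
                          const std::vector<float>& shift,
                          size_t outCh, size_t weightsPerOutCh)
{
    for (size_t c = 0; c < outCh; c++)
    {
        // scaling the convolution output of channel c is the same as
        // scaling every weight that contributes to that channel
        for (size_t i = 0; i < weightsPerOutCh; i++)
            weights[c * weightsPerOutCh + i] *= scale[c];
        // the shift (plus the scaled original bias) becomes the new bias
        bias[c] = bias[c] * scale[c] + shift[c];
    }
}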

8 files changed: +365, -449 lines

modules/dnn/include/opencv2/dnn/all_layers.hpp

Lines changed: 4 additions & 0 deletions
@@ -202,11 +202,13 @@ namespace dnn
     };

     class CV_EXPORTS ActivationLayer;
+    class CV_EXPORTS BatchNormLayer;

     class CV_EXPORTS ConvolutionLayer : public BaseConvolutionLayer
     {
     public:
         virtual bool setActivation(const Ptr<ActivationLayer>& layer) = 0;
+        virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer) = 0;

         static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
     };
@@ -247,6 +249,7 @@ namespace dnn
         int type;
         Size kernel, stride, pad;
         bool globalPooling;
+        bool computeMaxIdx;
         String padMode;

         static Ptr<PoolingLayer> create(const LayerParams& params);
@@ -414,6 +417,7 @@ namespace dnn
         bool hasWeights, hasBias;
         float epsilon;

+        virtual void getScaleShift(Mat& scale, Mat& shift) const = 0;
         static Ptr<BatchNormLayer> create(const LayerParams &params);
     };
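
The new hooks above (setBatchNorm, setActivation, getScaleShift, computeMaxIdx) let a convolution implementation run the batch-norm scale/shift and the activation as a small epilogue over its own output instead of as separate layers. A rough C++ sketch of such an epilogue for the common conv + batchnorm + ReLU case follows; it is an illustration only (the function name fusedReluEpilogue and its signature are hypothetical, not OpenCV API).

#include <algorithm>
#include <cstddef>

// Hypothetical epilogue over one output channel of a convolution:
//   out = relu(scale * conv_out + shift)
// In a fused path this runs right after the channel is computed,
// while the data is still hot in cache.
static void fusedReluEpilogue(float* channelData, size_t planeSize,
                              float scale, float shift)
{
    for (size_t i = 0; i < planeSize; i++)
        channelData[i] = std::max(channelData[i] * scale + shift, 0.f);
}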

modules/dnn/src/dnn.cpp

Lines changed: 69 additions & 1 deletion
@@ -324,6 +324,7 @@ struct LayerData
         //add logging info
         params.name = name;
         params.type = type;
+        skip = false;
     }

     int id;
@@ -334,6 +335,7 @@ struct LayerData
     std::vector<LayerPin> inputBlobsId;
     std::set<int> inputLayersId;
     std::set<int> requiredOutputs;
+    std::vector<LayerPin> consumers;

     Ptr<Layer> layerInstance;
     std::vector<Mat> outputBlobs;
@@ -345,6 +347,7 @@ struct LayerData
     std::map<int, bool> skipFlags;

     int flag;
+    bool skip;

     Ptr<Layer> getLayerInstance()
     {
@@ -835,6 +838,7 @@ struct Net::Impl

         addLayerInput(ldInp, inNum, LayerPin(outLayerId, outNum));
         ldOut.requiredOutputs.insert(outNum);
+        ldOut.consumers.push_back(LayerPin(inLayerId, outNum));
     }

     void computeNetOutputLayers()
@@ -1034,15 +1038,79 @@ struct Net::Impl
             int lid = it->first;
             allocateLayer(lid, layersShapes);
         }
+
+        // scan through all the layers: if there is a convolution layer followed by an activation layer,
+        // we try to embed the activation into the convolution and disable separate execution of the activation
+        std::vector<String> outnames;
+        for (it = layers.begin(); it != layers.end(); it++)
+        {
+            int lid = it->first;
+            LayerData& ld = layers[lid];
+            if( ld.skip )
+            {
+                //printf("skipping %s\n", ld.layerInstance->name.c_str());
+                continue;
+            }
+            //printf("analyzing %s\n", ld.layerInstance->name.c_str());
+            if( ld.consumers.size() == 0 )
+                outnames.push_back(ld.layerInstance->name);
+            Ptr<ConvolutionLayer> convLayer = ld.layerInstance.dynamicCast<ConvolutionLayer>();
+            if( !convLayer.empty() && ld.consumers.size() == 1 )
+            {
+                LayerData* nextData = &layers[ld.consumers[0].lid];
+                Ptr<BatchNormLayer> nextBNormLayer =
+                    nextData->layerInstance.dynamicCast<BatchNormLayer>();
+                if( !nextBNormLayer.empty() )
+                {
+                    LayerData* bnormData = nextData;
+                    nextData = 0;
+                    if( convLayer->setBatchNorm(nextBNormLayer) )
+                    {
+                        //printf("fused convolution (%s) and batch norm (%s)\n", convLayer->name.c_str(), nextBNormLayer->name.c_str());
+                        bnormData->skip = true;
+                        if( bnormData->consumers.size() == 1 )
+                            nextData = &layers[bnormData->consumers[0].lid];
+                    }
+                }
+
+                Ptr<ActivationLayer> nextActivLayer;
+                if( nextData )
+                    nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
+
+                if( !nextActivLayer.empty() && convLayer->setActivation(nextActivLayer) )
+                {
+                    //printf("fused convolution (%s) and activation (%s)\n", convLayer->name.c_str(), nextActivLayer->name.c_str());
+                    nextData->skip = true;
+                }
+            }
+            Ptr<PoolingLayer> poolingLayer = ld.layerInstance.dynamicCast<PoolingLayer>();
+            if( !poolingLayer.empty() && !ld.consumers.empty() )
+            {
+                size_t i = 0, nconsumers = ld.consumers.size();
+                for( ; i < nconsumers; i++ )
+                    if( ld.consumers[i].oid > 0 )
+                        break;
+                // if there is no layer that takes the second output pin of the pooling layer
+                // as input, then we don't need to compute the indices
+                if( i >= nconsumers )
+                    poolingLayer->computeMaxIdx = false;
+            }
+        }
+        /*printf("outputs: ");
+        for( size_t j = 0; j < outnames.size(); j++ )
+            printf("%s ", outnames[j].c_str());
+        printf("\n");*/
     }

     void forwardLayer(LayerData &ld)
     {
         Ptr<Layer> layer = ld.layerInstance;
+
         if (preferableBackend == DNN_BACKEND_DEFAULT ||
             !layer->supportBackend(preferableBackend))
         {
-            layer->forward(ld.inputBlobs, ld.outputBlobs, ld.internals);
+            if( !ld.skip )
+                layer->forward(ld.inputBlobs, ld.outputBlobs, ld.internals);
         }
         else if (!ld.skipFlags[preferableBackend])
         {
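
The computeMaxIdx flag cleared in the scan above selects the cheaper pooling branch described in the commit message: when no consumer reads the pooling layer's second output (the argmax indices), only the maxima over each aperture need to be tracked. Below is a standalone C++ sketch of the two branches, with hypothetical names (poolMax, aperture); it is not the actual OpenCV pooling kernel.

#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical max pooling over one aperture: when the caller never reads the
// index of the maximum, the extra bookkeeping can be skipped entirely.
static float poolMax(const float* src, const std::vector<size_t>& aperture,
                     bool computeMaxIdx, size_t* maxIdx /* may be null */)
{
    float best = src[aperture[0]];
    if (!computeMaxIdx)
    {
        // common case: indices are never consumed, so just take the maximum
        for (size_t i = 1; i < aperture.size(); i++)
            best = std::max(best, src[aperture[i]]);
        return best;
    }
    // slower branch: also remember where the maximum was found
    size_t bestIdx = aperture[0];
    for (size_t i = 1; i < aperture.size(); i++)
    {
        if (src[aperture[i]] > best)
        {
            best = src[aperture[i]];
            bestIdx = aperture[i];
        }
    }
    if (maxIdx)
        *maxIdx = bestIdx;
    return best;
}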

modules/dnn/src/layers/batch_norm_layer.cpp

Lines changed: 59 additions & 25 deletions
@@ -21,6 +21,8 @@ namespace dnn
 class BatchNormLayerImpl : public BatchNormLayer
 {
 public:
+    Mat weights_, bias_;
+
     BatchNormLayerImpl(const LayerParams& params)
     {
         setParamsFrom(params);
@@ -29,6 +31,60 @@ class BatchNormLayerImpl : public BatchNormLayer
         hasWeights = params.get<bool>("has_weight", false);
         hasBias = params.get<bool>("has_bias", false);
         epsilon = params.get<float>("eps", 1E-5);
+
+        size_t n = blobs[0].total();
+        CV_Assert(blobs[1].total() == n &&
+                  blobs[0].isContinuous() && blobs[1].isContinuous() &&
+                  blobs[0].type() == CV_32F && blobs[1].type() == CV_32F);
+
+        float varMeanScale = 1.f;
+        if (!hasWeights && !hasBias) {
+            CV_Assert(blobs[2].type() == CV_32F);
+            varMeanScale = blobs[2].at<float>(0);
+            if (varMeanScale != 0)
+                varMeanScale = 1/varMeanScale;
+        }
+
+        const int weightsBlobIndex = 2;
+        const int biasBlobIndex = weightsBlobIndex + hasWeights;
+
+        if( hasWeights )
+        {
+            CV_Assert((size_t)weightsBlobIndex < blobs.size());
+            const Mat& w = blobs[weightsBlobIndex];
+            CV_Assert(w.isContinuous() && w.type() == CV_32F && w.total() == (size_t)n);
+        }
+
+        if( hasBias )
+        {
+            CV_Assert((size_t)biasBlobIndex < blobs.size());
+            const Mat& b = blobs[biasBlobIndex];
+            CV_Assert(b.isContinuous() && b.type() == CV_32F && b.total() == (size_t)n);
+        }
+
+        const float* meanData = blobs[0].ptr<float>();
+        const float* stdData = blobs[1].ptr<float>();
+        const float* weightsData = hasWeights ? blobs[weightsBlobIndex].ptr<float>() : 0;
+        const float* biasData = hasBias ? blobs[biasBlobIndex].ptr<float>() : 0;
+
+        weights_.create(1, (int)n, CV_32F);
+        bias_.create(1, (int)n, CV_32F);
+
+        float* dstWeightsData = weights_.ptr<float>();
+        float* dstBiasData = bias_.ptr<float>();
+
+        for (size_t i = 0; i < n; ++i)
+        {
+            float w = (hasWeights ? weightsData[i] : 1.0f) / sqrt(stdData[i] * varMeanScale + epsilon);
+            dstWeightsData[i] = w;
+            dstBiasData[i] = (hasBias ? biasData[i] : 0.0f) - w * meanData[i] * varMeanScale;
+        }
+    }
+
+    void getScaleShift(Mat& scale, Mat& shift) const
+    {
+        scale = weights_;
+        shift = bias_;
     }

     bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -51,45 +107,23 @@ class BatchNormLayerImpl : public BatchNormLayer
         CV_Assert(blobs.size() >= 2);
         CV_Assert(inputs.size() == 1);

-        float varMeanScale = 1.f;
-        if (!hasWeights && !hasBias) {
-            varMeanScale = *blobs[2].ptr<float>();
-            if (varMeanScale != 0)
-                varMeanScale = 1/varMeanScale;
-        }
-
-        Mat invStdMat;
-        cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat);
-
         Mat &inpBlob = *inputs[0];
-
-        int weightsBlobIndex = 2;
-        int biasBlobIndex = weightsBlobIndex + hasWeights;
-
         int rows = inpBlob.size[2];
         int cols = inpBlob.size[3];

         for (size_t ii = 0; ii < outputs.size(); ii++)
         {
             Mat &outBlob = outputs[ii];

-            if (hasWeights)
-                CV_Assert(inpBlob.size[1] == blobs[weightsBlobIndex].total());
-
-            if (hasBias)
-                CV_Assert(inpBlob.size[1] == blobs[biasBlobIndex].total());
-
             for(int num = 0; num < outBlob.size[0]; num++)
             {
                 for (int n = 0; n < outBlob.size[1]; n++)
                 {
-                    float mean = blobs[0].at<float>(n)*varMeanScale;
-                    double invstd = invStdMat.at<float>(n);
-                    float w = hasWeights ? blobs[weightsBlobIndex].at<float>(n) : 1;
-                    float b = hasBias ? blobs[biasBlobIndex].at<float>(n) : 0;
+                    float w = weights_.at<float>(n);
+                    float b = bias_.at<float>(n);
                     Mat inpBlobPlane(rows, cols, CV_32F, inpBlob.ptr<float>(num, n));
                     Mat outBlobPlane(rows, cols, CV_32F, outBlob.ptr<float>(num, n));
-                    inpBlobPlane.convertTo(outBlobPlane, CV_32F, w*invstd, b - mean*w*invstd);
+                    inpBlobPlane.convertTo(outBlobPlane, CV_32F, w, b);
                 }
             }
         }
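
The constructor added above folds the stored mean, variance and optional affine parameters into a per-channel scale w = gamma / sqrt(var * varMeanScale + eps) and shift b = beta - w * mean * varMeanScale, so forward() degenerates to one convertTo per plane. The small standalone C++ check below compares that algebra with the old per-element formula; the values are hypothetical and the snippet is not part of the commit.

#include <cmath>
#include <cstdio>

int main()
{
    // hypothetical per-channel statistics and affine parameters
    float mean = 0.3f, var = 2.0f, gamma = 1.5f, beta = -0.2f;
    float varMeanScale = 1.0f, eps = 1e-5f, x = 0.7f;

    // original formulation: normalize, then scale and shift
    float invStd = 1.0f / std::sqrt(var * varMeanScale + eps);
    float yOld = (x - mean * varMeanScale) * invStd * gamma + beta;

    // precomputed formulation used by the new code path
    float w = gamma * invStd;
    float b = beta - w * mean * varMeanScale;
    float yNew = w * x + b;

    std::printf("%g vs %g\n", yOld, yNew);  // the two agree up to rounding
    return 0;
}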
