YOLO-V4 support in C++ demo (#2689)

fzhar · eizamaliev · vladimir-dudnik · web-flow · commit 5ecfb293d2f3 · 2021-08-18T14:35:01.000+03:00
* Initial version

* Update demos/common/cpp/models/src/detection_model_yolo.cpp

Co-authored-by: Eduard Zamaliev <eduard.zamaliev@intel.com>

* Update demos/common/cpp/models/src/detection_model_yolo.cpp

Co-authored-by: Eduard Zamaliev <eduard.zamaliev@intel.com>

* Update demos/common/cpp/models/src/detection_model_yolo.cpp

Co-authored-by: Eduard Zamaliev <eduard.zamaliev@intel.com>

* New parameters are added to object_detection_demo
Bugfixes.

* linux fixes

* extend test cases, fix detection of yolo-v4-tiny

* fix test cases indentation

* Tiny detection is changed to be based on the number of outputs.
Additional review issue fixes.

* Docs update and alignment.

Co-authored-by: Eduard Zamaliev <eduard.zamaliev@intel.com>
Co-authored-by: Vladimir Dudnik <vladimir.dudnik@intel.com>
diff --git a/demos/common/cpp/models/include/models/detection_model_yolo.h b/demos/common/cpp/models/include/models/detection_model_yolo.h
@@ -38,9 +38,17 @@ class ModelYolo : public DetectionModel {
         int outputHeight = 0;
 
         Region(const std::shared_ptr<ngraph::op::RegionYolo>& regionYolo);
+        Region(int classes, int coords, const std::vector<float>& anchors, const std::vector<int64_t>& masks, int outputWidth, int outputHeight);
     };
 
 public:
+    enum YoloVersion {
+        YOLO_V1V2,
+        YOLO_V3,
+        YOLO_V4,
+        YOLO_V4_TINY
+    };
+
     /// Constructor.
     /// @param modelFileName name of model to load
     /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
@@ -53,8 +61,11 @@ class ModelYolo : public DetectionModel {
     /// during postprocessing (only one of them should stay). The default value is 0.5
     /// @param labels - array of labels for every class. If this array is empty or contains less elements
     /// than actual classes number, default "Label #N" will be shown for missing items.
+    /// @param anchors - vector of anchor coordinates. Used for YOLOv4 models only; if empty, built-in default anchors are used.
+    /// @param masks - vector of mask values. Used for YOLOv4 models only; if empty, built-in default masks are used.
     ModelYolo(const std::string& modelFileName, float confidenceThreshold, bool useAutoResize,
-        bool useAdvancedPostprocessing = true, float boxIOUThreshold = 0.5, const std::vector<std::string>& labels = std::vector<std::string>());
+        bool useAdvancedPostprocessing = true, float boxIOUThreshold = 0.5, const std::vector<std::string>& labels = std::vector<std::string>(),
+        const std::vector<float>& anchors = std::vector<float>(), const std::vector<int64_t>& masks = std::vector<int64_t>());
 
     std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
 
@@ -71,5 +82,7 @@ class ModelYolo : public DetectionModel {
     std::map<std::string, Region> regions;
     double boxIOUThreshold;
     bool useAdvancedPostprocessing;
-    bool isYoloV3;
+    YoloVersion yoloVersion;
+    const std::vector<float> presetAnchors;
+    const std::vector<int64_t> presetMasks;
 };
diff --git a/demos/common/cpp/models/src/detection_model_yolo.cpp b/demos/common/cpp/models/src/detection_model_yolo.cpp
@@ -19,74 +19,149 @@
 #include <utils/common.hpp>
 #include <ngraph/ngraph.hpp>
 
-using namespace InferenceEngine;
+std::vector<float> defaultAnchors[] = {
+    // YOLOv1v2
+    { 0.57273f, 0.677385f, 1.87446f, 2.06253f, 3.33843f, 5.47434f, 7.88282f, 3.52778f, 9.77052f, 9.16828f },
+    // YOLOv3
+    { 10.0f, 13.0f, 16.0f, 30.0f, 33.0f, 23.0f,
+      30.0f, 61.0f, 62.0f, 45.0f, 59.0f, 119.0f,
+      116.0f, 90.0f, 156.0f, 198.0f, 373.0f, 326.0f},
+    // YOLOv4
+    { 12.0f, 16.0f, 19.0f, 36.0f, 40.0f, 28.0f,
+      36.0f, 75.0f, 76.0f, 55.0f, 72.0f, 146.0f,
+      142.0f, 110.0f, 192.0f, 243.0f, 459.0f, 401.0f},
+    // YOLOv4_Tiny
+    { 10.0f, 14.0f, 23.0f, 27.0f, 37.0f, 58.0f,
+      81.0f, 82.0f, 135.0f, 169.0f, 344.0f, 319.0f}
+};
+
+const std::vector<int64_t> defaultMasks[] = {
+    // YOLOv1v2
+    {},
+    // YOLOv3
+    {},
+    // YOLOv4
+    {0, 1, 2, 3, 4, 5, 6, 7, 8 },
+    // YOLOv4_Tiny
+    {1, 2, 3, 3, 4, 5}
+};
+
+static inline float sigmoid(float x) {
+    return 1.f / (1.f + exp(-x));
+}
+static inline float linear(float x) {
+    return x;
+}
+
 
 ModelYolo::ModelYolo(const std::string& modelFileName, float confidenceThreshold, bool useAutoResize,
-    bool useAdvancedPostprocessing, float boxIOUThreshold, const std::vector<std::string>& labels) :
+    bool useAdvancedPostprocessing, float boxIOUThreshold, const std::vector<std::string>& labels,
+    const std::vector<float>& anchors, const std::vector<int64_t>& masks) :
     DetectionModel(modelFileName, confidenceThreshold, useAutoResize, labels),
     boxIOUThreshold(boxIOUThreshold),
     useAdvancedPostprocessing(useAdvancedPostprocessing),
-    isYoloV3(true){
+    yoloVersion(YOLO_V3),
+    presetAnchors(anchors),
+    presetMasks(masks) {
 }
 
 void ModelYolo::prepareInputsOutputs(InferenceEngine::CNNNetwork& cnnNetwork) {
     // --------------------------- Configure input & output -------------------------------------------------
     // --------------------------- Prepare input blobs ------------------------------------------------------
     slog::info << "Checking that the inputs are as the demo expects" << slog::endl;
-    InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+    InferenceEngine::InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
     if (inputInfo.size() != 1) {
         throw std::logic_error("This demo accepts networks that have only one input");
     }
 
-    InputInfo::Ptr& input = inputInfo.begin()->second;
+    InferenceEngine::InputInfo::Ptr& input = inputInfo.begin()->second;
     inputsNames.push_back(inputInfo.begin()->first);
-    input->setPrecision(Precision::U8);
+    input->setPrecision(InferenceEngine::Precision::U8);
     if (useAutoResize) {
-        input->getPreProcess().setResizeAlgorithm(ResizeAlgorithm::RESIZE_BILINEAR);
-        input->getInputData()->setLayout(Layout::NHWC);
+        input->getPreProcess().setResizeAlgorithm(InferenceEngine::ResizeAlgorithm::RESIZE_BILINEAR);
+        input->getInputData()->setLayout(InferenceEngine::Layout::NHWC);
     }
     else {
-        input->getInputData()->setLayout(Layout::NCHW);
+        input->getInputData()->setLayout(InferenceEngine::Layout::NCHW);
     }
 
     //--- Reading image input parameters
-    const TensorDesc& inputDesc = inputInfo.begin()->second->getTensorDesc();
+    const InferenceEngine::TensorDesc& inputDesc = inputInfo.begin()->second->getTensorDesc();
     netInputHeight = getTensorHeight(inputDesc);
     netInputWidth = getTensorWidth(inputDesc);
 
     // --------------------------- Prepare output blobs -----------------------------------------------------
     slog::info << "Checking that the outputs are as the demo expects" << slog::endl;
-    OutputsDataMap outputInfo(cnnNetwork.getOutputsInfo());
+    InferenceEngine::OutputsDataMap outputInfo(cnnNetwork.getOutputsInfo());
     for (auto& output : outputInfo) {
-        output.second->setPrecision(Precision::FP32);
+        output.second->setPrecision(InferenceEngine::Precision::FP32);
         if (output.second->getDims().size() == 4) {
-            output.second->setLayout(Layout::NCHW);
+            output.second->setLayout(InferenceEngine::Layout::NCHW);
         }
         outputsNames.push_back(output.first);
     }
 
+    yoloVersion = YOLO_V3;
+    bool isRegionFound = false;
     if (auto ngraphFunction = (cnnNetwork).getFunction()) {
         for (const auto op : ngraphFunction->get_ops()) {
             auto outputLayer = outputInfo.find(op->get_friendly_name());
             if (outputLayer != outputInfo.end()) {
                 auto regionYolo = std::dynamic_pointer_cast<ngraph::op::RegionYolo>(op);
 
-                if (!regionYolo) {
-                    throw std::runtime_error("Invalid output type: " +
-                        std::string(op->get_type_info().name) + ". RegionYolo expected");
-                }
+                if (regionYolo) {
+                    isRegionFound = true;
 
-                if(!regionYolo->get_mask().size()) {
-                    isYoloV3 = false;
-                }
+                    if (!regionYolo->get_mask().size()) {
+                        yoloVersion = YOLO_V1V2;
+                    }
 
-                regions.emplace(outputLayer->first, Region(regionYolo));
+                    regions.emplace(outputLayer->first, Region(regionYolo));
+                }
             }
         }
     }
     else {
         throw std::runtime_error("Can't get ngraph::Function. Make sure the provided model is in IR version 10 or greater.");
     }
+
+    if(!isRegionFound)
+    {
+        yoloVersion = outputsNames.size() == 2 ? YOLO_V4_TINY : YOLO_V4;
+
+        int num = 3;
+        int i = 0;
+
+        auto chosenMasks = presetMasks.size() ? presetMasks : defaultMasks[yoloVersion];
+        if(chosenMasks.size() != num * outputInfo.size()) {
+            throw std::runtime_error(std::string("Invalid size of masks array, got ") + std::to_string(presetMasks.size()) +
+                ", should be " + std::to_string(num * outputInfo.size()));
+        }
+
+        std::sort(outputsNames.begin(), outputsNames.end(),
+            [&outputInfo](const std::string& x, const std::string&  y) {return outputInfo[x]->getDims()[2] > outputInfo[y]->getDims()[2];});
+
+        for (const auto& name : outputsNames) {
+            auto& output = outputInfo[name];
+            auto shape = output->getDims();
+            auto classes = shape[1] / num - 5;
+            if (shape[1] % num != 0) {
+                throw std::runtime_error(std::string("The output blob ") + name + " has wrong 2nd dimension");
+            }
+            regions.emplace(name, Region(classes, 4,
+                presetAnchors.size() ? presetAnchors : defaultAnchors[yoloVersion],
+                std::vector<int64_t>(chosenMasks.begin() + i*num, chosenMasks.begin() + (i+1)*num),
+                shape[3], shape[2]));
+            i++;
+        }
+    }
+    else {
+        // Currently externally set anchors and masks are supported only for YoloV4
+        if(presetAnchors.size() || presetMasks.size()){
+            slog::warn << "Preset anchors and mask can be set for YoloV4 model only. "
+                "This model is not YoloV4, so these options will be ignored." << slog::endl;
+        }
+    }
 }
 
 std::unique_ptr<ResultBase> ModelYolo::postprocess(InferenceResult & infResult) {
@@ -151,24 +226,27 @@ void ModelYolo::parseYOLOOutput(const std::string& output_name,
     int sideH = 0;
     unsigned long scaleH;
     unsigned long scaleW;
-    if (isYoloV3) {
-        auto& dims = blob->getTensorDesc().getDims();
-        const int out_blob_h = static_cast<int>(dims[2]);
-        const int out_blob_w = static_cast<int>(dims[3]);
-        sideH = out_blob_h;
-        sideW = out_blob_w;
-        scaleW = resized_im_w;
-        scaleH = resized_im_h;
-    }
-    else {
+    switch(yoloVersion) {
+    case YOLO_V1V2:
         sideH = region.outputHeight;
         sideW = region.outputWidth;
         scaleW = region.outputWidth;
         scaleH = region.outputHeight;
+        break;
+    case YOLO_V3:
+    case YOLO_V4:
+    case YOLO_V4_TINY:
+        sideH = static_cast<int>(blob->getTensorDesc().getDims()[2]);
+        sideW = static_cast<int>(blob->getTensorDesc().getDims()[3]);
+        scaleW = resized_im_w;
+        scaleH = resized_im_h;
+        break;
     }
 
     auto entriesNum = sideW * sideH;
-    const float* output_blob = blob->buffer().as<PrecisionTrait<Precision::FP32>::value_type*>();
+    const float* output_blob = blob->buffer().as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::FP32>::value_type*>();
+
+    auto postprocessRawData = (yoloVersion == YOLO_V4 || yoloVersion == YOLO_V4_TINY) ? sigmoid : linear;
 
     // --------------------------- Parsing YOLO Region output -------------------------------------
     for (int i = 0; i < entriesNum; ++i) {
@@ -178,13 +256,13 @@ void ModelYolo::parseYOLOOutput(const std::string& output_name,
             //--- Getting region data from blob
             int obj_index = calculateEntryIndex(entriesNum, region.coords, region.classes, n * entriesNum + i, region.coords);
             int box_index = calculateEntryIndex(entriesNum, region.coords, region.classes, n * entriesNum + i, 0);
-            float scale = output_blob[obj_index];
+            float scale = postprocessRawData(output_blob[obj_index]);
 
             //--- Preliminary check for confidence threshold conformance
             if (scale >= confidenceThreshold){
                 //--- Calculating scaled region's coordinates
-                double x = (col + output_blob[box_index + 0 * entriesNum]) / sideW * original_im_w;
-                double y = (row + output_blob[box_index + 1 * entriesNum]) / sideH * original_im_h;
+                double x = (col + postprocessRawData(output_blob[box_index + 0 * entriesNum])) / sideW * original_im_w;
+                double y = (row + postprocessRawData(output_blob[box_index + 1 * entriesNum])) / sideH * original_im_h;
                 double height = std::exp(output_blob[box_index + 3 * entriesNum]) * region.anchors[2 * n + 1] * original_im_h / scaleH;
                 double width = std::exp(output_blob[box_index + 2 * entriesNum]) * region.anchors[2 * n] * original_im_w / scaleW;
 
@@ -196,7 +274,7 @@ void ModelYolo::parseYOLOOutput(const std::string& output_name,
 
                 for (int j = 0; j < region.classes; ++j) {
                     int class_index = calculateEntryIndex(entriesNum, region.coords, region.classes, n * entriesNum + i, region.coords + 1 + j);
-                    float prob = scale * output_blob[class_index];
+                    float prob = scale * postprocessRawData(output_blob[class_index]);
 
                     //--- Checking confidence threshold conformance and adding region to the list
                     if (prob >= confidenceThreshold) {
@@ -250,9 +328,31 @@ ModelYolo::Region::Region(const std::shared_ptr<ngraph::op::RegionYolo>& regionY
         num = regionYolo->get_num_regions();
         anchors = regionYolo->get_anchors();
         if (anchors.empty()) {
-            anchors.insert(anchors.end(),
-                { 0.57273f, 0.677385f, 1.87446f, 2.06253f, 3.33843f, 5.47434f, 7.88282f, 3.52778f, 9.77052f, 9.16828f });
+            anchors = defaultAnchors[YOLO_V1V2];
             num = 5;
         }
     }
 }
+
+ModelYolo::Region::Region(int classes, int coords, const std::vector<float>& anchors, const std::vector<int64_t>& masks, int outputWidth, int outputHeight) :
+    classes(classes), coords(coords),
+    outputWidth(outputWidth), outputHeight(outputHeight) {
+    num = masks.size();
+
+    if (anchors.size() == 0 || anchors.size() % 2 != 0) {
+        throw std::runtime_error("Explicitly initialized region should have non-empty even-sized regions vector");
+    }
+
+    if (num) {
+        this->anchors.resize(num * 2);
+
+        for (int i = 0; i < num; ++i) {
+            this->anchors[i * 2] = anchors[masks[i] * 2];
+            this->anchors[i * 2 + 1] = anchors[masks[i] * 2 + 1];
+        }
+    }
+    else {
+        this->anchors = anchors;
+        num = anchors.size() / 2;
+    }
+}
diff --git a/demos/object_detection_demo/cpp/README.md b/demos/object_detection_demo/cpp/README.md
@@ -132,6 +132,8 @@ python3 <omz_dir>/tools/downloader/converter.py --list models.lst
   - yolo-v2-tiny-ava-sparse-60-0001
   - yolo-v2-tiny-tf
   - yolo-v2-tiny-vehicle-detection-0001
+  - yolo-v4-tf
+  - yolo-v4-tiny-tf
 
 > **NOTE**: Refer to the tables [Intel's Pre-Trained Models Device Support](../../../models/intel/device_support.md) and [Public Pre-Trained Models Device Support](../../../models/public/device_support.md) for the details on models inference support at different devices.
 
@@ -171,6 +173,8 @@ Options:
     -output_resolution        Optional. Specify the maximum output window resolution in (width x height) format. Example: 1280x720. Input frame size used by default.
     -u                        Optional. List of monitors to show initially.
     -yolo_af                  Optional. Use advanced postprocessing/filtering algorithm for YOLO.
+    -anchors                  Optional. A comma separated list of anchors. By default used default anchors for model. Only for YOLOV4 architecture type.
+    -masks                    Optional. A comma separated list of mask for anchors. By default used default masks for model. Only for YOLOV4 architecture type.
 ```
 
 Running the application with the empty list of options yields the usage message given above and an error message.
diff --git a/demos/object_detection_demo/cpp/main.cpp b/demos/object_detection_demo/cpp/main.cpp
@@ -68,6 +68,10 @@ static const char iou_thresh_output_message[] = "Optional. Filtering intersectio
 static const char yolo_af_message[] = "Optional. Use advanced postprocessing/filtering algorithm for YOLO.";
 static const char output_resolution_message[] = "Optional. Specify the maximum output window resolution "
     "in (width x height) format. Example: 1280x720. Input frame size used by default.";
+static const char anchors_message[] = "Optional. A comma separated list of anchors. "
+    "By default used default anchors for model. Only for YOLOV4 architecture type.";
+static const char masks_message[] = "Optional. A comma separated list of mask for anchors. "
+    "By default used default masks for model. Only for YOLOV4 architecture type.";
 
 DEFINE_bool(h, false, help_message);
 DEFINE_string(at, "", at_message);
@@ -88,6 +92,8 @@ DEFINE_bool(no_show, false, no_show_message);
 DEFINE_string(u, "", utilization_monitors_message);
 DEFINE_bool(yolo_af, true, yolo_af_message);
 DEFINE_string(output_resolution, "", output_resolution_message);
+DEFINE_string(anchors, "", anchors_message);
+DEFINE_string(masks, "", masks_message);
 
 /**
 * \brief This function shows a help message
@@ -121,6 +127,8 @@ static void showUsage() {
     std::cout << "    -output_resolution        " << output_resolution_message << std::endl;
     std::cout << "    -u                        " << utilization_monitors_message << std::endl;
     std::cout << "    -yolo_af                  " << yolo_af_message << std::endl;
+    std::cout << "    -anchors                  "      << anchors_message << std::endl;
+    std::cout << "    -masks                    "      << masks_message << std::endl;
 }
 
 class ColorPalette {
@@ -280,6 +288,28 @@ int main(int argc, char *argv[]) {
             return 0;
         }
 
+        const auto& strAnchors = split(FLAGS_anchors, ',');
+        const auto& strMasks = split(FLAGS_masks, ',');
+
+        std::vector<float> anchors;
+        std::vector<int64_t> masks;
+        try {
+            for (auto& str : strAnchors) {
+                anchors.push_back(std::stof(str));
+            }
+        } catch(...) {
+            throw std::runtime_error("Invalid anchors list is provided.");
+        }
+
+        try {
+            for (auto& str : strMasks) {
+                masks.push_back(std::stoll(str));
+            }
+        }
+        catch (...) {
+            throw std::runtime_error("Invalid masks list is provided.");
+        }
+
         //------------------------------- Preparing Input ------------------------------------------------------
         slog::info << "Reading input" << slog::endl;
         auto cap = openImagesCapture(FLAGS_i, FLAGS_loop);
@@ -308,7 +338,7 @@ int main(int argc, char *argv[]) {
             model.reset(new ModelSSD(FLAGS_m, (float)FLAGS_t, FLAGS_auto_resize, labels));
         }
         else if (FLAGS_at == "yolo") {
-            model.reset(new ModelYolo(FLAGS_m, (float)FLAGS_t, FLAGS_auto_resize, FLAGS_yolo_af, (float)FLAGS_iou_t, labels));
+            model.reset(new ModelYolo(FLAGS_m, (float)FLAGS_t, FLAGS_auto_resize, FLAGS_yolo_af, (float)FLAGS_iou_t, labels, anchors, masks));
         }
         else {
             slog::err << "No model type or invalid model type (-at) provided: " + FLAGS_at << slog::endl;
diff --git a/demos/object_detection_demo/cpp/models.lst b/demos/object_detection_demo/cpp/models.lst
diff --git a/demos/tests/cases.py b/demos/tests/cases.py