Skip to content

Commit 5ecfb29

Browse files
fzhareizamalievvladimir-dudnik
authored
YOLO-V4 support in C++ demo (#2689)
* Initial version * Update demos/common/cpp/models/src/detection_model_yolo.cpp Co-authored-by: Eduard Zamaliev <[email protected]> * Update demos/common/cpp/models/src/detection_model_yolo.cpp Co-authored-by: Eduard Zamaliev <[email protected]> * Update demos/common/cpp/models/src/detection_model_yolo.cpp Co-authored-by: Eduard Zamaliev <[email protected]> * New parameters are added to object_detection_demo Bugfixes. * linux fixes * extend test cases, fix detection of yolo-v4-tiny * fix test cases indentation * Tiny detection is changed to be based on outputs number. Additional review issues fixes. * Docs update and alignment. Co-authored-by: Eduard Zamaliev <[email protected]> Co-authored-by: Vladimir Dudnik <[email protected]>
1 parent e761bee commit 5ecfb29

File tree

6 files changed

+207
-46
lines changed

6 files changed

+207
-46
lines changed

demos/common/cpp/models/include/models/detection_model_yolo.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,17 @@ class ModelYolo : public DetectionModel {
3838
int outputHeight = 0;
3939

4040
Region(const std::shared_ptr<ngraph::op::RegionYolo>& regionYolo);
41+
Region(int classes, int coords, const std::vector<float>& anchors, const std::vector<int64_t>& masks, int outputWidth, int outputHeight);
4142
};
4243

4344
public:
45+
/// YOLO topology family of the loaded model.
/// The enumerators are also used as indices into the built-in default anchor and mask
/// tables of detection_model_yolo.cpp, so their order has to stay in sync with those tables.
enum YoloVersion {
46+
YOLO_V1V2, ///< Selected when a RegionYolo output layer carries an empty mask
47+
YOLO_V3, ///< Initial value; kept when RegionYolo output layers carry masks
48+
YOLO_V4, ///< Selected when no RegionYolo output layer is found and the outputs number is not 2
49+
YOLO_V4_TINY ///< Selected when no RegionYolo output layer is found and there are exactly 2 outputs
50+
};
51+
4452
/// Constructor.
4553
/// @param modelFileName name of model to load
4654
/// @param confidenceThreshold - threshold to eliminate low-confidence detections.
@@ -53,8 +61,11 @@ class ModelYolo : public DetectionModel {
5361
/// during postprocessing (only one of them should stay). The default value is 0.5
5462
/// @param labels - array of labels for every class. If this array is empty or contains less elements
5563
/// than actual classes number, default "Label #N" will be shown for missing items.
64+
/// @param anchors - flat vector of anchor (width, height) pairs. Used only for YOLOv4 models; if empty, built-in default anchors are used. For other versions it is ignored with a warning.
65+
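/// @param masks - vector of mask values (indices of anchor pairs, three per output layer). Used only for YOLOv4 models; if empty, built-in default masks are used. For other versions it is ignored with a warning.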
5666
ModelYolo(const std::string& modelFileName, float confidenceThreshold, bool useAutoResize,
57-
bool useAdvancedPostprocessing = true, float boxIOUThreshold = 0.5, const std::vector<std::string>& labels = std::vector<std::string>());
67+
bool useAdvancedPostprocessing = true, float boxIOUThreshold = 0.5, const std::vector<std::string>& labels = std::vector<std::string>(),
68+
const std::vector<float>& anchors = std::vector<float>(), const std::vector<int64_t>& masks = std::vector<int64_t>());
5869

5970
std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
6071

@@ -71,5 +82,7 @@ class ModelYolo : public DetectionModel {
7182
std::map<std::string, Region> regions;
7283
double boxIOUThreshold;
7384
bool useAdvancedPostprocessing;
74-
bool isYoloV3;
85+
YoloVersion yoloVersion;
86+
const std::vector<float> presetAnchors;
87+
const std::vector<int64_t> presetMasks;
7588
};

demos/common/cpp/models/src/detection_model_yolo.cpp

Lines changed: 138 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -19,74 +19,149 @@
1919
#include <utils/common.hpp>
2020
#include <ngraph/ngraph.hpp>
2121

22-
using namespace InferenceEngine;
22+
// Built-in anchor sets, one entry per YOLO family.
// The array is indexed with ModelYolo::YoloVersion values (defaultAnchors[yoloVersion],
// defaultAnchors[YOLO_V1V2]), so the entry order must stay: v1/v2, v3, v4, v4-tiny.
// Every entry is a flat list of (width, height) pairs.
// Declared const, like defaultMasks, so the table cannot be modified at run time and
// gets internal linkage instead of being a writable global visible to other translation units.
const std::vector<float> defaultAnchors[] = {
    // YOLOv1v2
    { 0.57273f, 0.677385f, 1.87446f, 2.06253f, 3.33843f, 5.47434f, 7.88282f, 3.52778f, 9.77052f, 9.16828f },
    // YOLOv3
    { 10.0f, 13.0f, 16.0f, 30.0f, 33.0f, 23.0f,
      30.0f, 61.0f, 62.0f, 45.0f, 59.0f, 119.0f,
      116.0f, 90.0f, 156.0f, 198.0f, 373.0f, 326.0f},
    // YOLOv4
    { 12.0f, 16.0f, 19.0f, 36.0f, 40.0f, 28.0f,
      36.0f, 75.0f, 76.0f, 55.0f, 72.0f, 146.0f,
      142.0f, 110.0f, 192.0f, 243.0f, 459.0f, 401.0f},
    // YOLOv4_Tiny
    { 10.0f, 14.0f, 23.0f, 27.0f, 37.0f, 58.0f,
      81.0f, 82.0f, 135.0f, 169.0f, 344.0f, 319.0f}
};

// Built-in mask sets, indexed with ModelYolo::YoloVersion values exactly like defaultAnchors.
// A mask value is the index of an anchor pair inside the anchors vector; the values are
// consumed in consecutive groups of three, one group per output layer.
// The v1/v2 and v3 entries are empty: those models take their masks from RegionYolo layers.
const std::vector<int64_t> defaultMasks[] = {
    // YOLOv1v2
    {},
    // YOLOv3
    {},
    // YOLOv4
    {0, 1, 2, 3, 4, 5, 6, 7, 8 },
    // YOLOv4_Tiny
    // NOTE(review): index 3 is listed twice and index 0 is unused - presumably this mirrors
    // the upstream yolov4-tiny configuration; confirm before "fixing" it.
    {1, 2, 3, 3, 4, 5}
};
48+
49+
// Logistic function 1 / (1 + e^-x): maps a raw value into the [0, 1] range.
// std::exp is called explicitly so that the float overload is picked; an unqualified
// exp() may resolve to the C library's double version and silently promote the
// whole expression to double.
static inline float sigmoid(float x) {
    return 1.f / (1.f + std::exp(-x));
}

// Identity transform. It has the same signature as sigmoid() so that either function
// can be selected as the raw-data post-processing step.
static inline float linear(float x) {
    return x;
}
55+
2356

2457
// Constructs the YOLO model wrapper.
// The model file name, confidence threshold, auto-resize flag and labels are forwarded to
// DetectionModel; the IOU threshold, the post-processing mode and the caller-supplied
// anchors/masks are stored in members.
// yoloVersion only starts as YOLO_V3 here: prepareInputsOutputs() assigns the actual value
// after inspecting the network outputs.
ModelYolo::ModelYolo(const std::string& modelFileName, float confidenceThreshold, bool useAutoResize,
25-
bool useAdvancedPostprocessing, float boxIOUThreshold, const std::vector<std::string>& labels) :
58+
bool useAdvancedPostprocessing, float boxIOUThreshold, const std::vector<std::string>& labels,
59+
const std::vector<float>& anchors, const std::vector<int64_t>& masks) :
2660
DetectionModel(modelFileName, confidenceThreshold, useAutoResize, labels),
2761
boxIOUThreshold(boxIOUThreshold),
2862
useAdvancedPostprocessing(useAdvancedPostprocessing),
29-
isYoloV3(true){
63+
yoloVersion(YOLO_V3),
64+
presetAnchors(anchors),
65+
presetMasks(masks) {
3066
}
3167

3268
// Configures the network input/outputs and fills the per-output `regions` table.
//
// Input:   exactly one input is accepted. It is switched to U8 precision and to NHWC layout
//          with bilinear resize when useAutoResize is set, or to NCHW layout otherwise.
// Outputs: every output is switched to FP32 (NCHW layout for 4D outputs) and its name is
//          appended to outputsNames.
// Version: output layers that are RegionYolo operations produce regions directly
//          (YOLO_V3, or YOLO_V1V2 when a layer has an empty mask). If no RegionYolo output
//          exists, the model is treated as YOLO_V4 (YOLO_V4_TINY for exactly two outputs)
//          and regions are built from the preset or built-in anchors and masks.
//
// Throws std::logic_error when the network does not have exactly one input.
// Throws std::runtime_error when the ngraph function is unavailable, when the masks array
// size does not match the outputs number, or when an output blob has an unexpected shape.
void ModelYolo::prepareInputsOutputs(InferenceEngine::CNNNetwork& cnnNetwork) {
    // --------------------------- Configure input & output -------------------------------------------------
    // --------------------------- Prepare input blobs ------------------------------------------------------
    slog::info << "Checking that the inputs are as the demo expects" << slog::endl;
    InferenceEngine::InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
    if (inputInfo.size() != 1) {
        throw std::logic_error("This demo accepts networks that have only one input");
    }

    InferenceEngine::InputInfo::Ptr& input = inputInfo.begin()->second;
    inputsNames.push_back(inputInfo.begin()->first);
    input->setPrecision(InferenceEngine::Precision::U8);
    if (useAutoResize) {
        input->getPreProcess().setResizeAlgorithm(InferenceEngine::ResizeAlgorithm::RESIZE_BILINEAR);
        input->getInputData()->setLayout(InferenceEngine::Layout::NHWC);
    }
    else {
        input->getInputData()->setLayout(InferenceEngine::Layout::NCHW);
    }

    //--- Reading image input parameters
    const InferenceEngine::TensorDesc& inputDesc = inputInfo.begin()->second->getTensorDesc();
    netInputHeight = getTensorHeight(inputDesc);
    netInputWidth = getTensorWidth(inputDesc);

    // --------------------------- Prepare output blobs -----------------------------------------------------
    slog::info << "Checking that the outputs are as the demo expects" << slog::endl;
    InferenceEngine::OutputsDataMap outputInfo(cnnNetwork.getOutputsInfo());
    for (auto& output : outputInfo) {
        output.second->setPrecision(InferenceEngine::Precision::FP32);
        if (output.second->getDims().size() == 4) {
            output.second->setLayout(InferenceEngine::Layout::NCHW);
        }
        outputsNames.push_back(output.first);
    }

    // --------------------------- Detect YOLO version from RegionYolo output layers ------------------------
    yoloVersion = YOLO_V3;
    bool isRegionFound = false;
    if (auto ngraphFunction = (cnnNetwork).getFunction()) {
        // Bind by reference: copying every shared_ptr would only add ref-count traffic.
        for (const auto& op : ngraphFunction->get_ops()) {
            auto outputLayer = outputInfo.find(op->get_friendly_name());
            if (outputLayer != outputInfo.end()) {
                auto regionYolo = std::dynamic_pointer_cast<ngraph::op::RegionYolo>(op);

                if (regionYolo) {
                    isRegionFound = true;

                    if (!regionYolo->get_mask().size()) {
                        yoloVersion = YOLO_V1V2;
                    }

                    regions.emplace(outputLayer->first, Region(regionYolo));
                }
            }
        }
    }
    else {
        throw std::runtime_error("Can't get ngraph::Function. Make sure the provided model is in IR version 10 or greater.");
    }

    if (!isRegionFound) {
        // No RegionYolo layers: the outputs are raw, regions have to be described explicitly.
        yoloVersion = outputsNames.size() == 2 ? YOLO_V4_TINY : YOLO_V4;

        const int num = 3;  // masks (anchor pairs) consumed per output layer

        // References, not copies: both alternatives outlive this function call.
        const auto& chosenMasks = presetMasks.size() ? presetMasks : defaultMasks[yoloVersion];
        const auto& chosenAnchors = presetAnchors.size() ? presetAnchors : defaultAnchors[yoloVersion];

        if (chosenMasks.size() != num * outputInfo.size()) {
            // Report the size of the list that was actually checked (it may be the built-in one).
            throw std::runtime_error(std::string("Invalid size of masks array, got ") + std::to_string(chosenMasks.size()) +
                ", should be " + std::to_string(num * outputInfo.size()));
        }

        // Dimensions 1..3 are indexed below, so make sure they exist.
        for (const auto& name : outputsNames) {
            if (outputInfo[name]->getDims().size() != 4) {
                throw std::runtime_error(std::string("The output blob ") + name + " is expected to have 4 dimensions");
            }
        }

        // Outputs with the larger dims[2] go first, so they receive the first mask groups.
        std::sort(outputsNames.begin(), outputsNames.end(),
            [&outputInfo](const std::string& x, const std::string& y) {return outputInfo[x]->getDims()[2] > outputInfo[y]->getDims()[2];});

        int i = 0;
        for (const auto& name : outputsNames) {
            auto& output = outputInfo[name];
            auto shape = output->getDims();
            // Each of the `num` boxes needs 4 coordinates + 1 objectness value before the class
            // scores; checking first also keeps the unsigned subtraction below from wrapping.
            if (shape[1] % num != 0 || shape[1] / num < 5) {
                throw std::runtime_error(std::string("The output blob ") + name + " has wrong 2nd dimension");
            }
            const int classes = static_cast<int>(shape[1] / num - 5);
            regions.emplace(name, Region(classes, 4,
                chosenAnchors,
                std::vector<int64_t>(chosenMasks.begin() + i * num, chosenMasks.begin() + (i + 1) * num),
                static_cast<int>(shape[3]), static_cast<int>(shape[2])));
            i++;
        }
    }
    else {
        // Currently externally set anchors and masks are supported only for YoloV4
        if (presetAnchors.size() || presetMasks.size()) {
            slog::warn << "Preset anchors and mask can be set for YoloV4 model only. "
                "This model is not YoloV4, so these options will be ignored." << slog::endl;
        }
    }
}
91166

92167
std::unique_ptr<ResultBase> ModelYolo::postprocess(InferenceResult & infResult) {
@@ -151,24 +226,27 @@ void ModelYolo::parseYOLOOutput(const std::string& output_name,
151226
int sideH = 0;
152227
unsigned long scaleH;
153228
unsigned long scaleW;
154-
if (isYoloV3) {
155-
auto& dims = blob->getTensorDesc().getDims();
156-
const int out_blob_h = static_cast<int>(dims[2]);
157-
const int out_blob_w = static_cast<int>(dims[3]);
158-
sideH = out_blob_h;
159-
sideW = out_blob_w;
160-
scaleW = resized_im_w;
161-
scaleH = resized_im_h;
162-
}
163-
else {
229+
switch(yoloVersion) {
230+
case YOLO_V1V2:
164231
sideH = region.outputHeight;
165232
sideW = region.outputWidth;
166233
scaleW = region.outputWidth;
167234
scaleH = region.outputHeight;
235+
break;
236+
case YOLO_V3:
237+
case YOLO_V4:
238+
case YOLO_V4_TINY:
239+
sideH = static_cast<int>(blob->getTensorDesc().getDims()[2]);
240+
sideW = static_cast<int>(blob->getTensorDesc().getDims()[3]);
241+
scaleW = resized_im_w;
242+
scaleH = resized_im_h;
243+
break;
168244
}
169245

170246
auto entriesNum = sideW * sideH;
171-
const float* output_blob = blob->buffer().as<PrecisionTrait<Precision::FP32>::value_type*>();
247+
const float* output_blob = blob->buffer().as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::FP32>::value_type*>();
248+
249+
auto postprocessRawData = (yoloVersion == YOLO_V4 || yoloVersion == YOLO_V4_TINY) ? sigmoid : linear;
172250

173251
// --------------------------- Parsing YOLO Region output -------------------------------------
174252
for (int i = 0; i < entriesNum; ++i) {
@@ -178,13 +256,13 @@ void ModelYolo::parseYOLOOutput(const std::string& output_name,
178256
//--- Getting region data from blob
179257
int obj_index = calculateEntryIndex(entriesNum, region.coords, region.classes, n * entriesNum + i, region.coords);
180258
int box_index = calculateEntryIndex(entriesNum, region.coords, region.classes, n * entriesNum + i, 0);
181-
float scale = output_blob[obj_index];
259+
float scale = postprocessRawData(output_blob[obj_index]);
182260

183261
//--- Preliminary check for confidence threshold conformance
184262
if (scale >= confidenceThreshold){
185263
//--- Calculating scaled region's coordinates
186-
double x = (col + output_blob[box_index + 0 * entriesNum]) / sideW * original_im_w;
187-
double y = (row + output_blob[box_index + 1 * entriesNum]) / sideH * original_im_h;
264+
double x = (col + postprocessRawData(output_blob[box_index + 0 * entriesNum])) / sideW * original_im_w;
265+
double y = (row + postprocessRawData(output_blob[box_index + 1 * entriesNum])) / sideH * original_im_h;
188266
double height = std::exp(output_blob[box_index + 3 * entriesNum]) * region.anchors[2 * n + 1] * original_im_h / scaleH;
189267
double width = std::exp(output_blob[box_index + 2 * entriesNum]) * region.anchors[2 * n] * original_im_w / scaleW;
190268

@@ -196,7 +274,7 @@ void ModelYolo::parseYOLOOutput(const std::string& output_name,
196274

197275
for (int j = 0; j < region.classes; ++j) {
198276
int class_index = calculateEntryIndex(entriesNum, region.coords, region.classes, n * entriesNum + i, region.coords + 1 + j);
199-
float prob = scale * output_blob[class_index];
277+
float prob = scale * postprocessRawData(output_blob[class_index]);
200278

201279
//--- Checking confidence threshold conformance and adding region to the list
202280
if (prob >= confidenceThreshold) {
@@ -250,9 +328,31 @@ ModelYolo::Region::Region(const std::shared_ptr<ngraph::op::RegionYolo>& regionY
250328
num = regionYolo->get_num_regions();
251329
anchors = regionYolo->get_anchors();
252330
if (anchors.empty()) {
253-
anchors.insert(anchors.end(),
254-
{ 0.57273f, 0.677385f, 1.87446f, 2.06253f, 3.33843f, 5.47434f, 7.88282f, 3.52778f, 9.77052f, 9.16828f });
331+
anchors = defaultAnchors[YOLO_V1V2];
255332
num = 5;
256333
}
257334
}
258335
}
336+
337+
// Builds a region description from explicitly given parameters instead of a RegionYolo layer.
//
// classes, coords            - stored as is
// anchors                    - flat list of anchor pairs; must be non-empty and even-sized
// masks                      - indices of the anchor pairs this region uses; when empty,
//                              all the anchors are taken
// outputWidth, outputHeight  - stored as is
//
// Throws std::runtime_error when the anchors vector is empty or odd-sized, or when a mask
// value does not refer to an existing anchor pair.
ModelYolo::Region::Region(int classes, int coords, const std::vector<float>& anchors, const std::vector<int64_t>& masks, int outputWidth, int outputHeight) :
    classes(classes), coords(coords),
    outputWidth(outputWidth), outputHeight(outputHeight) {
    num = static_cast<int>(masks.size());

    if (anchors.size() == 0 || anchors.size() % 2 != 0) {
        throw std::runtime_error("Explicitly initialized region should have non-empty even-sized anchors vector");
    }

    if (num) {
        const int64_t anchorPairs = static_cast<int64_t>(anchors.size() / 2);
        this->anchors.resize(num * 2);

        for (int i = 0; i < num; ++i) {
            // Masks may come straight from the command line: reject indices that would read
            // outside of the anchors vector instead of invoking undefined behavior.
            if (masks[i] < 0 || masks[i] >= anchorPairs) {
                throw std::runtime_error("Mask value " + std::to_string(masks[i]) +
                    " is out of range, only " + std::to_string(anchorPairs) + " anchor pairs are available");
            }
            this->anchors[i * 2] = anchors[masks[i] * 2];
            this->anchors[i * 2 + 1] = anchors[masks[i] * 2 + 1];
        }
    }
    else {
        // No masks: use every anchor pair.
        this->anchors = anchors;
        num = static_cast<int>(anchors.size() / 2);
    }
}

demos/object_detection_demo/cpp/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ python3 <omz_dir>/tools/downloader/converter.py --list models.lst
132132
- yolo-v2-tiny-ava-sparse-60-0001
133133
- yolo-v2-tiny-tf
134134
- yolo-v2-tiny-vehicle-detection-0001
135+
- yolo-v4-tf
136+
- yolo-v4-tiny-tf
135137

136138
> **NOTE**: Refer to the tables [Intel's Pre-Trained Models Device Support](../../../models/intel/device_support.md) and [Public Pre-Trained Models Device Support](../../../models/public/device_support.md) for the details on models inference support at different devices.
137139
@@ -171,6 +173,8 @@ Options:
171173
-output_resolution Optional. Specify the maximum output window resolution in (width x height) format. Example: 1280x720. Input frame size used by default.
172174
-u Optional. List of monitors to show initially.
173175
-yolo_af Optional. Use advanced postprocessing/filtering algorithm for YOLO.
176+
-anchors Optional. A comma separated list of anchors. By default used default anchors for model. Only for YOLOV4 architecture type.
177+
-masks Optional. A comma separated list of mask for anchors. By default used default masks for model. Only for YOLOV4 architecture type.
174178
```
175179

176180
Running the application with the empty list of options yields the usage message given above and an error message.

demos/object_detection_demo/cpp/main.cpp

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ static const char iou_thresh_output_message[] = "Optional. Filtering intersectio
6868
static const char yolo_af_message[] = "Optional. Use advanced postprocessing/filtering algorithm for YOLO.";
6969
static const char output_resolution_message[] = "Optional. Specify the maximum output window resolution "
7070
"in (width x height) format. Example: 1280x720. Input frame size used by default.";
71+
static const char anchors_message[] = "Optional. A comma separated list of anchors. "
72+
"By default used default anchors for model. Only for YOLOV4 architecture type.";
73+
static const char masks_message[] = "Optional. A comma separated list of mask for anchors. "
74+
"By default used default masks for model. Only for YOLOV4 architecture type.";
7175

7276
DEFINE_bool(h, false, help_message);
7377
DEFINE_string(at, "", at_message);
@@ -88,6 +92,8 @@ DEFINE_bool(no_show, false, no_show_message);
8892
DEFINE_string(u, "", utilization_monitors_message);
8993
DEFINE_bool(yolo_af, true, yolo_af_message);
9094
DEFINE_string(output_resolution, "", output_resolution_message);
95+
DEFINE_string(anchors, "", anchors_message);
96+
DEFINE_string(masks, "", masks_message);
9197

9298
/**
9399
* \brief This function shows a help message
@@ -121,6 +127,8 @@ static void showUsage() {
121127
std::cout << " -output_resolution " << output_resolution_message << std::endl;
122128
std::cout << " -u " << utilization_monitors_message << std::endl;
123129
std::cout << " -yolo_af " << yolo_af_message << std::endl;
130+
std::cout << " -anchors " << anchors_message << std::endl;
131+
std::cout << " -masks " << masks_message << std::endl;
124132
}
125133

126134
class ColorPalette {
@@ -280,6 +288,28 @@ int main(int argc, char *argv[]) {
280288
return 0;
281289
}
282290

291+
const auto& strAnchors = split(FLAGS_anchors, ',');
292+
const auto& strMasks = split(FLAGS_masks, ',');
293+
294+
std::vector<float> anchors;
295+
std::vector<int64_t> masks;
296+
try {
297+
for (auto& str : strAnchors) {
298+
anchors.push_back(std::stof(str));
299+
}
300+
} catch(...) {
301+
throw std::runtime_error("Invalid anchors list is provided.");
302+
}
303+
304+
try {
305+
for (auto& str : strMasks) {
306+
masks.push_back(std::stoll(str));
307+
}
308+
}
309+
catch (...) {
310+
throw std::runtime_error("Invalid masks list is provided.");
311+
}
312+
283313
//------------------------------- Preparing Input ------------------------------------------------------
284314
slog::info << "Reading input" << slog::endl;
285315
auto cap = openImagesCapture(FLAGS_i, FLAGS_loop);
@@ -308,7 +338,7 @@ int main(int argc, char *argv[]) {
308338
model.reset(new ModelSSD(FLAGS_m, (float)FLAGS_t, FLAGS_auto_resize, labels));
309339
}
310340
else if (FLAGS_at == "yolo") {
311-
model.reset(new ModelYolo(FLAGS_m, (float)FLAGS_t, FLAGS_auto_resize, FLAGS_yolo_af, (float)FLAGS_iou_t, labels));
341+
model.reset(new ModelYolo(FLAGS_m, (float)FLAGS_t, FLAGS_auto_resize, FLAGS_yolo_af, (float)FLAGS_iou_t, labels, anchors, masks));
312342
}
313343
else {
314344
slog::err << "No model type or invalid model type (-at) provided: " + FLAGS_at << slog::endl;

0 commit comments

Comments
 (0)