smart_classroom_demo: clarify -ad, fix bbox coordinates (#3626)

Wovchena · web-flow · commit 393d2ea57c6e · 2022-12-08T18:06:33.000+04:00
Ticket 97205
diff --git a/demos/smart_classroom_demo/cpp/README.md b/demos/smart_classroom_demo/cpp/README.md
@@ -84,7 +84,7 @@ Options:
     -d_reid '<device>'             Optional. Specify the target device for Face Reidentification Retail (the list of available devices is shown below). Default value is CPU. Use "-d HETERO:<comma-separated_devices_list>" format to specify HETERO plugin. The application looks for a suitable plugin for the specified device.
     -greedy_reid_matching          Optional. Use faster greedy matching algorithm in face reid.
     -r                             Optional. Output Inference results as raw values.
-    -ad                            Optional. Output file name to save per-person action statistics in.
+    -ad                            Optional. Output file name to save per-person action statistics in. Requires -teacher_id and -a_top to be unset and -fg to be set
     -t_ad                          Optional. Probability threshold for person/action detection.
     -t_ar                          Optional. Probability threshold for action recognition.
     -t_fd                          Optional. Probability threshold for face detections.
diff --git a/demos/smart_classroom_demo/cpp/main.cpp b/demos/smart_classroom_demo/cpp/main.cpp
@@ -544,6 +544,16 @@ int main(int argc, char* argv[]) {
         const auto actions_type = FLAGS_teacher_id.empty() ?
             (FLAGS_a_top > 0 ? TOP_K : STUDENT) :
             TEACHER;
+        if (!FLAGS_ad.empty()) {
+            if (actions_type != STUDENT) {
+                slog::err << "-ad requires -teacher_id and -a_top to be unset" << slog::endl;
+                return 1;
+            }
+            if (FLAGS_fg.empty()) {
+                slog::err << "-ad requires -fg to be set" << slog::endl;
+                return 1;
+            }
+        }
         const auto actions_map = actions_type == STUDENT ?
             split(FLAGS_student_ac, ',') : actions_type == TOP_K ?
             split(FLAGS_top_ac, ',') :
diff --git a/demos/smart_classroom_demo/cpp/smart_classroom_demo.hpp b/demos/smart_classroom_demo/cpp/smart_classroom_demo.hpp
@@ -18,8 +18,8 @@ static const char help_message[] = "Print a usage message.";
 static const char read_limit_message[] = "Optional. Read length limit before stopping or restarting reading the input.";
 static const char person_action_detection_model_message[] = "Required. Path to the Person/Action Detection Retail model (.xml) file.";
 static const char face_detection_model_message[] = "Required. Path to the Face Detection model (.xml) file.";
-static const char facial_landmarks_model_message[] = "Required. Path to the Facial Landmarks Regression Retail model (.xml) file.";
-static const char face_reid_model_message[] = "Required. Path to the Face Reidentification Retail model (.xml) file.";
+static const char facial_landmarks_model_message[] = "Optional. Path to the Facial Landmarks Regression Retail model (.xml) file.";
+static const char face_reid_model_message[] = "Optional. Path to the Face Reidentification Retail model (.xml) file.";
 static const char target_device_message_action_detection[] = "Optional. Specify the target device for Person/Action Detection Retail "
                                                              "(the list of available devices is shown below). Default value is CPU. "
                                                              "Use \"-d HETERO:<comma-separated_devices_list>\" format to specify HETERO plugin. "
@@ -42,7 +42,7 @@ static const char person_threshold_output_message[] = "Optional. Probability thr
 static const char action_threshold_output_message[] = "Optional. Probability threshold for action recognition.";
 static const char threshold_output_message_face_reid[] = "Optional. Cosine distance threshold between two vectors for face reidentification.";
 static const char reid_gallery_path_message[] = "Optional. Path to a faces gallery in .json format.";
-static const char act_stat_output_message[] = "Optional. Output file name to save per-person action statistics in.";
+static const char act_stat_output_message[] = "Optional. Output file name to save per-person action statistics in. Requires -teacher_id and -a_top to be unset and -fg to be set";
 static const char raw_output_message[] = "Optional. Output Inference results as raw values.";
 static const char no_show_message[] = "Optional. Don't show output.";
 static const char input_image_height_output_message[] = "Optional. Input image height for face detector.";
diff --git a/demos/smart_classroom_demo/cpp/src/reid_gallery.cpp b/demos/smart_classroom_demo/cpp/src/reid_gallery.cpp
@@ -139,10 +139,10 @@ size_t EmbeddingsGallery::size() const {
     return identities.size();
 }
 
-std::vector<std::string> EmbeddingsGallery::GetIDToLabelMap() const  {
+std::vector<std::string> EmbeddingsGallery::GetIDToLabelMap() const {
     std::vector<std::string> map;
     map.reserve(identities.size());
-    for (const auto& item : identities)  {
+    for (const auto& item : identities) {
         map.emplace_back(item.label);
     }
     return map;
diff --git a/demos/smart_classroom_demo/cpp_gapi/README.md b/demos/smart_classroom_demo/cpp_gapi/README.md
@@ -72,19 +72,20 @@ Options:
     -i                             Required. An input to process. The input must be a single image, a folder of images, video file or camera id.
     -loop                          Optional. Enable reading the input in a loop.
     -read_limit                    Optional. Read length limit before stopping or restarting reading the input.
-    -o "<path>"                    Optional. Name of output to save.
+    -o "<path>"                    Optional. Name of the output file(s) to save.
     -limit "<num>"                 Optional. Number of frames to store in output. If 0 is set, all frames are stored.
     -m_act '<path>'                Required. Path to the Person/Action Detection Retail model (.xml) file.
     -m_fd '<path>'                 Required. Path to the Face Detection model (.xml) file.
     -m_lm '<path>'                 Required. Path to the Facial Landmarks Regression Retail model (.xml) file.
     -m_reid '<path>'               Required. Path to the Face Reidentification Retail model (.xml) file.
-    -d_act '<device>'              Optional. Specify the target device for Person/Action Detection Retail (the list of available devices is shown below). Default value is CPU.
-    -d_fd '<device>'               Optional. Specify the target device for Face Detection Retail (the list of available devices is shown below). Default value is CPU.
-    -d_lm '<device>'               Optional. Specify the target device for Landmarks Regression Retail (the list of available devices is shown below). Default value is CPU.
-    -d_reid '<device>'             Optional. Specify the target device for Face Reidentification Retail (the list of available devices is shown below). Default value is CPU.
+          Or
+    -d_act '<device>'              Optional. Specify the target device for Person/Action Detection Retail (the list of available devices is shown below). Default value is CPU. The application looks for a suitable plugin for the specified device.
+    -d_fd '<device>'               Optional. Specify the target device for Face Detection Retail (the list of available devices is shown below). Default value is CPU. The application looks for a suitable plugin for the specified device.
+    -d_lm '<device>'               Optional. Specify the target device for Landmarks Regression Retail (the list of available devices is shown below). Default value is CPU. The application looks for a suitable plugin for the specified device.
+    -d_reid '<device>'             Optional. Specify the target device for Face Reidentification Retail (the list of available devices is shown below). Default value is CPU. The application looks for a suitable plugin for the specified device.
     -greedy_reid_matching          Optional. Use faster greedy matching algorithm in face reid.
     -r                             Optional. Output Inference results as raw values.
-    -ad                            Optional. Output file name to save per-person action statistics in.
+    -ad                            Optional. Output file name to save per-person action statistics in. Requires -teacher_id and -a_top to be unset and -fg to be set
     -t_ad                          Optional. Probability threshold for person/action detection.
     -t_ar                          Optional. Probability threshold for action recognition.
     -t_fd                          Optional. Probability threshold for face detections.
diff --git a/demos/smart_classroom_demo/cpp_gapi/include/initialize.hpp b/demos/smart_classroom_demo/cpp_gapi/include/initialize.hpp
@@ -74,14 +74,14 @@ bool isNetForSixActions(const std::string& model_path) {
     return model_path.at(model_path.size() - 5) == '6';
 }
 
-std::shared_ptr<ActionDetection> createActDetPtr(const bool net_with_six_actions,
+std::shared_ptr<ActionDetection> createActDetPtr(const std::string& ad_model_path,
                                                  const cv::Size frame_size,
                                                  const size_t actions_map_size,
                                                  const double t_ad,
                                                  const double t_ar) {
     // Load action detector
     ActionDetectorConfig action_config;
-    action_config.net_with_six_actions = net_with_six_actions;
+    action_config.net_with_six_actions = config::isNetForSixActions(ad_model_path);
     action_config.detection_confidence_threshold = static_cast<float>(t_ad);
     action_config.action_confidence_threshold = static_cast<float>(t_ar);
     action_config.num_action_classes = actions_map_size;
@@ -188,7 +188,7 @@ void printInfo(const NetsFlagsPack& flags, std::string& teacher_id, std::string&
     }
 }
 
-void configNets(const NetsFlagsPack& flags, cv::gapi::GNetPackage& networks, cv::Scalar& reid_net_in_size) {
+void configNets(const NetsFlagsPack& flags, cv::gapi::GNetPackage& networks, cv::Size& act_net_in_size, cv::Scalar& reid_net_in_size) {
     if (!flags.m_act.empty()) {
         const std::array<std::string, 7> action_detector_5 = {"mbox_loc1/out/conv/flat",
                                                               "mbox_main_conf/out/conv/flat/softmax/flat",
@@ -217,6 +217,10 @@ void configNets(const NetsFlagsPack& flags, cv::gapi::GNetPackage& networks, cv:
         // clang-format on
 
         networks += cv::gapi::networks(action_net);
+        InferenceEngine::Core core;
+        const auto layerData = core.ReadNetwork(flags.m_act).getInputsInfo().begin()->second;
+        auto layerDims = layerData->getTensorDesc().getDims();
+        act_net_in_size = {int(layerDims[3]), int(layerDims[2])};
         slog::info << "The Person/Action Detection model " << flags.m_act << " is loaded to " << flags.d_act
                    << " device." << slog::endl;
     } else {
diff --git a/demos/smart_classroom_demo/cpp_gapi/main.cpp b/demos/smart_classroom_demo/cpp_gapi/main.cpp
@@ -149,14 +149,15 @@ int main(int argc, char* argv[]) {
         cv::gapi::GNetPackage networks;
 
         /** Configure nets **/
+        cv::Size act_net_in_size;
         cv::Scalar reid_net_in_size;
-        config::configNets(netsFlags, networks, reid_net_in_size);
+        config::configNets(netsFlags, networks, act_net_in_size, reid_net_in_size);
 
         /** Configure and create action detector **/
         std::shared_ptr<ActionDetection> act_det_ptr;
         if (!ad_model_path.empty()) {
-            act_det_ptr = config::createActDetPtr(config::isNetForSixActions(ad_model_path),
-                                                  frame_size,
+            act_det_ptr = config::createActDetPtr(ad_model_path,
+                                                  act_net_in_size,
                                                   const_params.actions_map.size(),
                                                   FLAGS_t_ad,
                                                   FLAGS_t_ar);
@@ -185,6 +186,16 @@ int main(int argc, char* argv[]) {
             slog::err << "Teacher id does not exist in the gallery!" << slog::endl;
             return 1;
         }
+        if (!FLAGS_ad.empty()) {
+            if (const_params.actions_type != STUDENT) {
+                slog::err << "-ad requires -teacher_id and -a_top to be unset" << slog::endl;
+                return 1;
+            }
+            if (FLAGS_fg.empty()) {
+                slog::err << "-ad requires -fg to be set" << slog::endl;
+                return 1;
+            }
+        }
 
         /** ---------------- Main graph of demo ---------------- **/
         cv::GMat in;
diff --git a/demos/smart_classroom_demo/cpp_gapi/smart_classroom_demo_gapi.hpp b/demos/smart_classroom_demo/cpp_gapi/smart_classroom_demo_gapi.hpp
@@ -45,7 +45,7 @@ static const char action_threshold_output_message[] = "Optional. Probability thr
 static const char threshold_output_message_face_reid[] =
     "Optional. Cosine distance threshold between two vectors for face reidentification.";
 static const char reid_gallery_path_message[] = "Optional. Path to a faces gallery in .json format.";
-static const char act_stat_output_message[] = "Optional. Output file name to save per-person action statistics in.";
+static const char act_stat_output_message[] = "Optional. Output file name to save per-person action statistics in. Requires -teacher_id and -a_top to be unset and -fg to be set";
 static const char raw_output_message[] = "Optional. Output Inference results as raw values.";
 static const char no_show_message[] = "Optional. Don't show output.";
 static const char input_image_height_output_message[] = "Optional. Input image height for face detector.";
diff --git a/demos/smart_classroom_demo/cpp_gapi/src/action_detector.cpp b/demos/smart_classroom_demo/cpp_gapi/src/action_detector.cpp
@@ -87,8 +87,8 @@ inline ActionDetection::NormalizedBBox ActionDetection::GeneratePriorBox(int pos
                                                                          int step,
                                                                          const cv::Size2f& anchor,
                                                                          const cv::Size& blob_size) const {
-    const float row = static_cast<float>(pos / blob_size.width);
-    const float col = static_cast<float>(pos % blob_size.width);
+    const int row = pos / blob_size.width;
+    const int col = pos % blob_size.width;
 
     const float center_x = (col + 0.5f) * static_cast<float>(step);
     const float center_y = (row + 0.5f) * static_cast<float>(step);

Original file line number	Diff line number	Diff line change
`@@ -139,10 +139,10 @@ size_t EmbeddingsGallery::size() const {`
`139`	`139`	`return identities.size();`
`140`	`140`	`}`
`141`	`141`
`142`		`-std::vector<std::string> EmbeddingsGallery::GetIDToLabelMap() const  {`
	`142`	`+std::vector<std::string> EmbeddingsGallery::GetIDToLabelMap() const {`
`143`	`143`	`std::vector<std::string> map;`
`144`	`144`	`map.reserve(identities.size());`
`145`		`- for (const auto& item : identities)  {`
	`145`	`+ for (const auto& item : identities) {`
`146`	`146`	`map.emplace_back(item.label);`
`147`	`147`	`}`
`148`	`148`	`return map;`