Merge branch 'master' into smartlab_models

wdkwyf · web-flow · commit a10f83c70ddf · 2022-10-28T14:48:59.000+08:00
diff --git a/demos/gaze_estimation_demo/cpp/src/results_marker.cpp b/demos/gaze_estimation_demo/cpp/src/results_marker.cpp
@@ -60,17 +60,34 @@ void ResultsMarker::mark(cv::Mat& image, const FaceInferenceResults& faceInferen
         auto xCenter = faceBoundingBox.x + faceBoundingBoxWidth / 2;
         auto yCenter = faceBoundingBox.y + faceBoundingBoxHeight / 2;
 
-        // center to right
+        // OX points from face center to camera
+        // OY points from face center to right
+        // OZ points from face center upward
+
+        // Rotation matrix:
+        // Yaw - counterclockwise Pitch - counterclockwise Roll - clockwise
+        //     [cosY -sinY 0]          [ cosP 0 sinP]       [1    0    0 ]
+        //     [sinY  cosY 0]    *     [  0   1  0  ]   *   [0  cosR sinR] =
+        //     [  0    0   1]          [-sinP 0 cosP]       [0 -sinR cosR]
+
+        //   [cosY*cosP -cosY*sinP*sinR-sinY*cosR cosY*sinP*cosR-sinY*sinR]
+        // = [sinY*cosP  cosY*cosR-sinY*sinP*sinR sinY*sinP*cosR+cosY*sinR]
+        //   [  -sinP           -cosP*sinR                cosP*cosR       ]
+
+        // Multiply third row by -1 because screen drawing axis points down
+        // Drop first row to project to a screen plane
+
+        // OY: center to right
         cv::line(image, cv::Point(xCenter, yCenter),
-                 cv::Point(static_cast<int>(xCenter + axisLength * (cosR * cosY + sinY * sinP * sinR)),
+                 cv::Point(static_cast<int>(xCenter + axisLength * (cosR * cosY - sinY * sinP * sinR)),
                            static_cast<int>(yCenter + axisLength * cosP * sinR)),
                  cv::Scalar(0, 0, 255), 2);
-        // center to top
+        // OZ: center to top
         cv::line(image, cv::Point(xCenter, yCenter),
                  cv::Point(static_cast<int>(xCenter + axisLength * (cosR * sinY * sinP + cosY * sinR)),
                            static_cast<int>(yCenter - axisLength * cosP * cosR)),
                  cv::Scalar(0, 255, 0), 2);
-        // center to forward
+        // OX: center to camera
         cv::line(image, cv::Point(xCenter, yCenter),
                  cv::Point(static_cast<int>(xCenter + axisLength * sinY * cosP),
                            static_cast<int>(yCenter + axisLength * sinP)),
diff --git a/demos/gaze_estimation_demo/cpp_gapi/src/results_marker.cpp b/demos/gaze_estimation_demo/cpp_gapi/src/results_marker.cpp
@@ -66,27 +66,38 @@ void ResultsMarker::mark(cv::Mat& image, const FaceInferenceResults& faceInferen
         auto xCenter = faceBoundingBox.x + faceBoundingBoxWidth / 2;
         auto yCenter = faceBoundingBox.y + faceBoundingBoxHeight / 2;
 
-        // center to right
-        cv::line(image,
-                 cv::Point(xCenter, yCenter),
-                 cv::Point(static_cast<int>(xCenter + axisLength * (cosR * cosY + sinY * sinP * sinR)),
+        // OX points from face center to camera
+        // OY points from face center to right
+        // OZ points from face center upward
+
+        // Rotation matrix:
+        // Yaw - counterclockwise Pitch - counterclockwise Roll - clockwise
+        //     [cosY -sinY 0]          [ cosP 0 sinP]       [1    0    0 ]
+        //     [sinY  cosY 0]    *     [  0   1  0  ]   *   [0  cosR sinR] =
+        //     [  0    0   1]          [-sinP 0 cosP]       [0 -sinR cosR]
+
+        //   [cosY*cosP -cosY*sinP*sinR-sinY*cosR cosY*sinP*cosR-sinY*sinR]
+        // = [sinY*cosP  cosY*cosR-sinY*sinP*sinR sinY*sinP*cosR+cosY*sinR]
+        //   [  -sinP           -cosP*sinR                cosP*cosR       ]
+
+        // Multiply third row by -1 because screen drawing axis points down
+        // Drop first row to project to a screen plane
+
+        // OY: center to right
+        cv::line(image, cv::Point(xCenter, yCenter),
+                 cv::Point(static_cast<int>(xCenter + axisLength * (cosR * cosY - sinY * sinP * sinR)),
                            static_cast<int>(yCenter + axisLength * cosP * sinR)),
-                 cv::Scalar(0, 0, 255),
-                 2);
-        // center to top
-        cv::line(image,
-                 cv::Point(xCenter, yCenter),
+                 cv::Scalar(0, 0, 255), 2);
+        // OZ: center to top
+        cv::line(image, cv::Point(xCenter, yCenter),
                  cv::Point(static_cast<int>(xCenter + axisLength * (cosR * sinY * sinP + cosY * sinR)),
                            static_cast<int>(yCenter - axisLength * cosP * cosR)),
-                 cv::Scalar(0, 255, 0),
-                 2);
-        // center to forward
-        cv::line(image,
-                 cv::Point(xCenter, yCenter),
+                 cv::Scalar(0, 255, 0), 2);
+        // OX: center to camera
+        cv::line(image, cv::Point(xCenter, yCenter),
                  cv::Point(static_cast<int>(xCenter + axisLength * sinY * cosP),
                            static_cast<int>(yCenter + axisLength * sinP)),
-                 cv::Scalar(255, 0, 255),
-                 2);
+                 cv::Scalar(255, 0, 255), 2);
 
         putHighlightedText(
             image,
diff --git a/models/intel/head-pose-estimation-adas-0001/README.md b/models/intel/head-pose-estimation-adas-0001/README.md
@@ -6,6 +6,19 @@ Head pose estimation network based on simple, handmade CNN architecture. Angle r
 layers are convolutions + ReLU + batch norm + fully connected with
 one output.
 
+The estimator outputs yaw, pitch, and roll angles measured in degrees. Assume the following coordinate system:
+* OX points from face center to camera
+* OY points from face center to right
+* OZ points from face center upward
+
+The predicted angles show how the face is rotated according to a rotation matrix:
+```
+Yaw - counterclockwise, Pitch - counterclockwise, Roll - clockwise
+    [cosY -sinY 0]          [ cosP 0 sinP]       [1    0    0 ]   [cosY*cosP -cosY*sinP*sinR-sinY*cosR cosY*sinP*cosR-sinY*sinR]
+    [sinY  cosY 0]    *     [  0   1  0  ]   *   [0  cosR sinR] = [sinY*cosP  cosY*cosR-sinY*sinP*sinR sinY*sinP*cosR+cosY*sinR]
+    [  0    0   1]          [-sinP 0 cosP]       [0 -sinR cosR]   [  -sinP           -cosP*sinR                cosP*cosR       ]
+```
+
 ## Validation Dataset
 
 [Biwi Kinect Head Pose Database](https://icu.ee.ethz.ch/research/datsets.html)
diff --git a/models/intel/machine-translation-nar-de-en-0002/README.md b/models/intel/machine-translation-nar-de-en-0002/README.md
@@ -2,7 +2,7 @@
 
 ## Use Case and High-Level Description
 
-This is a Deutsch-English machine translation model based on non-autoregressive Transformer topology.
+This is a Deutsch-English machine translation model based on non-autoregressive Transformer topology. The model is [trained](https://github.com/openvinotoolkit/training_extensions/tree/089de2f24667329a58e8560ed4e01ef203e99def/misc/pytorch_toolkit/machine_translation) on an internal dataset.
 
 Tokenization occurs using the SentencePieceBPETokenizer (see the demo code for implementation details) and the enclosed tokenizer_src and tokenizer_tgt folders.
 
diff --git a/models/intel/machine-translation-nar-en-de-0002/README.md b/models/intel/machine-translation-nar-en-de-0002/README.md
@@ -2,7 +2,7 @@
 
 ## Use Case and High-Level Description
 
-This is an English-Deutsch machine translation model based on non-autoregressive Transformer topology.
+This is an English-Deutsch machine translation model based on non-autoregressive Transformer topology. The model is [trained](https://github.com/openvinotoolkit/training_extensions/tree/089de2f24667329a58e8560ed4e01ef203e99def/misc/pytorch_toolkit/machine_translation) on an internal dataset.
 
 Tokenization occurs using the SentencePieceBPETokenizer (see the demo code for implementation details) and the enclosed tokenizer_src and tokenizer_tgt folders.
 
diff --git a/models/intel/machine-translation-nar-en-ru-0002/README.md b/models/intel/machine-translation-nar-en-ru-0002/README.md
@@ -2,7 +2,7 @@
 
 ## Use Case and High-Level Description
 
-This is an English-Russian machine translation model based on non-autoregressive Transformer topology.
+This is an English-Russian machine translation model based on non-autoregressive Transformer topology. The model is [trained](https://github.com/openvinotoolkit/training_extensions/tree/089de2f24667329a58e8560ed4e01ef203e99def/misc/pytorch_toolkit/machine_translation) on an internal dataset.
 
 Tokenization occurs using the SentencePieceBPETokenizer (see the demo code for implementation details) and is enclosed in tokenizer_src and tokenizer_tgt folders.
 
diff --git a/models/intel/machine-translation-nar-ru-en-0002/README.md b/models/intel/machine-translation-nar-ru-en-0002/README.md
@@ -2,7 +2,7 @@
 
 ## Use Case and High-Level Description
 
-This is a Russian-English machine translation model based on non-autoregressive Transformer topology.
+This is a Russian-English machine translation model based on non-autoregressive Transformer topology. The model is [trained](https://github.com/openvinotoolkit/training_extensions/tree/089de2f24667329a58e8560ed4e01ef203e99def/misc/pytorch_toolkit/machine_translation) on an internal dataset.
 
 Tokenization occurs using the SentencePieceBPETokenizer (see the demo code for implementation details) and the enclosed tokenizer_src and tokenizer_tgt folders.
 
diff --git a/models/public/detr-resnet50/README.md b/models/public/detr-resnet50/README.md
@@ -66,7 +66,7 @@ Expected color order is `BGR`.
     - `w` - width of bounding box(values are in normalized format, in range [0, 1])
     - `h` - height of bounding box(values are in normalized format, in range [0, 1])
 
-2. Scores, name: `scores`, shape - `1, 100, 92`. Contains scores for 91 [Common Objects in Context (COCO)](https://cocodataset.org/#home) object classes. The last class is `no-object` class.
+2. Scores, name: `scores`, shape - `1, 100, 92`. Contains scores in logits format for 91 [Common Objects in Context (COCO)](https://cocodataset.org/#home) object classes. The last class is `no-object` class.
 
 ### Converted model
 
@@ -81,7 +81,7 @@ Expected color order is `BGR`.
     - `w` - width of bounding box(values are in normalized format, in range [0, 1])
     - `h` - height of bounding box(values are in normalized format, in range [0, 1])
 
-2. Scores, name: `scores`, shape - `1, 100, 92`. Contains scores for 91 [Common Objects in Context (COCO)](https://cocodataset.org/#home) object classes. The last class is `no-object` class.
+2. Scores, name: `scores`, shape - `1, 100, 92`. Contains scores in logits format for 91 [Common Objects in Context (COCO)](https://cocodataset.org/#home) object classes. The last class is `no-object` class.
 
 ## Download a Model and Convert it into OpenVINO™ IR Format