From 8b5add16d2e4a35524a2f0472f5504c1547d5881 Mon Sep 17 00:00:00 2001
From: Ronald Hecker <ronald.hecker@intel.com>
Date: Wed, 23 Apr 2025 16:05:36 +0200
Subject: [PATCH 01/16] Wrap Result in Scene class

Wrapping every result in a Scene lets the virtual overrides return one
common type, so they no longer have to bother with pointer-cast logic.
This is the first step. The next step should be to consolidate every
model type into Scene output without a nested ResultBase-derived
result. This can now be done gradually.
---
 src/cpp/models/include/models/anomaly_model.h |  6 +--
 src/cpp/models/include/models/base_model.h    | 10 ++--
 .../include/models/classification_model.h     | 12 ++---
 .../models/include/models/detection_model.h   |  4 +-
 .../include/models/detection_model_ssd.h      |  6 +--
 .../include/models/detection_model_yolo.h     |  5 +-
 .../models/detection_model_yolov3_onnx.h      |  2 +-
 .../include/models/detection_model_yolox.h    |  2 +-
 .../include/models/instance_segmentation.h    |  6 +--
 .../include/models/keypoint_detection.h       |  6 +--
 src/cpp/models/include/models/results.h       | 19 +++++++
 .../include/models/segmentation_model.h       |  6 +--
 src/cpp/models/src/anomaly_model.cpp          | 25 ++++------
 src/cpp/models/src/base_model.cpp             | 26 ++++------
 src/cpp/models/src/classification_model.cpp   | 50 +++++++++----------
 src/cpp/models/src/detection_model.cpp        | 15 ++----
 src/cpp/models/src/detection_model_ssd.cpp    | 27 +++++-----
 src/cpp/models/src/detection_model_yolo.cpp   | 20 +++++---
 .../src/detection_model_yolov3_onnx.cpp       |  9 ++--
 src/cpp/models/src/detection_model_yolox.cpp  |  8 +--
 src/cpp/models/src/instance_segmentation.cpp  | 27 +++++-----
 src/cpp/models/src/keypoint_detection.cpp     | 23 ++++-----
 src/cpp/models/src/segmentation_model.cpp     | 29 +++++------
 src/cpp/tilers/include/tilers/detection.h     |  8 +--
 .../include/tilers/instance_segmentation.h    |  8 +--
 .../include/tilers/semantic_segmentation.h    |  6 +--
 src/cpp/tilers/include/tilers/tiler_base.h    | 10 ++--
 src/cpp/tilers/src/detection.cpp              | 32 ++++++------
 src/cpp/tilers/src/instance_segmentation.cpp  | 31 ++++++------
 src/cpp/tilers/src/semantic_segmentation.cpp  | 34 ++++++-------
 src/cpp/tilers/src/tiler_base.cpp             | 10 ++--
 tests/cpp/accuracy/test_YOLOv8.cpp            |  2 +-
 tests/cpp/accuracy/test_accuracy.cpp          | 26 +++++-----
 33 files changed, 249 insertions(+), 261 deletions(-)
diff --git a/src/cpp/models/include/models/anomaly_model.h b/src/cpp/models/include/models/anomaly_model.h
index 1cc5be22..6dd55d91 100644
--- a/src/cpp/models/include/models/anomaly_model.h
+++ b/src/cpp/models/include/models/anomaly_model.h
@@ -23,9 +23,9 @@ class AnomalyModel : public BaseModel {
                                                       const std::string& device = "AUTO");
     static std::unique_ptr<AnomalyModel> create_model(std::shared_ptr<InferenceAdapter>& adapter);
 
-    virtual std::unique_ptr<AnomalyResult> infer(const ImageInputData& inputData);
-    virtual std::vector<std::unique_ptr<AnomalyResult>> inferBatch(const std::vector<ImageInputData>& inputImgs);
-    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+    virtual std::unique_ptr<Scene> infer(const ImageInputData& inputData);
+    virtual std::vector<std::unique_ptr<Scene>> inferBatch(const std::vector<ImageInputData>& inputImgs);
+    std::unique_ptr<Scene> postprocess(InferenceResult& infResult) override;
 
     friend std::ostream& operator<<(std::ostream& os, std::unique_ptr<AnomalyModel>& model);
 
diff --git a/src/cpp/models/include/models/base_model.h b/src/cpp/models/include/models/base_model.h
index 85131805..383bc865 100644
--- a/src/cpp/models/include/models/base_model.h
+++ b/src/cpp/models/include/models/base_model.h
@@ -38,7 +38,7 @@ class BaseModel {
     BaseModel(std::shared_ptr<InferenceAdapter>& adapter, const ov::AnyMap& configuration = {});
 
     virtual std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, InferenceInput& input);
-    virtual std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) = 0;
+    virtual std::unique_ptr<Scene> postprocess(InferenceResult& infResult) = 0;
 
     void load(ov::Core& core, const std::string& device, size_t num_infer_requests = 1);
 
@@ -49,7 +49,7 @@ class BaseModel {
     virtual void awaitAll();
     virtual void awaitAny();
     virtual void setCallback(
-        std::function<void(std::unique_ptr<ResultBase>, const ov::AnyMap& callback_args)> callback);
+        std::function<void(std::unique_ptr<Scene>, const ov::AnyMap& callback_args)> callback);
 
     std::shared_ptr<ov::Model> getModel();
     std::shared_ptr<InferenceAdapter> getInferenceAdapter();
@@ -67,8 +67,8 @@ class BaseModel {
                                                const std::vector<float>& scale,
                                                const std::type_info& dtype = typeid(int));
     virtual void inferAsync(const ImageInputData& inputData, const ov::AnyMap& callback_args = {});
-    std::unique_ptr<ResultBase> inferImage(const ImageInputData& inputData);
-    std::vector<std::unique_ptr<ResultBase>> inferBatchImage(const std::vector<ImageInputData>& inputData);
+    std::unique_ptr<Scene> inferImage(const ImageInputData& inputData);
+    std::vector<std::unique_ptr<Scene>> inferBatchImage(const std::vector<ImageInputData>& inputData);
 
 protected:
     RESIZE_MODE selectResizeMode(const std::string& resize_type);
@@ -104,5 +104,5 @@ class BaseModel {
     std::shared_ptr<InferenceAdapter> inferenceAdapter;
     std::map<std::string, ov::Layout> inputsLayouts;
     ov::Layout getInputLayout(const ov::Output<ov::Node>& input);
-    std::function<void(std::unique_ptr<ResultBase>, const ov::AnyMap&)> lastCallback;
+    std::function<void(std::unique_ptr<Scene>, const ov::AnyMap&)> lastCallback;
 };
diff --git a/src/cpp/models/include/models/classification_model.h b/src/cpp/models/include/models/classification_model.h
index 88ac03bc..529d5e13 100644
--- a/src/cpp/models/include/models/classification_model.h
+++ b/src/cpp/models/include/models/classification_model.h
@@ -99,10 +99,10 @@ class ClassificationModel : public BaseModel {
                                                              const std::string& device = "AUTO");
     static std::unique_ptr<ClassificationModel> create_model(std::shared_ptr<InferenceAdapter>& adapter);
 
-    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+    std::unique_ptr<Scene> postprocess(InferenceResult& infResult) override;
 
-    virtual std::unique_ptr<ClassificationResult> infer(const ImageInputData& inputData);
-    virtual std::vector<std::unique_ptr<ClassificationResult>> inferBatch(const std::vector<ImageInputData>& inputImgs);
+    virtual std::unique_ptr<Scene> infer(const ImageInputData& inputData);
+    virtual std::vector<std::unique_ptr<Scene>> inferBatch(const std::vector<ImageInputData>& inputImgs);
     static std::string ModelType;
 
 protected:
@@ -119,8 +119,8 @@ class ClassificationModel : public BaseModel {
     void init_from_config(const ov::AnyMap& top_priority, const ov::AnyMap& mid_priority);
     void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
     void updateModelInfo() override;
-    std::unique_ptr<ResultBase> get_multilabel_predictions(InferenceResult& infResult, bool add_raw_scores);
-    std::unique_ptr<ResultBase> get_multiclass_predictions(InferenceResult& infResult, bool add_raw_scores);
-    std::unique_ptr<ResultBase> get_hierarchical_predictions(InferenceResult& infResult, bool add_raw_scores);
+    std::unique_ptr<Scene> get_multilabel_predictions(InferenceResult& infResult, bool add_raw_scores);
+    std::unique_ptr<Scene> get_multiclass_predictions(InferenceResult& infResult, bool add_raw_scores);
+    std::unique_ptr<Scene> get_hierarchical_predictions(InferenceResult& infResult, bool add_raw_scores);
     ov::Tensor reorder_saliency_maps(const ov::Tensor&);
 };
diff --git a/src/cpp/models/include/models/detection_model.h b/src/cpp/models/include/models/detection_model.h
index 16ba8cf8..9ec7ab8f 100644
--- a/src/cpp/models/include/models/detection_model.h
+++ b/src/cpp/models/include/models/detection_model.h
@@ -25,8 +25,8 @@ class DetectionModel : public BaseModel {
                                                         const std::string& device = "AUTO");
     static std::unique_ptr<DetectionModel> create_model(std::shared_ptr<InferenceAdapter>& adapter);
 
-    virtual std::unique_ptr<DetectionResult> infer(const ImageInputData& inputData);
-    virtual std::vector<std::unique_ptr<DetectionResult>> inferBatch(const std::vector<ImageInputData>& inputImgs);
+    virtual std::unique_ptr<Scene> infer(const ImageInputData& inputData);
+    virtual std::vector<std::unique_ptr<Scene>> inferBatch(const std::vector<ImageInputData>& inputImgs);
 
 protected:
     float confidence_threshold = 0.5f;
diff --git a/src/cpp/models/include/models/detection_model_ssd.h b/src/cpp/models/include/models/detection_model_ssd.h
index acb3060f..188301c5 100644
--- a/src/cpp/models/include/models/detection_model_ssd.h
+++ b/src/cpp/models/include/models/detection_model_ssd.h
@@ -25,12 +25,12 @@ class ModelSSD : public DetectionModel {
 public:
     using DetectionModel::DetectionModel;
     std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, InferenceInput& input) override;
-    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+    std::unique_ptr<Scene> postprocess(InferenceResult& infResult) override;
     static std::string ModelType;
 
 protected:
-    std::unique_ptr<ResultBase> postprocessSingleOutput(InferenceResult& infResult);
-    std::unique_ptr<ResultBase> postprocessMultipleOutputs(InferenceResult& infResult);
+    std::unique_ptr<Scene> postprocessSingleOutput(InferenceResult& infResult);
+    std::unique_ptr<Scene> postprocessMultipleOutputs(InferenceResult& infResult);
     void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
     void prepareSingleOutput(std::shared_ptr<ov::Model>& model);
     void prepareMultipleOutputs(std::shared_ptr<ov::Model>& model);
diff --git a/src/cpp/models/include/models/detection_model_yolo.h b/src/cpp/models/include/models/detection_model_yolo.h
index 56055588..40ffad20 100644
--- a/src/cpp/models/include/models/detection_model_yolo.h
+++ b/src/cpp/models/include/models/detection_model_yolo.h
@@ -18,7 +18,6 @@
 
 struct DetectedObject;
 struct InferenceResult;
-struct ResultBase;
 
 class ModelYolo : public DetectionModelExt {
 protected:
@@ -46,7 +45,7 @@ class ModelYolo : public DetectionModelExt {
     ModelYolo(std::shared_ptr<ov::Model>& model, const ov::AnyMap& configuration);
     ModelYolo(std::shared_ptr<InferenceAdapter>& adapter);
 
-    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+    std::unique_ptr<Scene> postprocess(InferenceResult& infResult) override;
 
 protected:
     void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
@@ -82,7 +81,7 @@ class YOLOv5 : public DetectionModelExt {
 public:
     YOLOv5(std::shared_ptr<ov::Model>& model, const ov::AnyMap& configuration);
     YOLOv5(std::shared_ptr<InferenceAdapter>& adapter);
-    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+    std::unique_ptr<Scene> postprocess(InferenceResult& infResult) override;
     static std::string ModelType;
 };
 
diff --git a/src/cpp/models/include/models/detection_model_yolov3_onnx.h b/src/cpp/models/include/models/detection_model_yolov3_onnx.h
index 9dead24d..a11c900a 100644
--- a/src/cpp/models/include/models/detection_model_yolov3_onnx.h
+++ b/src/cpp/models/include/models/detection_model_yolov3_onnx.h
@@ -17,7 +17,7 @@ class ModelYoloV3ONNX : public DetectionModel {
     ModelYoloV3ONNX(std::shared_ptr<InferenceAdapter>& adapter);
     using DetectionModel::DetectionModel;
 
-    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+    std::unique_ptr<Scene> postprocess(InferenceResult& infResult) override;
     std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, InferenceInput& input) override;
 
 protected:
diff --git a/src/cpp/models/include/models/detection_model_yolox.h b/src/cpp/models/include/models/detection_model_yolox.h
index bc747ee5..1849ba68 100644
--- a/src/cpp/models/include/models/detection_model_yolox.h
+++ b/src/cpp/models/include/models/detection_model_yolox.h
@@ -17,7 +17,7 @@ class ModelYoloX : public DetectionModelExt {
     ModelYoloX(std::shared_ptr<InferenceAdapter>& adapter);
     using DetectionModelExt::DetectionModelExt;
 
-    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+    std::unique_ptr<Scene> postprocess(InferenceResult& infResult) override;
     std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, InferenceInput& input) override;
     static std::string ModelType;
 
diff --git a/src/cpp/models/include/models/instance_segmentation.h b/src/cpp/models/include/models/instance_segmentation.h
index c6cadce7..21c94690 100644
--- a/src/cpp/models/include/models/instance_segmentation.h
+++ b/src/cpp/models/include/models/instance_segmentation.h
@@ -30,10 +30,10 @@ class MaskRCNNModel : public BaseModel {
                                                        const std::string& device = "AUTO");
     static std::unique_ptr<MaskRCNNModel> create_model(std::shared_ptr<InferenceAdapter>& adapter);
 
-    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+    std::unique_ptr<Scene> postprocess(InferenceResult& infResult) override;
 
-    virtual std::unique_ptr<InstanceSegmentationResult> infer(const ImageInputData& inputData);
-    virtual std::vector<std::unique_ptr<InstanceSegmentationResult>> inferBatch(
+    virtual std::unique_ptr<Scene> infer(const ImageInputData& inputData);
+    virtual std::vector<std::unique_ptr<Scene>> inferBatch(
         const std::vector<ImageInputData>& inputImgs);
     static std::string ModelType;
     bool postprocess_semantic_masks = true;
diff --git a/src/cpp/models/include/models/keypoint_detection.h b/src/cpp/models/include/models/keypoint_detection.h
index 15d21cba..6fbea292 100644
--- a/src/cpp/models/include/models/keypoint_detection.h
+++ b/src/cpp/models/include/models/keypoint_detection.h
@@ -29,10 +29,10 @@ class KeypointDetectionModel : public BaseModel {
                                                                 const std::string& device = "AUTO");
     static std::unique_ptr<KeypointDetectionModel> create_model(std::shared_ptr<InferenceAdapter>& adapter);
 
-    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+    std::unique_ptr<Scene> postprocess(InferenceResult& infResult) override;
 
-    virtual std::unique_ptr<KeypointDetectionResult> infer(const ImageInputData& inputData);
-    virtual std::vector<std::unique_ptr<KeypointDetectionResult>> inferBatch(
+    virtual std::unique_ptr<Scene> infer(const ImageInputData& inputData);
+    virtual std::vector<std::unique_ptr<Scene>> inferBatch(
         const std::vector<ImageInputData>& inputImgs);
 
     static std::string ModelType;
diff --git a/src/cpp/models/include/models/results.h b/src/cpp/models/include/models/results.h
index 1a648723..d87849b5 100644
--- a/src/cpp/models/include/models/results.h
+++ b/src/cpp/models/include/models/results.h
@@ -16,6 +16,7 @@
 #include "internal_model_data.h"
 
 struct MetaData;
+
 struct ResultBase {
     ResultBase(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
         : frameId(frameId),
@@ -377,3 +378,21 @@ struct KeypointDetectionResult : public ResultBase {
         : ResultBase(frameId, metaData) {}
     std::vector<DetectedKeypoints> poses;
 };
+
+
+class Scene {
+public:
+    Scene(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+        : frameId(frameId),
+          metaData(metaData) {}
+
+    int64_t frameId;
+    std::shared_ptr<MetaData> metaData;
+
+    std::unique_ptr<DetectionResult> detection_result;
+    std::unique_ptr<ClassificationResult> classification_result;
+    std::unique_ptr<KeypointDetectionResult> keypoint_detection_result;
+    std::unique_ptr<AnomalyResult> anomaly_result;
+    std::unique_ptr<InstanceSegmentationResult> instance_segmentation_result;
+    std::unique_ptr<ImageResult> image_result;
+};
diff --git a/src/cpp/models/include/models/segmentation_model.h b/src/cpp/models/include/models/segmentation_model.h
index 922828f7..d802e026 100644
--- a/src/cpp/models/include/models/segmentation_model.h
+++ b/src/cpp/models/include/models/segmentation_model.h
@@ -31,10 +31,10 @@ class SegmentationModel : public BaseModel {
                                                            const std::string& device = "AUTO");
     static std::unique_ptr<SegmentationModel> create_model(std::shared_ptr<InferenceAdapter>& adapter);
 
-    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+    std::unique_ptr<Scene> postprocess(InferenceResult& infResult) override;
 
-    virtual std::unique_ptr<ImageResult> infer(const ImageInputData& inputData);
-    virtual std::vector<std::unique_ptr<ImageResult>> inferBatch(const std::vector<ImageInputData>& inputImgs);
+    virtual std::unique_ptr<Scene> infer(const ImageInputData& inputData);
+    virtual std::vector<std::unique_ptr<Scene>> inferBatch(const std::vector<ImageInputData>& inputImgs);
 
     static std::string ModelType;
     std::vector<Contour> getContours(const ImageResultWithSoftPrediction& imageResult);
diff --git a/src/cpp/models/src/anomaly_model.cpp b/src/cpp/models/src/anomaly_model.cpp
index eeccf08a..1cd1a9e8 100644
--- a/src/cpp/models/src/anomaly_model.cpp
+++ b/src/cpp/models/src/anomaly_model.cpp
@@ -38,23 +38,15 @@ AnomalyModel::AnomalyModel(std::shared_ptr<InferenceAdapter>& adapter, const ov:
     init_from_config(configuration, adapter->getModelConfig());
 }
 
-std::unique_ptr<AnomalyResult> AnomalyModel::infer(const ImageInputData& inputData) {
-    auto result = BaseModel::inferImage(inputData);
-
-    return std::unique_ptr<AnomalyResult>(static_cast<AnomalyResult*>(result.release()));
+std::unique_ptr<Scene> AnomalyModel::infer(const ImageInputData& inputData) {
+    return BaseModel::inferImage(inputData);
 }
 
-std::vector<std::unique_ptr<AnomalyResult>> AnomalyModel::inferBatch(const std::vector<ImageInputData>& inputImgs) {
-    auto results = BaseModel::inferBatchImage(inputImgs);
-    std::vector<std::unique_ptr<AnomalyResult>> anoResults;
-    anoResults.reserve(results.size());
-    for (auto& result : results) {
-        anoResults.emplace_back(static_cast<AnomalyResult*>(result.release()));
-    }
-    return anoResults;
+std::vector<std::unique_ptr<Scene>> AnomalyModel::inferBatch(const std::vector<ImageInputData>& inputImgs) {
+    return BaseModel::inferBatchImage(inputImgs);
 }
 
-std::unique_ptr<ResultBase> AnomalyModel::postprocess(InferenceResult& infResult) {
+std::unique_ptr<Scene> AnomalyModel::postprocess(InferenceResult& infResult) {
     ov::Tensor predictions = infResult.outputsData[outputNames[0]];
     const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>();
 
@@ -95,13 +87,16 @@ std::unique_ptr<ResultBase> AnomalyModel::postprocess(InferenceResult& infResult
         pred_boxes = getBoxes(pred_mask);
     }
 
-    AnomalyResult* result = new AnomalyResult(infResult.frameId, infResult.metaData);
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+    auto result = std::make_unique<AnomalyResult>(infResult.frameId, infResult.metaData);
     result->anomaly_map = std::move(anomaly_map);
     result->pred_score = pred_score;
     result->pred_label = std::move(pred_label);
     result->pred_mask = std::move(pred_mask);
     result->pred_boxes = std::move(pred_boxes);
-    return std::unique_ptr<ResultBase>(result);
+
+    scene->anomaly_result = std::move(result);
+    return scene;
 }
 
 cv::Mat AnomalyModel::normalize(cv::Mat& tensor, float threshold) {
diff --git a/src/cpp/models/src/base_model.cpp b/src/cpp/models/src/base_model.cpp
index 6fd83d73..8db177ff 100644
--- a/src/cpp/models/src/base_model.cpp
+++ b/src/cpp/models/src/base_model.cpp
@@ -25,10 +25,10 @@ namespace {
 class TmpCallbackSetter {
 public:
     BaseModel* model;
-    std::function<void(std::unique_ptr<ResultBase>, const ov::AnyMap&)> last_callback;
+    std::function<void(std::unique_ptr<Scene>, const ov::AnyMap&)> last_callback;
     TmpCallbackSetter(BaseModel* model_,
-                      std::function<void(std::unique_ptr<ResultBase>, const ov::AnyMap&)> tmp_callback,
-                      std::function<void(std::unique_ptr<ResultBase>, const ov::AnyMap&)> last_callback_)
+                      std::function<void(std::unique_ptr<Scene>, const ov::AnyMap&)> tmp_callback,
+                      std::function<void(std::unique_ptr<Scene>, const ov::AnyMap&)> last_callback_)
         : model(model_),
           last_callback(last_callback_) {
         model->setCallback(tmp_callback);
@@ -37,7 +37,7 @@ class TmpCallbackSetter {
         if (last_callback) {
             model->setCallback(last_callback);
         } else {
-            model->setCallback([](std::unique_ptr<ResultBase>, const ov::AnyMap&) {});
+            model->setCallback([](std::unique_ptr<Scene>, const ov::AnyMap&) {});
         }
     }
 };
@@ -106,7 +106,7 @@ void BaseModel::awaitAny() {
 }
 
 void BaseModel::setCallback(
-    std::function<void(std::unique_ptr<ResultBase>, const ov::AnyMap& callback_args)> callback) {
+    std::function<void(std::unique_ptr<Scene>, const ov::AnyMap& callback_args)> callback) {
     lastCallback = callback;
     inferenceAdapter->setCallback([this, callback](ov::InferRequest request, CallbackData args) {
         InferenceResult result;
@@ -121,9 +121,7 @@ void BaseModel::setCallback(
         if (model_data_iter != args->end()) {
             result.internalModelData = std::move(model_data_iter->second.as<std::shared_ptr<InternalModelData>>());
         }
-        auto retVal = this->postprocess(result);
-        *retVal = static_cast<ResultBase&>(result);
-        callback(std::move(retVal), args ? *args : ov::AnyMap());
+        callback(std::move(this->postprocess(result)), args ? *args : ov::AnyMap());
     });
 }
 
@@ -215,7 +213,7 @@ BaseModel::BaseModel(std::shared_ptr<InferenceAdapter>& adapter, const ov::AnyMa
     init_from_config(configuration, adapter->getModelConfig());
 }
 
-std::unique_ptr<ResultBase> BaseModel::inferImage(const ImageInputData& inputData) {
+std::unique_ptr<Scene> BaseModel::inferImage(const ImageInputData& inputData) {
     InferenceInput inputs;
     InferenceResult result;
     auto internalModelData = this->preprocess(inputData, inputs);
@@ -223,21 +221,19 @@ std::unique_ptr<ResultBase> BaseModel::inferImage(const ImageInputData& inputDat
     result.outputsData = inferenceAdapter->infer(inputs);
     result.internalModelData = std::move(internalModelData);
 
-    auto retVal = this->postprocess(result);
-    *retVal = static_cast<ResultBase&>(result);
-    return retVal;
+    return this->postprocess(result);
 }
 
-std::vector<std::unique_ptr<ResultBase>> BaseModel::inferBatchImage(const std::vector<ImageInputData>& inputImgs) {
+std::vector<std::unique_ptr<Scene>> BaseModel::inferBatchImage(const std::vector<ImageInputData>& inputImgs) {
     std::vector<std::reference_wrapper<const ImageInputData>> inputData;
     inputData.reserve(inputImgs.size());
     for (const auto& img : inputImgs) {
         inputData.push_back(img);
     }
-    auto results = std::vector<std::unique_ptr<ResultBase>>(inputData.size());
+    auto results = std::vector<std::unique_ptr<Scene>>(inputData.size());
     auto setter = TmpCallbackSetter(
         this,
-        [&](std::unique_ptr<ResultBase> result, const ov::AnyMap& callback_args) {
+        [&](std::unique_ptr<Scene> result, const ov::AnyMap& callback_args) {
             size_t id = callback_args.find("id")->second.as<size_t>();
             results[id] = std::move(result);
         },
diff --git a/src/cpp/models/src/classification_model.cpp b/src/cpp/models/src/classification_model.cpp
index a9d281e1..5ca38cfa 100644
--- a/src/cpp/models/src/classification_model.cpp
+++ b/src/cpp/models/src/classification_model.cpp
@@ -293,8 +293,8 @@ std::unique_ptr<ClassificationModel> ClassificationModel::create_model(std::shar
     return classifier;
 }
 
-std::unique_ptr<ResultBase> ClassificationModel::postprocess(InferenceResult& infResult) {
-    std::unique_ptr<ResultBase> result;
+std::unique_ptr<Scene> ClassificationModel::postprocess(InferenceResult& infResult) {
+    std::unique_ptr<Scene> result;
     if (multilabel) {
         result = get_multilabel_predictions(infResult, output_raw_scores);
     } else if (hierarchical) {
@@ -303,7 +303,7 @@ std::unique_ptr<ResultBase> ClassificationModel::postprocess(InferenceResult& in
         result = get_multiclass_predictions(infResult, output_raw_scores);
     }
 
-    ClassificationResult* cls_res = static_cast<ClassificationResult*>(result.get());
+    auto& cls_res = result->classification_result;
     auto saliency_map_iter = infResult.outputsData.find(saliency_map_name);
     if (saliency_map_iter != infResult.outputsData.end()) {
         cls_res->saliency_map = std::move(saliency_map_iter->second);
@@ -313,16 +313,17 @@ std::unique_ptr<ResultBase> ClassificationModel::postprocess(InferenceResult& in
     if (feature_vector_iter != infResult.outputsData.end()) {
         cls_res->feature_vector = std::move(feature_vector_iter->second);
     }
+
     return result;
 }
 
-std::unique_ptr<ResultBase> ClassificationModel::get_multilabel_predictions(InferenceResult& infResult,
+std::unique_ptr<Scene> ClassificationModel::get_multilabel_predictions(InferenceResult& infResult,
                                                                             bool add_raw_scores) {
     const ov::Tensor& logitsTensor = infResult.outputsData.find(outputNames[0])->second;
     const float* logitsPtr = logitsTensor.data<float>();
 
-    ClassificationResult* result = new ClassificationResult(infResult.frameId, infResult.metaData);
-    auto retVal = std::unique_ptr<ResultBase>(result);
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+    auto result = std::make_unique<ClassificationResult>(infResult.frameId, infResult.metaData);
 
     auto raw_scores = ov::Tensor();
     float* raw_scoresPtr = nullptr;
@@ -343,12 +344,14 @@ std::unique_ptr<ResultBase> ClassificationModel::get_multilabel_predictions(Infe
         }
     }
 
-    return retVal;
+    scene->classification_result = std::move(result);
+    return scene;
 }
 
-std::unique_ptr<ResultBase> ClassificationModel::get_hierarchical_predictions(InferenceResult& infResult,
+std::unique_ptr<Scene> ClassificationModel::get_hierarchical_predictions(InferenceResult& infResult,
                                                                               bool add_raw_scores) {
-    ClassificationResult* result = new ClassificationResult(infResult.frameId, infResult.metaData);
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+    auto result = std::make_unique<ClassificationResult>(infResult.frameId, infResult.metaData);
 
     const ov::Tensor& logitsTensor = infResult.outputsData.find(outputNames[0])->second;
     float* logitsPtr = logitsTensor.data<float>();
@@ -396,13 +399,13 @@ std::unique_ptr<ResultBase> ClassificationModel::get_hierarchical_predictions(In
 
     auto resolved_labels = resolver->resolve_labels(predicted_labels, predicted_scores);
 
-    auto retVal = std::unique_ptr<ResultBase>(result);
     result->topLabels.reserve(resolved_labels.size());
     for (const auto& label : resolved_labels) {
         result->topLabels.emplace_back(hierarchical_info.label_to_idx[label.first], label.first, label.second);
     }
 
-    return retVal;
+    scene->classification_result = std::move(result);
+    return scene;
 }
 
 ov::Tensor ClassificationModel::reorder_saliency_maps(const ov::Tensor& source_maps) {
@@ -426,16 +429,15 @@ ov::Tensor ClassificationModel::reorder_saliency_maps(const ov::Tensor& source_m
     return reordered_maps;
 }
 
-std::unique_ptr<ResultBase> ClassificationModel::get_multiclass_predictions(InferenceResult& infResult,
+std::unique_ptr<Scene> ClassificationModel::get_multiclass_predictions(InferenceResult& infResult,
                                                                             bool add_raw_scores) {
     const ov::Tensor& indicesTensor = infResult.outputsData.find(indices_name)->second;
     const int* indicesPtr = indicesTensor.data<int>();
     const ov::Tensor& scoresTensor = infResult.outputsData.find(scores_name)->second;
     const float* scoresPtr = scoresTensor.data<float>();
 
-    ClassificationResult* result = new ClassificationResult(infResult.frameId, infResult.metaData);
-    auto retVal = std::unique_ptr<ResultBase>(result);
-
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+    auto result = std::make_unique<ClassificationResult>(infResult.frameId, infResult.metaData);
     if (add_raw_scores) {
         const ov::Tensor& logitsTensor = infResult.outputsData.find(raw_scores_name)->second;
         result->raw_scores = ov::Tensor(logitsTensor.get_element_type(), logitsTensor.get_shape());
@@ -452,7 +454,8 @@ std::unique_ptr<ResultBase> ClassificationModel::get_multiclass_predictions(Infe
         result->topLabels.emplace_back(ind, labels[ind], scoresPtr[i]);
     }
 
-    return retVal;
+    scene->classification_result = std::move(result);
+    return scene;
 }
 
 void ClassificationModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
@@ -535,20 +538,13 @@ void ClassificationModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model
     append_xai_names(model->outputs(), outputNames);
 }
 
-std::unique_ptr<ClassificationResult> ClassificationModel::infer(const ImageInputData& inputData) {
-    auto result = BaseModel::inferImage(inputData);
-    return std::unique_ptr<ClassificationResult>(static_cast<ClassificationResult*>(result.release()));
+std::unique_ptr<Scene> ClassificationModel::infer(const ImageInputData& inputData) {
+    return BaseModel::inferImage(inputData);
 }
 
-std::vector<std::unique_ptr<ClassificationResult>> ClassificationModel::inferBatch(
+std::vector<std::unique_ptr<Scene>> ClassificationModel::inferBatch(
     const std::vector<ImageInputData>& inputImgs) {
-    auto results = BaseModel::inferBatchImage(inputImgs);
-    std::vector<std::unique_ptr<ClassificationResult>> clsResults;
-    clsResults.reserve(results.size());
-    for (auto& result : results) {
-        clsResults.emplace_back(static_cast<ClassificationResult*>(result.release()));
-    }
-    return clsResults;
+    return BaseModel::inferBatchImage(inputImgs);
 }
 
 HierarchicalConfig::HierarchicalConfig(const std::string& json_repr) {
diff --git a/src/cpp/models/src/detection_model.cpp b/src/cpp/models/src/detection_model.cpp
index 6b55eeba..7f3ea895 100644
--- a/src/cpp/models/src/detection_model.cpp
+++ b/src/cpp/models/src/detection_model.cpp
@@ -102,17 +102,10 @@ std::unique_ptr<DetectionModel> DetectionModel::create_model(std::shared_ptr<Inf
     return detectionModel;
 }
 
-std::unique_ptr<DetectionResult> DetectionModel::infer(const ImageInputData& inputData) {
-    auto result = BaseModel::inferImage(inputData);
-    return std::unique_ptr<DetectionResult>(static_cast<DetectionResult*>(result.release()));
+std::unique_ptr<Scene> DetectionModel::infer(const ImageInputData& inputData) {
+    return BaseModel::inferImage(inputData);
 }
 
-std::vector<std::unique_ptr<DetectionResult>> DetectionModel::inferBatch(const std::vector<ImageInputData>& inputImgs) {
-    auto results = BaseModel::inferBatchImage(inputImgs);
-    std::vector<std::unique_ptr<DetectionResult>> detResults;
-    detResults.reserve(results.size());
-    for (auto& result : results) {
-        detResults.emplace_back(static_cast<DetectionResult*>(result.release()));
-    }
-    return detResults;
+std::vector<std::unique_ptr<Scene>> DetectionModel::inferBatch(const std::vector<ImageInputData>& inputImgs) {
+    return BaseModel::inferBatchImage(inputImgs);
 }
diff --git a/src/cpp/models/src/detection_model_ssd.cpp b/src/cpp/models/src/detection_model_ssd.cpp
index f6f7c818..0eb968cf 100644
--- a/src/cpp/models/src/detection_model_ssd.cpp
+++ b/src/cpp/models/src/detection_model_ssd.cpp
@@ -93,30 +93,29 @@ std::shared_ptr<InternalModelData> ModelSSD::preprocess(const InputData& inputDa
     return DetectionModel::preprocess(inputData, input);
 }
 
-std::unique_ptr<ResultBase> ModelSSD::postprocess(InferenceResult& infResult) {
-    std::unique_ptr<ResultBase> result = filterOutXai(outputNames).size() > 1 ? postprocessMultipleOutputs(infResult)
+std::unique_ptr<Scene> ModelSSD::postprocess(InferenceResult& infResult) {
+    std::unique_ptr<Scene> result = filterOutXai(outputNames).size() > 1 ? postprocessMultipleOutputs(infResult)
                                                                               : postprocessSingleOutput(infResult);
-    DetectionResult* cls_res = static_cast<DetectionResult*>(result.get());
     auto saliency_map_iter = infResult.outputsData.find(saliency_map_name);
     if (saliency_map_iter != infResult.outputsData.end()) {
-        cls_res->saliency_map = std::move(saliency_map_iter->second);
+        result->detection_result->saliency_map = std::move(saliency_map_iter->second);
     }
     auto feature_vector_iter = infResult.outputsData.find(feature_vector_name);
     if (feature_vector_iter != infResult.outputsData.end()) {
-        cls_res->feature_vector = std::move(feature_vector_iter->second);
+        result->detection_result->feature_vector = std::move(feature_vector_iter->second);
     }
     return result;
 }
 
-std::unique_ptr<ResultBase> ModelSSD::postprocessSingleOutput(InferenceResult& infResult) {
+std::unique_ptr<Scene> ModelSSD::postprocessSingleOutput(InferenceResult& infResult) {
     const std::vector<std::string> namesWithoutXai = filterOutXai(outputNames);
     assert(namesWithoutXai.size() == 1);
     const ov::Tensor& detectionsTensor = infResult.outputsData[namesWithoutXai[0]];
     NumAndStep numAndStep = fromSingleOutput(detectionsTensor.get_shape());
     const float* detections = detectionsTensor.data<float>();
 
-    DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
-    auto retVal = std::unique_ptr<ResultBase>(result);
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+    auto result = std::make_unique<DetectionResult>(infResult.frameId, infResult.metaData);
 
     const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
     float floatInputImgWidth = float(internalData.inputImgWidth),
@@ -168,10 +167,11 @@ std::unique_ptr<ResultBase> ModelSSD::postprocessSingleOutput(InferenceResult& i
         }
     }
 
-    return retVal;
+    scene->detection_result = std::move(result);
+    return scene;
 }
 
-std::unique_ptr<ResultBase> ModelSSD::postprocessMultipleOutputs(InferenceResult& infResult) {
+std::unique_ptr<Scene> ModelSSD::postprocessMultipleOutputs(InferenceResult& infResult) {
     const std::vector<std::string> namesWithoutXai = filterOutXai(outputNames);
     const float* boxes = infResult.outputsData[namesWithoutXai[0]].data<float>();
     NumAndStep numAndStep = fromMultipleOutputs(infResult.outputsData[namesWithoutXai[0]].get_shape());
@@ -179,8 +179,8 @@ std::unique_ptr<ResultBase> ModelSSD::postprocessMultipleOutputs(InferenceResult
     const float* scores =
         namesWithoutXai.size() > 2 ? infResult.outputsData[namesWithoutXai[2]].data<float>() : nullptr;
 
-    DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
-    auto retVal = std::unique_ptr<ResultBase>(result);
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+    auto result = std::make_unique<DetectionResult>(infResult.frameId, infResult.metaData);
 
     const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
     float floatInputImgWidth = float(internalData.inputImgWidth),
@@ -232,7 +232,8 @@ std::unique_ptr<ResultBase> ModelSSD::postprocessMultipleOutputs(InferenceResult
         }
     }
 
-    return retVal;
+    scene->detection_result = std::move(result);
+    return scene;
 }
 
 void ModelSSD::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
diff --git a/src/cpp/models/src/detection_model_yolo.cpp b/src/cpp/models/src/detection_model_yolo.cpp
index 1698b8e6..49e83f8a 100644
--- a/src/cpp/models/src/detection_model_yolo.cpp
+++ b/src/cpp/models/src/detection_model_yolo.cpp
@@ -257,8 +257,9 @@ void ModelYolo::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
     }
 }
 
-std::unique_ptr<ResultBase> ModelYolo::postprocess(InferenceResult& infResult) {
-    DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+std::unique_ptr<Scene> ModelYolo::postprocess(InferenceResult& infResult) {
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+    auto result = std::make_unique<DetectionResult>(infResult.frameId, infResult.metaData);
     std::vector<DetectedObject> objects;
 
     // Parsing outputs
@@ -308,7 +309,9 @@ std::unique_ptr<ResultBase> ModelYolo::postprocess(InferenceResult& infResult) {
         }
     }
 
-    return std::unique_ptr<ResultBase>(result);
+    scene->detection_result = std::move(result);
+
+    return scene;
 }
 
 void ModelYolo::parseYOLOOutput(const std::string& output_name,
@@ -566,7 +569,7 @@ YOLOv5::YOLOv5(std::shared_ptr<InferenceAdapter>& adapter) : DetectionModelExt(a
     init_from_config(adapter->getModelConfig(), ov::AnyMap{});
 }
 
-std::unique_ptr<ResultBase> YOLOv5::postprocess(InferenceResult& infResult) {
+std::unique_ptr<Scene> YOLOv5::postprocess(InferenceResult& infResult) {
     if (1 != infResult.outputsData.size()) {
         throw std::runtime_error("YOLO: expect 1 output");
     }
@@ -609,8 +612,9 @@ std::unique_ptr<ResultBase> YOLOv5::postprocess(InferenceResult& infResult) {
     } else {
         keep = multiclass_nms(boxes_with_class, confidences, iou_threshold, includeBoundaries, keep_top_k);
     }
-    DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
-    auto base = std::unique_ptr<ResultBase>(result);
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+    auto result = std::make_unique<DetectionResult>(infResult.frameId, infResult.metaData);
+
     const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
     float floatInputImgWidth = float(internalData.inputImgWidth),
           floatInputImgHeight = float(internalData.inputImgHeight);
@@ -636,7 +640,9 @@ std::unique_ptr<ResultBase> YOLOv5::postprocess(InferenceResult& infResult) {
         desc.label = getLabelName(desc.labelID);
         result->objects.push_back(desc);
     }
-    return base;
+    scene->detection_result = std::move(result);
+
+    return scene;
 }
 
 std::string YOLOv8::ModelType = "YOLOv8";
diff --git a/src/cpp/models/src/detection_model_yolov3_onnx.cpp b/src/cpp/models/src/detection_model_yolov3_onnx.cpp
index 68830220..f9ccf57a 100644
--- a/src/cpp/models/src/detection_model_yolov3_onnx.cpp
+++ b/src/cpp/models/src/detection_model_yolov3_onnx.cpp
@@ -116,7 +116,7 @@ float getScore(const ov::Tensor& scoresTensor, size_t classInd, size_t boxInd) {
 }
 }  // namespace
 
-std::unique_ptr<ResultBase> ModelYoloV3ONNX::postprocess(InferenceResult& infResult) {
+std::unique_ptr<Scene> ModelYoloV3ONNX::postprocess(InferenceResult& infResult) {
     // Get info about input image
     const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth;
     const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight;
@@ -133,7 +133,8 @@ std::unique_ptr<ResultBase> ModelYoloV3ONNX::postprocess(InferenceResult& infRes
     const auto boxShape = boxes.get_shape();
 
     // Generate detection results
-    DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+    auto result = std::make_unique<DetectionResult>(infResult.frameId, infResult.metaData);
     size_t numberOfBoxes = indicesShape.size() == 3 ? indicesShape[1] : indicesShape[0];
     size_t indicesStride = indicesShape.size() == 3 ? indicesShape[2] : indicesShape[1];
 
@@ -170,5 +171,7 @@ std::unique_ptr<ResultBase> ModelYoloV3ONNX::postprocess(InferenceResult& infRes
         }
     }
 
-    return std::unique_ptr<ResultBase>(result);
+    scene->detection_result = std::move(result);
+
+    return scene;
 }
diff --git a/src/cpp/models/src/detection_model_yolox.cpp b/src/cpp/models/src/detection_model_yolox.cpp
index 3c4df1fe..421e4e32 100644
--- a/src/cpp/models/src/detection_model_yolox.cpp
+++ b/src/cpp/models/src/detection_model_yolox.cpp
@@ -126,7 +126,7 @@ std::shared_ptr<InternalModelData> ModelYoloX::preprocess(const InputData& input
     return std::make_shared<InternalScaleData>(origImg.cols, origImg.rows, scale, scale);
 }
 
-std::unique_ptr<ResultBase> ModelYoloX::postprocess(InferenceResult& infResult) {
+std::unique_ptr<Scene> ModelYoloX::postprocess(InferenceResult& infResult) {
     // Get metadata about input image shape and scale
     const auto& scale = infResult.internalModelData->asRef<InternalScaleData>();
 
@@ -136,7 +136,8 @@ std::unique_ptr<ResultBase> ModelYoloX::postprocess(InferenceResult& infResult)
     float* outputPtr = output.data<float>();
 
     // Generate detection results
-    DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+    auto result = std::make_unique<DetectionResult>(infResult.frameId, infResult.metaData);
 
     // Update coordinates according to strides
     for (size_t box_index = 0; box_index < expandedStrides.size(); ++box_index) {
@@ -200,5 +201,6 @@ std::unique_ptr<ResultBase> ModelYoloX::postprocess(InferenceResult& infResult)
         result->objects.push_back(obj);
     }
 
-    return std::unique_ptr<ResultBase>(result);
+    scene->detection_result = std::move(result);
+    return scene;
 }
diff --git a/src/cpp/models/src/instance_segmentation.cpp b/src/cpp/models/src/instance_segmentation.cpp
index 384fb057..5af1bf3a 100644
--- a/src/cpp/models/src/instance_segmentation.cpp
+++ b/src/cpp/models/src/instance_segmentation.cpp
@@ -281,7 +281,7 @@ void MaskRCNNModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
     append_xai_names(model->outputs(), outputNames);
 }
 
-std::unique_ptr<ResultBase> MaskRCNNModel::postprocess(InferenceResult& infResult) {
+std::unique_ptr<Scene> MaskRCNNModel::postprocess(InferenceResult& infResult) {
     const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
     float floatInputImgWidth = float(internalData.inputImgWidth),
           floatInputImgHeight = float(internalData.inputImgHeight);
@@ -300,8 +300,10 @@ std::unique_ptr<ResultBase> MaskRCNNModel::postprocess(InferenceResult& infResul
     size_t objectSize = lbm.boxes.get_shape().back();
     float* const masks = lbm.masks.data<float>();
     const cv::Size& masks_size{int(lbm.masks.get_shape()[3]), int(lbm.masks.get_shape()[2])};
-    InstanceSegmentationResult* result = new InstanceSegmentationResult(infResult.frameId, infResult.metaData);
-    auto retVal = std::unique_ptr<ResultBase>(result);
+
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+    auto result = std::make_unique<InstanceSegmentationResult>(infResult.frameId, infResult.metaData);
+
     std::vector<std::vector<cv::Mat>> saliency_maps;
     bool has_feature_vector_name =
         std::find(outputNames.begin(), outputNames.end(), feature_vector_name) != outputNames.end();
@@ -355,21 +357,16 @@ std::unique_ptr<ResultBase> MaskRCNNModel::postprocess(InferenceResult& infResul
     if (has_feature_vector_name) {
         result->feature_vector = std::move(infResult.outputsData[feature_vector_name]);
     }
-    return retVal;
+
+    scene->instance_segmentation_result = std::move(result);
+    return scene;
 }
 
-std::unique_ptr<InstanceSegmentationResult> MaskRCNNModel::infer(const ImageInputData& inputData) {
-    auto result = BaseModel::inferImage(inputData);
-    return std::unique_ptr<InstanceSegmentationResult>(static_cast<InstanceSegmentationResult*>(result.release()));
+std::unique_ptr<Scene> MaskRCNNModel::infer(const ImageInputData& inputData) {
+    return BaseModel::inferImage(inputData);
 }
 
-std::vector<std::unique_ptr<InstanceSegmentationResult>> MaskRCNNModel::inferBatch(
+std::vector<std::unique_ptr<Scene>> MaskRCNNModel::inferBatch(
     const std::vector<ImageInputData>& inputImgs) {
-    auto results = BaseModel::inferBatchImage(inputImgs);
-    std::vector<std::unique_ptr<InstanceSegmentationResult>> isegResults;
-    isegResults.reserve(results.size());
-    for (auto& result : results) {
-        isegResults.emplace_back(static_cast<InstanceSegmentationResult*>(result.release()));
-    }
-    return isegResults;
+    return BaseModel::inferBatchImage(inputImgs);
 }
diff --git a/src/cpp/models/src/keypoint_detection.cpp b/src/cpp/models/src/keypoint_detection.cpp
index 4fbe778c..9554cbb4 100644
--- a/src/cpp/models/src/keypoint_detection.cpp
+++ b/src/cpp/models/src/keypoint_detection.cpp
@@ -207,8 +207,9 @@ void KeypointDetectionModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& mo
     }
 }
 
-std::unique_ptr<ResultBase> KeypointDetectionModel::postprocess(InferenceResult& infResult) {
-    KeypointDetectionResult* result = new KeypointDetectionResult(infResult.frameId, infResult.metaData);
+std::unique_ptr<Scene> KeypointDetectionModel::postprocess(InferenceResult& infResult) {
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+    auto result = std::make_unique<KeypointDetectionResult>(infResult.frameId, infResult.metaData);
 
     const ov::Tensor& pred_x_tensor = infResult.outputsData.find(outputNames[0])->second;
     size_t shape_offset = pred_x_tensor.get_shape().size() == 3 ? 1 : 0;
@@ -246,21 +247,15 @@ std::unique_ptr<ResultBase> KeypointDetectionModel::postprocess(InferenceResult&
     result->poses.emplace_back(
         decode_simcc(pred_x_mat, pred_y_mat, {inverted_scale_x, inverted_scale_y}, {pad_left, pad_top}, apply_softmax));
 
-    return std::unique_ptr<ResultBase>(result);
+    scene->keypoint_detection_result = std::move(result);
+    return scene;
 }
 
-std::unique_ptr<KeypointDetectionResult> KeypointDetectionModel::infer(const ImageInputData& inputData) {
-    auto result = BaseModel::inferImage(inputData);
-    return std::unique_ptr<KeypointDetectionResult>(static_cast<KeypointDetectionResult*>(result.release()));
+std::unique_ptr<Scene> KeypointDetectionModel::infer(const ImageInputData& inputData) {
+    return BaseModel::inferImage(inputData);
 }
 
-std::vector<std::unique_ptr<KeypointDetectionResult>> KeypointDetectionModel::inferBatch(
+std::vector<std::unique_ptr<Scene>> KeypointDetectionModel::inferBatch(
     const std::vector<ImageInputData>& inputImgs) {
-    auto results = BaseModel::inferBatchImage(inputImgs);
-    std::vector<std::unique_ptr<KeypointDetectionResult>> kpDetResults;
-    kpDetResults.reserve(results.size());
-    for (auto& result : results) {
-        kpDetResults.emplace_back(static_cast<KeypointDetectionResult*>(result.release()));
-    }
-    return kpDetResults;
+    return BaseModel::inferBatchImage(inputImgs);
 }
diff --git a/src/cpp/models/src/segmentation_model.cpp b/src/cpp/models/src/segmentation_model.cpp
index deea1c87..19ad5158 100644
--- a/src/cpp/models/src/segmentation_model.cpp
+++ b/src/cpp/models/src/segmentation_model.cpp
@@ -219,7 +219,7 @@ void SegmentationModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model)
     }
 }
 
-std::unique_ptr<ResultBase> SegmentationModel::postprocess(InferenceResult& infResult) {
+std::unique_ptr<Scene> SegmentationModel::postprocess(InferenceResult& infResult) {
     const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>();
     const auto& outputName = outputNames[0] == feature_vector_name ? outputNames[1] : outputNames[0];
     const auto& outTensor = infResult.outputsData[outputName];
@@ -259,6 +259,7 @@ std::unique_ptr<ResultBase> SegmentationModel::postprocess(InferenceResult& infR
                0.0,
                cv::INTER_NEAREST);
 
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
     if (return_soft_prediction) {
         ImageResultWithSoftPrediction* result =
             new ImageResultWithSoftPrediction(infResult.frameId, infResult.metaData);
@@ -275,12 +276,13 @@ std::unique_ptr<ResultBase> SegmentationModel::postprocess(InferenceResult& infR
             result->saliency_map = get_activation_map(soft_prediction);
             result->feature_vector = iter->second;
         }
-        return std::unique_ptr<ResultBase>(result);
+        scene->image_result = std::unique_ptr<ImageResult>(result);
+    } else {
+        auto result = std::make_unique<ImageResult>(infResult.frameId, infResult.metaData);
+        result->resultImage = hard_prediction;
+        scene->image_result = std::move(result);
     }
-
-    ImageResult* result = new ImageResult(infResult.frameId, infResult.metaData);
-    result->resultImage = hard_prediction;
-    return std::unique_ptr<ResultBase>(result);
+    return scene;
 }
 
 std::vector<Contour> SegmentationModel::getContours(const ImageResultWithSoftPrediction& imageResult) {
@@ -315,17 +317,10 @@ std::vector<Contour> SegmentationModel::getContours(const ImageResultWithSoftPre
     return combined_contours;
 }
 
-std::unique_ptr<ImageResult> SegmentationModel::infer(const ImageInputData& inputData) {
-    auto result = BaseModel::inferImage(inputData);
-    return std::unique_ptr<ImageResult>(static_cast<ImageResult*>(result.release()));
+std::unique_ptr<Scene> SegmentationModel::infer(const ImageInputData& inputData) {
+    return BaseModel::inferImage(inputData);
 }
 
-std::vector<std::unique_ptr<ImageResult>> SegmentationModel::inferBatch(const std::vector<ImageInputData>& inputImgs) {
-    auto results = BaseModel::inferBatchImage(inputImgs);
-    std::vector<std::unique_ptr<ImageResult>> segResults;
-    segResults.reserve(results.size());
-    for (auto& result : results) {
-        segResults.emplace_back(static_cast<ImageResult*>(result.release()));
-    }
-    return segResults;
+std::vector<std::unique_ptr<Scene>> SegmentationModel::inferBatch(const std::vector<ImageInputData>& inputImgs) {
+    return BaseModel::inferBatchImage(inputImgs);
 }
diff --git a/src/cpp/tilers/include/tilers/detection.h b/src/cpp/tilers/include/tilers/detection.h
index 8fde112b..07525681 100644
--- a/src/cpp/tilers/include/tilers/detection.h
+++ b/src/cpp/tilers/include/tilers/detection.h
@@ -15,14 +15,14 @@ class DetectionTiler : public TilerBase {
                    ExecutionMode exec_mode = ExecutionMode::sync);
     virtual ~DetectionTiler() = default;
 
-    virtual std::unique_ptr<DetectionResult> run(const ImageInputData& inputData);
+    virtual std::unique_ptr<Scene> run(const ImageInputData& inputData);
 
 protected:
-    virtual std::unique_ptr<ResultBase> postprocess_tile(std::unique_ptr<ResultBase>, const cv::Rect&);
-    virtual std::unique_ptr<ResultBase> merge_results(const std::vector<std::unique_ptr<ResultBase>>&,
+    virtual std::unique_ptr<Scene> postprocess_tile(std::unique_ptr<Scene>, const cv::Rect&);
+    virtual std::unique_ptr<Scene> merge_results(const std::vector<std::unique_ptr<Scene>>&,
                                                       const cv::Size&,
                                                       const std::vector<cv::Rect>&);
-    ov::Tensor merge_saliency_maps(const std::vector<std::unique_ptr<ResultBase>>&,
+    ov::Tensor merge_saliency_maps(const std::vector<std::unique_ptr<Scene>>&,
                                    const cv::Size&,
                                    const std::vector<cv::Rect>&);
 
diff --git a/src/cpp/tilers/include/tilers/instance_segmentation.h b/src/cpp/tilers/include/tilers/instance_segmentation.h
index 3ca20dcb..43574e82 100644
--- a/src/cpp/tilers/include/tilers/instance_segmentation.h
+++ b/src/cpp/tilers/include/tilers/instance_segmentation.h
@@ -14,17 +14,17 @@ class InstanceSegmentationTiler : public TilerBase {
     InstanceSegmentationTiler(std::shared_ptr<BaseModel> model,
                               const ov::AnyMap& configuration,
                               ExecutionMode exec_mode = ExecutionMode::sync);
-    virtual std::unique_ptr<InstanceSegmentationResult> run(const ImageInputData& inputData);
+    virtual std::unique_ptr<Scene> run(const ImageInputData& inputData);
     virtual ~InstanceSegmentationTiler() = default;
     bool postprocess_semantic_masks = true;
 
 protected:
-    virtual std::unique_ptr<ResultBase> postprocess_tile(std::unique_ptr<ResultBase>, const cv::Rect&);
-    virtual std::unique_ptr<ResultBase> merge_results(const std::vector<std::unique_ptr<ResultBase>>&,
+    virtual std::unique_ptr<Scene> postprocess_tile(std::unique_ptr<Scene>, const cv::Rect&);
+    virtual std::unique_ptr<Scene> merge_results(const std::vector<std::unique_ptr<Scene>>&,
                                                       const cv::Size&,
                                                       const std::vector<cv::Rect>&);
 
-    std::vector<cv::Mat_<std::uint8_t>> merge_saliency_maps(const std::vector<std::unique_ptr<ResultBase>>&,
+    std::vector<cv::Mat_<std::uint8_t>> merge_saliency_maps(const std::vector<std::unique_ptr<Scene>>&,
                                                             const cv::Size&,
                                                             const std::vector<cv::Rect>&);
 
diff --git a/src/cpp/tilers/include/tilers/semantic_segmentation.h b/src/cpp/tilers/include/tilers/semantic_segmentation.h
index 4c9b9d1d..a9e3a951 100644
--- a/src/cpp/tilers/include/tilers/semantic_segmentation.h
+++ b/src/cpp/tilers/include/tilers/semantic_segmentation.h
@@ -14,12 +14,12 @@ class SemanticSegmentationTiler : public TilerBase {
     SemanticSegmentationTiler(std::shared_ptr<BaseModel> model,
                               const ov::AnyMap& configuration,
                               ExecutionMode exec_mode = ExecutionMode::sync);
-    virtual std::unique_ptr<ImageResultWithSoftPrediction> run(const ImageInputData& inputData);
+    virtual std::unique_ptr<Scene> run(const ImageInputData& inputData);
     virtual ~SemanticSegmentationTiler() = default;
 
 protected:
-    virtual std::unique_ptr<ResultBase> postprocess_tile(std::unique_ptr<ResultBase>, const cv::Rect&);
-    virtual std::unique_ptr<ResultBase> merge_results(const std::vector<std::unique_ptr<ResultBase>>&,
+    virtual std::unique_ptr<Scene> postprocess_tile(std::unique_ptr<Scene>, const cv::Rect&);
+    virtual std::unique_ptr<Scene> merge_results(const std::vector<std::unique_ptr<Scene>>&,
                                                       const cv::Size&,
                                                       const std::vector<cv::Rect>&);
 
diff --git a/src/cpp/tilers/include/tilers/tiler_base.h b/src/cpp/tilers/include/tilers/tiler_base.h
index 3fb45d1e..0df812c8 100644
--- a/src/cpp/tilers/include/tilers/tiler_base.h
+++ b/src/cpp/tilers/include/tilers/tiler_base.h
@@ -27,14 +27,14 @@ class TilerBase {
     virtual ~TilerBase() = default;
 
 protected:
-    virtual std::unique_ptr<ResultBase> run_impl(const ImageInputData& inputData);
+    virtual std::unique_ptr<Scene> run_impl(const ImageInputData& inputData);
     std::vector<cv::Rect> tile(const cv::Size&);
     std::vector<cv::Rect> filter_tiles(const cv::Mat&, const std::vector<cv::Rect>&);
-    std::unique_ptr<ResultBase> predict_sync(const cv::Mat&, const std::vector<cv::Rect>&);
-    std::unique_ptr<ResultBase> predict_async(const cv::Mat&, const std::vector<cv::Rect>&);
+    std::unique_ptr<Scene> predict_sync(const cv::Mat&, const std::vector<cv::Rect>&);
+    std::unique_ptr<Scene> predict_async(const cv::Mat&, const std::vector<cv::Rect>&);
     cv::Mat crop_tile(const cv::Mat&, const cv::Rect&);
-    virtual std::unique_ptr<ResultBase> postprocess_tile(std::unique_ptr<ResultBase>, const cv::Rect&) = 0;
-    virtual std::unique_ptr<ResultBase> merge_results(const std::vector<std::unique_ptr<ResultBase>>&,
+    virtual std::unique_ptr<Scene> postprocess_tile(std::unique_ptr<Scene>, const cv::Rect&) = 0;
+    virtual std::unique_ptr<Scene> merge_results(const std::vector<std::unique_ptr<Scene>>&,
                                                       const cv::Size&,
                                                       const std::vector<cv::Rect>&) = 0;
 
diff --git a/src/cpp/tilers/src/detection.cpp b/src/cpp/tilers/src/detection.cpp
index ec248664..72052e39 100644
--- a/src/cpp/tilers/src/detection.cpp
+++ b/src/cpp/tilers/src/detection.cpp
@@ -42,9 +42,9 @@ DetectionTiler::DetectionTiler(const std::shared_ptr<BaseModel>& _model,
     max_pred_number = get_from_any_maps("max_pred_number", configuration, extra_config, max_pred_number);
 }
 
-std::unique_ptr<ResultBase> DetectionTiler::postprocess_tile(std::unique_ptr<ResultBase> tile_result,
+std::unique_ptr<Scene> DetectionTiler::postprocess_tile(std::unique_ptr<Scene> tile_result,
                                                              const cv::Rect& coord) {
-    DetectionResult* det_res = static_cast<DetectionResult*>(tile_result.get());
+    auto& det_res = tile_result->detection_result;
     for (auto& det : det_res->objects) {
         det.x += coord.x;
         det.y += coord.y;
@@ -66,19 +66,18 @@ std::unique_ptr<ResultBase> DetectionTiler::postprocess_tile(std::unique_ptr<Res
     return tile_result;
 }
 
-std::unique_ptr<ResultBase> DetectionTiler::merge_results(const std::vector<std::unique_ptr<ResultBase>>& tiles_results,
+std::unique_ptr<Scene> DetectionTiler::merge_results(const std::vector<std::unique_ptr<Scene>>& tiles_results,
                                                           const cv::Size& image_size,
                                                           const std::vector<cv::Rect>& tile_coords) {
-    DetectionResult* result = new DetectionResult();
-    auto retVal = std::unique_ptr<ResultBase>(result);
+    auto result = std::make_unique<DetectionResult>();
+    auto scene = std::make_unique<Scene>();
 
     std::vector<AnchorLabeled> all_detections;
     std::vector<std::reference_wrapper<DetectedObject>> all_detections_refs;
     std::vector<float> all_scores;
 
     for (const auto& result : tiles_results) {
-        DetectionResult* det_res = static_cast<DetectionResult*>(result.get());
-        for (auto& det : det_res->objects) {
+        for (auto& det : result->detection_result->objects) {
             all_detections.emplace_back(det.x, det.y, det.x + det.width, det.y + det.height, det.labelID);
             all_scores.push_back(det.confidence);
             all_detections_refs.push_back(det);
@@ -93,7 +92,7 @@ std::unique_ptr<ResultBase> DetectionTiler::merge_results(const std::vector<std:
     }
 
     if (tiles_results.size()) {
-        DetectionResult* det_res = static_cast<DetectionResult*>(tiles_results.begin()->get());
+        auto& det_res = tiles_results.begin()->get()->detection_result;
         if (det_res->feature_vector) {
             result->feature_vector =
                 ov::Tensor(det_res->feature_vector.get_element_type(), det_res->feature_vector.get_shape());
@@ -110,8 +109,7 @@ std::unique_ptr<ResultBase> DetectionTiler::merge_results(const std::vector<std:
         std::fill(feature_ptr, feature_ptr + feature_size, 0.f);
 
         for (const auto& result : tiles_results) {
-            DetectionResult* det_res = static_cast<DetectionResult*>(result.get());
-            const float* current_feature_ptr = det_res->feature_vector.data<float>();
+            const float* current_feature_ptr = result->detection_result->feature_vector.data<float>();
 
             for (size_t i = 0; i < feature_size; ++i) {
                 feature_ptr[i] += current_feature_ptr[i];
@@ -123,17 +121,18 @@ std::unique_ptr<ResultBase> DetectionTiler::merge_results(const std::vector<std:
         }
     }
 
-    return retVal;
+    scene->detection_result = std::move(result);
+
+    return scene;
 }
 
-ov::Tensor DetectionTiler::merge_saliency_maps(const std::vector<std::unique_ptr<ResultBase>>& tiles_results,
+ov::Tensor DetectionTiler::merge_saliency_maps(const std::vector<std::unique_ptr<Scene>>& tiles_results,
                                                const cv::Size& image_size,
                                                const std::vector<cv::Rect>& tile_coords) {
     std::vector<ov::Tensor> all_saliency_maps;
     all_saliency_maps.reserve(tiles_results.size());
     for (const auto& result : tiles_results) {
-        auto det_res = static_cast<DetectionResult*>(result.get());
-        all_saliency_maps.push_back(det_res->saliency_map);
+        all_saliency_maps.push_back(result->detection_result->saliency_map);
     }
 
     ov::Tensor image_saliency_map;
@@ -219,7 +218,6 @@ ov::Tensor DetectionTiler::merge_saliency_maps(const std::vector<std::unique_ptr
     return merged_map;
 }
 
-std::unique_ptr<DetectionResult> DetectionTiler::run(const ImageInputData& inputData) {
-    auto result = this->run_impl(inputData);
-    return std::unique_ptr<DetectionResult>(static_cast<DetectionResult*>(result.release()));
+std::unique_ptr<Scene> DetectionTiler::run(const ImageInputData& inputData) {
+    return this->run_impl(inputData);
 }
diff --git a/src/cpp/tilers/src/instance_segmentation.cpp b/src/cpp/tilers/src/instance_segmentation.cpp
index 211a4761..27454d96 100644
--- a/src/cpp/tilers/src/instance_segmentation.cpp
+++ b/src/cpp/tilers/src/instance_segmentation.cpp
@@ -49,15 +49,14 @@ InstanceSegmentationTiler::InstanceSegmentationTiler(std::shared_ptr<BaseModel>
     max_pred_number = get_from_any_maps("max_pred_number", configuration, extra_config, max_pred_number);
 }
 
-std::unique_ptr<InstanceSegmentationResult> InstanceSegmentationTiler::run(const ImageInputData& inputData) {
+std::unique_ptr<Scene> InstanceSegmentationTiler::run(const ImageInputData& inputData) {
     auto setter = MaskRCNNModelParamsSetter(model);
-    auto result = this->run_impl(inputData);
-    return std::unique_ptr<InstanceSegmentationResult>(static_cast<InstanceSegmentationResult*>(result.release()));
+    return this->run_impl(inputData);
 }
 
-std::unique_ptr<ResultBase> InstanceSegmentationTiler::postprocess_tile(std::unique_ptr<ResultBase> tile_result,
+std::unique_ptr<Scene> InstanceSegmentationTiler::postprocess_tile(std::unique_ptr<Scene> tile_result,
                                                                         const cv::Rect& coord) {
-    auto* iseg_res = static_cast<InstanceSegmentationResult*>(tile_result.get());
+    auto& iseg_res = tile_result->instance_segmentation_result;
     for (auto& det : iseg_res->segmentedObjects) {
         det.x += coord.x;
         det.y += coord.y;
@@ -73,20 +72,19 @@ std::unique_ptr<ResultBase> InstanceSegmentationTiler::postprocess_tile(std::uni
     return tile_result;
 }
 
-std::unique_ptr<ResultBase> InstanceSegmentationTiler::merge_results(
-    const std::vector<std::unique_ptr<ResultBase>>& tiles_results,
+std::unique_ptr<Scene> InstanceSegmentationTiler::merge_results(
+    const std::vector<std::unique_ptr<Scene>>& tiles_results,
     const cv::Size& image_size,
     const std::vector<cv::Rect>& tile_coords) {
-    auto* result = new InstanceSegmentationResult();
-    auto retVal = std::unique_ptr<ResultBase>(result);
+    auto scene = std::make_unique<Scene>();
+    auto result = std::make_unique<InstanceSegmentationResult>();
 
     std::vector<AnchorLabeled> all_detections;
     std::vector<std::reference_wrapper<SegmentedObject>> all_detections_ptrs;
     std::vector<float> all_scores;
 
     for (const auto& result : tiles_results) {
-        auto* iseg_res = static_cast<InstanceSegmentationResult*>(result.get());
-        for (auto& det : iseg_res->segmentedObjects) {
+        for (auto& det : result->instance_segmentation_result->segmentedObjects) {
             all_detections.emplace_back(det.x, det.y, det.x + det.width, det.y + det.height, det.labelID);
             all_scores.push_back(det.confidence);
             all_detections_ptrs.push_back(det);
@@ -107,7 +105,7 @@ std::unique_ptr<ResultBase> InstanceSegmentationTiler::merge_results(
     }
 
     if (tiles_results.size()) {
-        auto* iseg_res = static_cast<InstanceSegmentationResult*>(tiles_results.begin()->get());
+        auto& iseg_res = tiles_results.begin()->get()->instance_segmentation_result;
         if (iseg_res->feature_vector) {
             result->feature_vector =
                 ov::Tensor(iseg_res->feature_vector.get_element_type(), iseg_res->feature_vector.get_shape());
@@ -121,7 +119,7 @@ std::unique_ptr<ResultBase> InstanceSegmentationTiler::merge_results(
         std::fill(feature_ptr, feature_ptr + feature_size, 0.f);
 
         for (const auto& result : tiles_results) {
-            auto* iseg_res = static_cast<InstanceSegmentationResult*>(result.get());
+            auto& iseg_res = result->instance_segmentation_result;
             const float* current_feature_ptr = iseg_res->feature_vector.data<float>();
 
             for (size_t i = 0; i < feature_size; ++i) {
@@ -136,17 +134,18 @@ std::unique_ptr<ResultBase> InstanceSegmentationTiler::merge_results(
 
     result->saliency_map = merge_saliency_maps(tiles_results, image_size, tile_coords);
 
-    return retVal;
+    scene->instance_segmentation_result = std::move(result);
+    return scene;
 }
 
 std::vector<cv::Mat_<std::uint8_t>> InstanceSegmentationTiler::merge_saliency_maps(
-    const std::vector<std::unique_ptr<ResultBase>>& tiles_results,
+    const std::vector<std::unique_ptr<Scene>>& tiles_results,
     const cv::Size& image_size,
     const std::vector<cv::Rect>& tile_coords) {
     std::vector<std::vector<cv::Mat_<std::uint8_t>>> all_saliecy_maps;
     all_saliecy_maps.reserve(tiles_results.size());
     for (const auto& result : tiles_results) {
-        auto det_res = static_cast<InstanceSegmentationResult*>(result.get());
+        auto& det_res = result->instance_segmentation_result;
         all_saliecy_maps.push_back(det_res->saliency_map);
     }
 
diff --git a/src/cpp/tilers/src/semantic_segmentation.cpp b/src/cpp/tilers/src/semantic_segmentation.cpp
index 6a8efc89..ac7ff5b7 100644
--- a/src/cpp/tilers/src/semantic_segmentation.cpp
+++ b/src/cpp/tilers/src/semantic_segmentation.cpp
@@ -50,15 +50,13 @@ SemanticSegmentationTiler::SemanticSegmentationTiler(std::shared_ptr<BaseModel>
         get_from_any_maps("return_soft_prediction", configuration, extra_config, return_soft_prediction);
 }
 
-std::unique_ptr<ImageResultWithSoftPrediction> SemanticSegmentationTiler::run(const ImageInputData& inputData) {
-    auto result = this->run_impl(inputData);
-    return std::unique_ptr<ImageResultWithSoftPrediction>(
-        static_cast<ImageResultWithSoftPrediction*>(result.release()));
+std::unique_ptr<Scene> SemanticSegmentationTiler::run(const ImageInputData& inputData) {
+    return this->run_impl(inputData);
 }
 
-std::unique_ptr<ResultBase> SemanticSegmentationTiler::postprocess_tile(std::unique_ptr<ResultBase> tile_result,
+std::unique_ptr<Scene> SemanticSegmentationTiler::postprocess_tile(std::unique_ptr<Scene> tile_result,
                                                                         const cv::Rect&) {
-    ImageResultWithSoftPrediction* soft = dynamic_cast<ImageResultWithSoftPrediction*>(tile_result.get());
+    ImageResultWithSoftPrediction* soft = dynamic_cast<ImageResultWithSoftPrediction*>(tile_result->image_result.get());
     if (!soft) {
         throw std::runtime_error(
             "SemanticSegmentationTiler requires the underlying model to return ImageResultWithSoftPrediction");
@@ -66,22 +64,22 @@ std::unique_ptr<ResultBase> SemanticSegmentationTiler::postprocess_tile(std::uni
     return tile_result;
 }
 
-std::unique_ptr<ResultBase> SemanticSegmentationTiler::merge_results(
-    const std::vector<std::unique_ptr<ResultBase>>& tiles_results,
+std::unique_ptr<Scene> SemanticSegmentationTiler::merge_results(
+    const std::vector<std::unique_ptr<Scene>>& tiles_results,
     const cv::Size& image_size,
     const std::vector<cv::Rect>& tile_coords) {
     if (tiles_results.empty()) {
-        return std::unique_ptr<ResultBase>(new ImageResultWithSoftPrediction());
+        return std::make_unique<Scene>();
     }
 
     cv::Mat voting_mask(cv::Size(image_size.width, image_size.height), CV_32SC1, cv::Scalar(0));
-    auto* sseg_res = static_cast<ImageResultWithSoftPrediction*>(tiles_results[0].get());
+    auto* sseg_res = static_cast<ImageResultWithSoftPrediction*>(tiles_results[0]->image_result.get());
     cv::Mat merged_soft_prediction(cv::Size(image_size.width, image_size.height),
                                    CV_32FC(sseg_res->soft_prediction.channels()),
                                    cv::Scalar(0));
 
     for (size_t i = 0; i < tiles_results.size(); ++i) {
-        auto* sseg_res = static_cast<ImageResultWithSoftPrediction*>(tiles_results[i].get());
+        auto* sseg_res = static_cast<ImageResultWithSoftPrediction*>(tiles_results[i]->image_result.get());
         voting_mask(tile_coords[i]) += 1;
         merged_soft_prediction(tile_coords[i]) += sseg_res->soft_prediction;
     }
@@ -91,16 +89,12 @@ std::unique_ptr<ResultBase> SemanticSegmentationTiler::merge_results(
     cv::Mat hard_prediction =
         create_hard_prediction_from_soft_prediction(merged_soft_prediction, soft_threshold, blur_strength);
 
-    std::unique_ptr<ResultBase> retVal;
+    auto scene = std::make_unique<Scene>();
+    auto result = std::make_unique<ImageResultWithSoftPrediction>();
+    result->resultImage = hard_prediction;
     if (return_soft_prediction) {
-        auto* result = new ImageResultWithSoftPrediction();
-        retVal = std::unique_ptr<ResultBase>(result);
         result->soft_prediction = merged_soft_prediction;
-        result->resultImage = hard_prediction;
-    } else {
-        auto* result = new ImageResult();
-        retVal = std::unique_ptr<ResultBase>(result);
-        result->resultImage = hard_prediction;
     }
-    return retVal;
+    scene->image_result = std::move(result);
+    return scene;
 }
diff --git a/src/cpp/tilers/src/tiler_base.cpp b/src/cpp/tilers/src/tiler_base.cpp
index 6d979dea..e56ed226 100644
--- a/src/cpp/tilers/src/tiler_base.cpp
+++ b/src/cpp/tilers/src/tiler_base.cpp
@@ -68,8 +68,8 @@ std::vector<cv::Rect> TilerBase::filter_tiles(const cv::Mat&, const std::vector<
     return coords;
 }
 
-std::unique_ptr<ResultBase> TilerBase::predict_sync(const cv::Mat& image, const std::vector<cv::Rect>& tile_coords) {
-    std::vector<std::unique_ptr<ResultBase>> tile_results;
+std::unique_ptr<Scene> TilerBase::predict_sync(const cv::Mat& image, const std::vector<cv::Rect>& tile_coords) {
+    std::vector<std::unique_ptr<Scene>> tile_results;
 
     for (const auto& coord : tile_coords) {
         auto tile_img = crop_tile(image, coord);
@@ -81,7 +81,7 @@ std::unique_ptr<ResultBase> TilerBase::predict_sync(const cv::Mat& image, const
     return merge_results(tile_results, image.size(), tile_coords);
 }
 
-std::unique_ptr<ResultBase> TilerBase::predict_async(const cv::Mat& image, const std::vector<cv::Rect>& tile_coords) {
+std::unique_ptr<Scene> TilerBase::predict_async(const cv::Mat& image, const std::vector<cv::Rect>& tile_coords) {
     std::vector<ImageInputData> input_data;
 
     input_data.reserve(tile_coords.size());
@@ -90,7 +90,7 @@ std::unique_ptr<ResultBase> TilerBase::predict_async(const cv::Mat& image, const
         input_data.push_back(ImageInputData(tile_img.clone()));
     }
 
-    std::vector<std::unique_ptr<ResultBase>> tile_results;
+    std::vector<std::unique_ptr<Scene>> tile_results;
     auto tile_predictions = model->inferBatchImage(input_data);
     for (size_t i = 0; i < tile_predictions.size(); ++i) {
         auto tile_result = postprocess_tile(std::move(tile_predictions[i]), tile_coords[i]);
@@ -103,7 +103,7 @@ cv::Mat TilerBase::crop_tile(const cv::Mat& image, const cv::Rect& coord) {
     return cv::Mat(image, coord);
 }
 
-std::unique_ptr<ResultBase> TilerBase::run_impl(const ImageInputData& inputData) {
+std::unique_ptr<Scene> TilerBase::run_impl(const ImageInputData& inputData) {
     auto& image = inputData.inputImage;
     auto tile_coords = tile(image.size());
     tile_coords = filter_tiles(image, tile_coords);
diff --git a/tests/cpp/accuracy/test_YOLOv8.cpp b/tests/cpp/accuracy/test_YOLOv8.cpp
index c4ee90bd..881812f9 100644
--- a/tests/cpp/accuracy/test_YOLOv8.cpp
+++ b/tests/cpp/accuracy/test_YOLOv8.cpp
@@ -60,7 +60,7 @@ TEST_P(AccuracySuit, TestDetector) {
     EXPECT_EQ(ss.str(),
               string{*cached_model(param.model_name)
                           ->infer(cv::imread(data() + "/coco128/images/train2017/" + param.refpath.stem().string() +
-                                             ".jpg"))});
+                                             ".jpg"))->detection_result});
 }
 
 INSTANTIATE_TEST_SUITE_P(YOLOv8, AccuracySuit, testing::ValuesIn([] {
diff --git a/tests/cpp/accuracy/test_accuracy.cpp b/tests/cpp/accuracy/test_accuracy.cpp
index 615ebc43..ed5a3a17 100644
--- a/tests/cpp/accuracy/test_accuracy.cpp
+++ b/tests/cpp/accuracy/test_accuracy.cpp
@@ -159,7 +159,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         throw std::runtime_error{"Failed to read the image"};
                     }
 
-                    std::unique_ptr<DetectionResult> result;
+                    std::unique_ptr<Scene> result;
                     if (modelData.tiler == "DetectionTiler") {
                         auto tiler = DetectionTiler(std::move(model), {});
                         if (modelData.input_res.height > 0 && modelData.input_res.width > 0) {
@@ -169,7 +169,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                     } else {
                         result = model->infer(image);
                     }
-                    EXPECT_EQ(std::string{*result}, modelData.testData[i].reference[0]);
+                    EXPECT_EQ(std::string{*result->detection_result}, modelData.testData[i].reference[0]);
                 }
             }
         } else if (modelData.type == "ClassificationModel") {
@@ -183,7 +183,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         throw std::runtime_error{"Failed to read the image"};
                     }
                     auto result = model->infer(image);
-                    EXPECT_EQ(std::string{*result}, modelData.testData[i].reference[0]);
+                    EXPECT_EQ(std::string{*result->classification_result}, modelData.testData[i].reference[0]);
                 }
             }
         } else if (modelData.type == "SegmentationModel") {
@@ -197,7 +197,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         throw std::runtime_error{"Failed to read the image"};
                     }
 
-                    std::unique_ptr<ImageResult> pred;
+                    std::unique_ptr<Scene> pred;
                     if (modelData.tiler == "SemanticSegmentationTiler") {
                         auto tiler = SemanticSegmentationTiler(std::move(model), {});
                         if (modelData.input_res.height > 0 && modelData.input_res.width > 0) {
@@ -208,7 +208,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         pred = model->infer(image);
                     }
 
-                    ImageResultWithSoftPrediction* soft = dynamic_cast<ImageResultWithSoftPrediction*>(pred.get());
+                    ImageResultWithSoftPrediction* soft = dynamic_cast<ImageResultWithSoftPrediction*>(pred->image_result.get());
                     if (soft) {
                         const std::vector<Contour>& contours = model->getContours(*soft);
                         std::stringstream ss;
@@ -218,7 +218,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         }
                         ASSERT_EQ(ss.str(), modelData.testData[i].reference[0]);
                     } else {
-                        ASSERT_EQ(std::string{*pred}, modelData.testData[i].reference[0]);
+                        ASSERT_EQ(std::string{*pred->image_result}, modelData.testData[i].reference[0]);
                     }
                 }
             }
@@ -233,7 +233,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         throw std::runtime_error{"Failed to read the image"};
                     }
 
-                    std::unique_ptr<InstanceSegmentationResult> result;
+                    std::unique_ptr<Scene> result;
                     if (modelData.tiler == "InstanceSegmentationTiler") {
                         auto tiler = InstanceSegmentationTiler(std::move(model), {});
                         if (modelData.input_res.height > 0 && modelData.input_res.width > 0) {
@@ -245,20 +245,20 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                     }
 
                     const std::vector<SegmentedObjectWithRects>& withRects =
-                        add_rotated_rects(result->segmentedObjects);
+                        add_rotated_rects(result->instance_segmentation_result->segmentedObjects);
                     std::stringstream ss;
                     for (const SegmentedObjectWithRects& obj : withRects) {
                         ss << obj << "; ";
                     }
                     size_t filled = 0;
-                    for (const cv::Mat_<std::uint8_t>& cls_map : result->saliency_map) {
+                    for (const cv::Mat_<std::uint8_t>& cls_map : result->instance_segmentation_result->saliency_map) {
                         if (cls_map.data) {
                             ++filled;
                         }
                     }
                     ss << filled << "; ";
                     try {
-                        ss << result->feature_vector.get_shape();
+                        ss << result->instance_segmentation_result->feature_vector.get_shape();
                     } catch (ov::Exception&) {
                         ss << "[0]";
                     }
@@ -266,7 +266,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                     try {
                         // getContours() assumes each instance generates only one contour.
                         // That doesn't hold for some models
-                        for (const Contour& contour : getContours(result->segmentedObjects)) {
+                        for (const Contour& contour : getContours(result->instance_segmentation_result->segmentedObjects)) {
                             ss << contour << "; ";
                         }
                     } catch (const std::runtime_error&) {
@@ -285,7 +285,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         throw std::runtime_error{"Failed to read the image"};
                     }
                     auto result = model->infer(image);
-                    EXPECT_EQ(std::string{*result}, modelData.testData[i].reference[0]);
+                    EXPECT_EQ(std::string{*result->anomaly_result}, modelData.testData[i].reference[0]);
                 }
             }
         } else if (modelData.type == "KeypointDetectionModel") {
@@ -303,7 +303,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         throw std::runtime_error{"Failed to read the image"};
                     }
                     auto result = model->infer(image);
-                    EXPECT_EQ(std::string{(*result).poses[0]}, modelData.testData[i].reference[0]);
+                    EXPECT_EQ(std::string{(*result->keypoint_detection_result).poses[0]}, modelData.testData[i].reference[0]);
                 }
             }
         }

From 110886bf6fb198e9a23f39bc7091d0066174f436 Mon Sep 17 00:00:00 2001
From: Ronald Hecker <ronald.hecker@intel.com>
Date: Thu, 24 Apr 2025 15:32:15 +0200
Subject: [PATCH 02/16] Rework Detection to new scene system

Refactored the detection tiler substantially, since saliency maps are now stored as cv::Mat instead of ov::Tensor.
---
 .../include/models/detection_model_yolo.h     |   5 +-
 src/cpp/models/include/models/results.h       |  66 ++++++-
 src/cpp/models/src/detection_model_ssd.cpp    |  78 ++++----
 src/cpp/models/src/detection_model_yolo.cpp   |  59 +++---
 .../src/detection_model_yolov3_onnx.cpp       |  10 +-
 src/cpp/models/src/detection_model_yolox.cpp  |  12 +-
 src/cpp/tilers/include/tilers/detection.h     |   2 +-
 src/cpp/tilers/src/detection.cpp              | 173 ++++++------------
 tests/cpp/accuracy/test_YOLOv8.cpp            |   2 +-
 tests/cpp/accuracy/test_accuracy.cpp          |   2 +-
 10 files changed, 199 insertions(+), 210 deletions(-)

diff --git a/src/cpp/models/include/models/detection_model_yolo.h b/src/cpp/models/include/models/detection_model_yolo.h
index 40ffad20..491ee50b 100644
--- a/src/cpp/models/include/models/detection_model_yolo.h
+++ b/src/cpp/models/include/models/detection_model_yolo.h
@@ -16,7 +16,6 @@
 
 #include "models/detection_model_ext.h"
 
-struct DetectedObject;
 struct InferenceResult;
 
 class ModelYolo : public DetectionModelExt {
@@ -56,10 +55,10 @@ class ModelYolo : public DetectionModelExt {
                          const unsigned long resized_im_w,
                          const unsigned long original_im_h,
                          const unsigned long original_im_w,
-                         std::vector<DetectedObject>& objects);
+                         std::vector<Box>& objects);
 
     static int calculateEntryIndex(int entriesNum, int lcoords, size_t lclasses, int location, int entry);
-    static double intersectionOverUnion(const DetectedObject& o1, const DetectedObject& o2);
+    static double intersectionOverUnion(const Box& o1, const Box& o2);
 
     std::map<std::string, Region> regions;
     float iou_threshold;
diff --git a/src/cpp/models/include/models/results.h b/src/cpp/models/include/models/results.h
index d87849b5..6000698d 100644
--- a/src/cpp/models/include/models/results.h
+++ b/src/cpp/models/include/models/results.h
@@ -379,6 +379,42 @@ struct KeypointDetectionResult : public ResultBase {
     std::vector<DetectedKeypoints> poses;
 };
 
+class Label {
+public:
+    Label(std::string id, std::string name, float score):  id(id), name(name), score(score) {}
+
+    std::string id;
+    std::string name;
+    float score;
+
+    friend std::ostream& operator<< (std::ostream& os, const Label& label) {
+        return os << label.id << " (" << label.name << "): " << std::fixed << std::setprecision(3) << label.score << "; ";
+    }
+};
+
+class Box {
+public:
+    Box(cv::Rect shape, std::vector<Label> labels): shape(shape), labels(labels) {}
+    cv::Rect shape;
+    std::vector<Label> labels;
+
+    friend std::ostream& operator<< (std::ostream& os, const Box& box) {
+
+        os << int(box.shape.x) << ", " << int(box.shape.y) << ", " << int(box.shape.x + box.shape.width) << ", "
+                  << int(box.shape.y + box.shape.height) << ", ";
+        for (auto& label: box.labels) {
+            os << label;
+        }
+
+        return os;
+    }
+
+    explicit operator std::string() {
+        std::stringstream ss;
+        ss << *this;
+        return ss.str();
+    }
+};
 
 class Scene {
 public:
@@ -389,10 +425,38 @@ class Scene {
     int64_t frameId;
     std::shared_ptr<MetaData> metaData;
 
-    std::unique_ptr<DetectionResult> detection_result;
     std::unique_ptr<ClassificationResult> classification_result;
     std::unique_ptr<KeypointDetectionResult> keypoint_detection_result;
     std::unique_ptr<AnomalyResult> anomaly_result;
     std::unique_ptr<InstanceSegmentationResult> instance_segmentation_result;
     std::unique_ptr<ImageResult> image_result;
+
+    std::vector<Box> boxes;
+    std::vector<cv::Mat> saliency_maps;
+    std::vector<ov::Tensor> feature_vectors;
+
+    friend std::ostream& operator<<(std::ostream& os, const Scene& scene) {
+        for (auto& box: scene.boxes) {
+            os << box;
+        }
+
+        if (scene.saliency_maps.empty()){
+            os << "[0]; ";
+        } else {
+            os << "[1," << scene.saliency_maps.size() << "," << scene.saliency_maps[0].rows << "," << scene.saliency_maps[0].cols << "]; ";
+        }
+
+        if (scene.feature_vectors.empty()){
+            os << "[0]";
+        } else {
+            os << scene.feature_vectors[0].get_shape();
+        }
+        return os;
+    }
+
+    explicit operator std::string() {
+        std::stringstream ss;
+        ss << *this;
+        return ss.str();
+    }
 };
diff --git a/src/cpp/models/src/detection_model_ssd.cpp b/src/cpp/models/src/detection_model_ssd.cpp
index 0eb968cf..3013869a 100644
--- a/src/cpp/models/src/detection_model_ssd.cpp
+++ b/src/cpp/models/src/detection_model_ssd.cpp
@@ -98,11 +98,15 @@ std::unique_ptr<Scene> ModelSSD::postprocess(InferenceResult& infResult) {
                                                                               : postprocessSingleOutput(infResult);
     auto saliency_map_iter = infResult.outputsData.find(saliency_map_name);
     if (saliency_map_iter != infResult.outputsData.end()) {
-        result->detection_result->saliency_map = std::move(saliency_map_iter->second);
+        size_t shape_shift = (saliency_map_iter->second.get_shape().size() > 3) ? 1 : 0;
+        for (size_t i = 0; i < labels.size(); i++){
+            result->saliency_maps.push_back(wrap_saliency_map_tensor_to_mat(saliency_map_iter->second, shape_shift, i).clone());
+
+        }
     }
     auto feature_vector_iter = infResult.outputsData.find(feature_vector_name);
     if (feature_vector_iter != infResult.outputsData.end()) {
-        result->detection_result->feature_vector = std::move(feature_vector_iter->second);
+        result->feature_vectors.push_back(std::move(feature_vector_iter->second));
     }
     return result;
 }
@@ -115,7 +119,6 @@ std::unique_ptr<Scene> ModelSSD::postprocessSingleOutput(InferenceResult& infRes
     const float* detections = detectionsTensor.data<float>();
 
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
-    auto result = std::make_unique<DetectionResult>(infResult.frameId, infResult.metaData);
 
     const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
     float floatInputImgWidth = float(internalData.inputImgWidth),
@@ -140,34 +143,30 @@ std::unique_ptr<Scene> ModelSSD::postprocessSingleOutput(InferenceResult& infRes
 
         /** Filtering out objects with confidence < confidence_threshold probability **/
         if (confidence > confidence_threshold) {
-            DetectedObject desc;
-
-            desc.confidence = confidence;
-            desc.labelID = static_cast<size_t>(detections[i * numAndStep.objectSize + 1]);
-            desc.label = getLabelName(desc.labelID);
-            desc.x =
-                clamp(round((detections[i * numAndStep.objectSize + 3] * netInputWidth - padLeft) * invertedScaleX),
+            float x = clamp(round((detections[i * numAndStep.objectSize + 3] * netInputWidth - padLeft) * invertedScaleX),
                       0.f,
                       floatInputImgWidth);
-            desc.y =
-                clamp(round((detections[i * numAndStep.objectSize + 4] * netInputHeight - padTop) * invertedScaleY),
+            float y = clamp(round((detections[i * numAndStep.objectSize + 4] * netInputHeight - padTop) * invertedScaleY),
                       0.f,
                       floatInputImgHeight);
-            desc.width =
-                clamp(round((detections[i * numAndStep.objectSize + 5] * netInputWidth - padLeft) * invertedScaleX),
-                      0.f,
-                      floatInputImgWidth) -
-                desc.x;
-            desc.height =
-                clamp(round((detections[i * numAndStep.objectSize + 6] * netInputHeight - padTop) * invertedScaleY),
-                      0.f,
-                      floatInputImgHeight) -
-                desc.y;
-            result->objects.push_back(desc);
+            size_t labelID = static_cast<size_t>(detections[i * numAndStep.objectSize + 1]);
+            Box box(
+                cv::Rect(
+                    x,
+                    y,
+                    clamp(round((detections[i * numAndStep.objectSize + 5] * netInputWidth - padLeft) * invertedScaleX),
+                        0.f,
+                        floatInputImgWidth) - x,
+
+                    clamp(round((detections[i * numAndStep.objectSize + 6] * netInputHeight - padTop) * invertedScaleY),
+                        0.f,
+                        floatInputImgHeight) - y
+                ),
+                {Label(std::to_string(labelID), getLabelName(labelID), confidence)}
+            );
+            scene->boxes.push_back(box);
         }
     }
-
-    scene->detection_result = std::move(result);
     return scene;
 }
 
@@ -205,34 +204,31 @@ std::unique_ptr<Scene> ModelSSD::postprocessMultipleOutputs(InferenceResult& inf
 
         /** Filtering out objects with confidence < confidence_threshold probability **/
         if (confidence > confidence_threshold) {
-            DetectedObject desc;
-
-            desc.confidence = confidence;
-            desc.labelID = labels[i];
-            desc.label = getLabelName(desc.labelID);
-            desc.x = clamp_and_round((boxes[i * numAndStep.objectSize] * widthScale - padLeft) * invertedScaleX,
+            auto x = clamp_and_round((boxes[i * numAndStep.objectSize] * widthScale - padLeft) * invertedScaleX,
                                      0.f,
                                      floatInputImgWidth);
-            desc.y = clamp_and_round((boxes[i * numAndStep.objectSize + 1] * heightScale - padTop) * invertedScaleY,
+            auto y = clamp_and_round((boxes[i * numAndStep.objectSize + 1] * heightScale - padTop) * invertedScaleY,
                                      0.f,
                                      floatInputImgHeight);
-            desc.width = clamp_and_round((boxes[i * numAndStep.objectSize + 2] * widthScale - padLeft) * invertedScaleX,
+            auto width = clamp_and_round((boxes[i * numAndStep.objectSize + 2] * widthScale - padLeft) * invertedScaleX,
                                          0.f,
-                                         floatInputImgWidth) -
-                         desc.x;
-            desc.height =
+                                         floatInputImgWidth) - x;
+            auto height =
                 clamp_and_round((boxes[i * numAndStep.objectSize + 3] * heightScale - padTop) * invertedScaleY,
                                 0.f,
-                                floatInputImgHeight) -
-                desc.y;
+                                floatInputImgHeight) - y;
+
 
-            if (desc.width * desc.height >= box_area_threshold) {
-                result->objects.push_back(desc);
+            if (width * height >= box_area_threshold) {
+                scene->boxes.push_back(Box(
+                  cv::Rect(x, y, width, height),
+                  {Label(std::to_string(labels[i]), getLabelName(labels[i]), confidence)}
+                ));
             }
         }
     }
 
-    scene->detection_result = std::move(result);
+    //scene->detection_result = std::move(result);
     return scene;
 }
 
diff --git a/src/cpp/models/src/detection_model_yolo.cpp b/src/cpp/models/src/detection_model_yolo.cpp
index 49e83f8a..229c033d 100644
--- a/src/cpp/models/src/detection_model_yolo.cpp
+++ b/src/cpp/models/src/detection_model_yolo.cpp
@@ -259,8 +259,7 @@ void ModelYolo::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
 
 std::unique_ptr<Scene> ModelYolo::postprocess(InferenceResult& infResult) {
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
-    auto result = std::make_unique<DetectionResult>(infResult.frameId, infResult.metaData);
-    std::vector<DetectedObject> objects;
+    std::vector<Box> objects;
 
     // Parsing outputs
     const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
@@ -283,7 +282,7 @@ std::unique_ptr<Scene> ModelYolo::postprocess(InferenceResult& infResult) {
         for (const auto& obj1 : objects) {
             bool isGoodResult = true;
             for (const auto& obj2 : objects) {
-                if (obj1.labelID == obj2.labelID && obj1.confidence < obj2.confidence &&
+                if (obj1.labels[0].id == obj2.labels[0].id && obj1.labels[0].score < obj2.labels[0].score &&
                     intersectionOverUnion(obj1, obj2) >= iou_threshold) {  // if obj1 is the same as obj2, condition
                                                                            // expression will evaluate to false anyway
                     isGoodResult = false;
@@ -291,26 +290,24 @@ std::unique_ptr<Scene> ModelYolo::postprocess(InferenceResult& infResult) {
                 }
             }
             if (isGoodResult) {
-                result->objects.push_back(obj1);
+                scene->boxes.push_back(obj1);
             }
         }
     } else {
         // Classic postprocessing
-        std::sort(objects.begin(), objects.end(), [](const DetectedObject& x, const DetectedObject& y) {
-            return x.confidence > y.confidence;
+        std::sort(objects.begin(), objects.end(), [](const Box& x, const Box& y) {
+            return x.labels[0].score > y.labels[0].score;
         });
         for (size_t i = 0; i < objects.size(); ++i) {
-            if (objects[i].confidence == 0)
+            if (objects[i].labels[0].score == 0)
                 continue;
             for (size_t j = i + 1; j < objects.size(); ++j)
                 if (intersectionOverUnion(objects[i], objects[j]) >= iou_threshold)
-                    objects[j].confidence = 0;
-            result->objects.push_back(objects[i]);
+                    objects[j].labels[0].score = 0;
+            scene->boxes.push_back(objects[i]);
         }
     }
 
-    scene->detection_result = std::move(result);
-
     return scene;
 }
 
@@ -320,7 +317,7 @@ void ModelYolo::parseYOLOOutput(const std::string& output_name,
                                 const unsigned long resized_im_w,
                                 const unsigned long original_im_h,
                                 const unsigned long original_im_w,
-                                std::vector<DetectedObject>& objects) {
+                                std::vector<Box>& objects) {
     // --------------------------- Extracting layer parameters -------------------------------------
     auto it = regions.find(output_name);
     if (it == regions.end()) {
@@ -397,7 +394,7 @@ void ModelYolo::parseYOLOOutput(const std::string& output_name,
                 float width = static_cast<float>(std::exp(outData[box_index + 2 * entriesNum]) * region.anchors[2 * n] *
                                                  original_im_w / scaleW);
 
-                DetectedObject obj;
+                cv::Rect obj;
                 obj.x = clamp(x - width / 2, 0.f, static_cast<float>(original_im_w));
                 obj.y = clamp(y - height / 2, 0.f, static_cast<float>(original_im_h));
                 obj.width = clamp(width, 0.f, static_cast<float>(original_im_w - obj.x));
@@ -413,10 +410,7 @@ void ModelYolo::parseYOLOOutput(const std::string& output_name,
 
                     //--- Checking confidence threshold conformance and adding region to the list
                     if (prob >= confidence_threshold) {
-                        obj.confidence = prob;
-                        obj.labelID = j;
-                        obj.label = getLabelName(obj.labelID);
-                        objects.push_back(obj);
+                        objects.push_back(Box(obj, {Label(std::to_string(j), getLabelName(j), prob)}));
                     }
                 }
             }
@@ -430,7 +424,9 @@ int ModelYolo::calculateEntryIndex(int totalCells, int lcoords, size_t lclasses,
     return (n * (lcoords + lclasses) + entry) * totalCells + loc;
 }
 
-double ModelYolo::intersectionOverUnion(const DetectedObject& o1, const DetectedObject& o2) {
+double ModelYolo::intersectionOverUnion(const Box& o1Box, const Box& o2Box) {
+    auto& o1 = o1Box.shape;
+    auto& o2 = o2Box.shape;
     double overlappingWidth = fmin(o1.x + o1.width, o2.x + o2.width) - fmax(o1.x, o2.x);
     double overlappingHeight = fmin(o1.y + o1.height, o2.y + o2.height) - fmax(o1.y, o2.y);
     double intersectionArea =
@@ -613,7 +609,6 @@ std::unique_ptr<Scene> YOLOv5::postprocess(InferenceResult& infResult) {
         keep = multiclass_nms(boxes_with_class, confidences, iou_threshold, includeBoundaries, keep_top_k);
     }
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
-    auto result = std::make_unique<DetectionResult>(infResult.frameId, infResult.metaData);
 
     const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
     float floatInputImgWidth = float(internalData.inputImgWidth),
@@ -628,19 +623,21 @@ std::unique_ptr<Scene> YOLOv5::postprocess(InferenceResult& infResult) {
         }
     }
     for (size_t idx : keep) {
-        DetectedObject desc;
-        desc.x = clamp(round((boxes_with_class[idx].left - padLeft) * invertedScaleX), 0.f, floatInputImgWidth);
-        desc.y = clamp(round((boxes_with_class[idx].top - padTop) * invertedScaleY), 0.f, floatInputImgHeight);
-        desc.width =
-            clamp(round((boxes_with_class[idx].right - padLeft) * invertedScaleX), 0.f, floatInputImgWidth) - desc.x;
-        desc.height =
-            clamp(round((boxes_with_class[idx].bottom - padTop) * invertedScaleY), 0.f, floatInputImgHeight) - desc.y;
-        desc.confidence = confidences[idx];
-        desc.labelID = static_cast<size_t>(boxes_with_class[idx].labelID);
-        desc.label = getLabelName(desc.labelID);
-        result->objects.push_back(desc);
+        auto x = clamp(round((boxes_with_class[idx].left - padLeft) * invertedScaleX), 0.f, floatInputImgWidth);
+        auto y = clamp(round((boxes_with_class[idx].top - padTop) * invertedScaleY), 0.f, floatInputImgHeight);
+        auto width =
+            clamp(round((boxes_with_class[idx].right - padLeft) * invertedScaleX), 0.f, floatInputImgWidth) - x;
+        auto height =
+            clamp(round((boxes_with_class[idx].bottom - padTop) * invertedScaleY), 0.f, floatInputImgHeight) - y;
+        auto confidence = confidences[idx];
+        auto labelID = static_cast<size_t>(boxes_with_class[idx].labelID);
+        auto label = getLabelName(labelID);
+
+        scene->boxes.push_back(Box(
+            cv::Rect(x, y, width, height),
+            {Label(std::to_string(labelID), label, confidence)}
+        ));
     }
-    scene->detection_result = std::move(result);
 
     return scene;
 }
diff --git a/src/cpp/models/src/detection_model_yolov3_onnx.cpp b/src/cpp/models/src/detection_model_yolov3_onnx.cpp
index f9ccf57a..348d3073 100644
--- a/src/cpp/models/src/detection_model_yolov3_onnx.cpp
+++ b/src/cpp/models/src/detection_model_yolov3_onnx.cpp
@@ -150,7 +150,7 @@ std::unique_ptr<Scene> ModelYoloV3ONNX::postprocess(InferenceResult& infResult)
         float score = getScore(scores, classInd, boxInd);
 
         if (score > confidence_threshold) {
-            DetectedObject obj;
+            cv::Rect obj;
             size_t startPos = boxShape[2] * boxInd;
 
             auto x = boxesPtr[startPos + 1];
@@ -163,15 +163,9 @@ std::unique_ptr<Scene> ModelYoloV3ONNX::postprocess(InferenceResult& infResult)
             obj.y = clamp(y, 0.f, static_cast<float>(imgHeight));
             obj.height = clamp(height, 0.f, static_cast<float>(imgHeight));
             obj.width = clamp(width, 0.f, static_cast<float>(imgWidth));
-            obj.confidence = score;
-            obj.labelID = classInd;
-            obj.label = getLabelName(classInd);
 
-            result->objects.push_back(obj);
+            scene->boxes.push_back(Box(obj, {Label(std::to_string(classInd), getLabelName(classInd), score)}));
         }
     }
-
-    scene->detection_result = std::move(result);
-
     return scene;
 }
diff --git a/src/cpp/models/src/detection_model_yolox.cpp b/src/cpp/models/src/detection_model_yolox.cpp
index 421e4e32..0dd333e4 100644
--- a/src/cpp/models/src/detection_model_yolox.cpp
+++ b/src/cpp/models/src/detection_model_yolox.cpp
@@ -137,7 +137,6 @@ std::unique_ptr<Scene> ModelYoloX::postprocess(InferenceResult& infResult) {
 
     // Generate detection results
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
-    auto result = std::make_unique<DetectionResult>(infResult.frameId, infResult.metaData);
 
     // Update coordinates according to strides
     for (size_t box_index = 0; box_index < expandedStrides.size(); ++box_index) {
@@ -188,19 +187,16 @@ std::unique_ptr<Scene> ModelYoloX::postprocess(InferenceResult& infResult) {
     const std::vector<size_t>& keep = nms(validBoxes, scores, iou_threshold, true);
     for (size_t index : keep) {
         // Create new detected box
-        DetectedObject obj;
+        cv::Rect obj;
         obj.x = clamp(validBoxes[index].left, 0.f, static_cast<float>(scale.inputImgWidth));
         obj.y = clamp(validBoxes[index].top, 0.f, static_cast<float>(scale.inputImgHeight));
         obj.height =
             clamp(validBoxes[index].bottom - validBoxes[index].top, 0.f, static_cast<float>(scale.inputImgHeight));
         obj.width =
             clamp(validBoxes[index].right - validBoxes[index].left, 0.f, static_cast<float>(scale.inputImgWidth));
-        obj.confidence = scores[index];
-        obj.labelID = classes[index];
-        obj.label = getLabelName(classes[index]);
-        result->objects.push_back(obj);
+        scene->boxes.push_back(
+            Box(obj, {Label(std::to_string(classes[index]), getLabelName(classes[index]), scores[index])})
+        );
     }
-
-    scene->detection_result = std::move(result);
     return scene;
 }
diff --git a/src/cpp/tilers/include/tilers/detection.h b/src/cpp/tilers/include/tilers/detection.h
index 07525681..943dbcc6 100644
--- a/src/cpp/tilers/include/tilers/detection.h
+++ b/src/cpp/tilers/include/tilers/detection.h
@@ -22,7 +22,7 @@ class DetectionTiler : public TilerBase {
     virtual std::unique_ptr<Scene> merge_results(const std::vector<std::unique_ptr<Scene>>&,
                                                       const cv::Size&,
                                                       const std::vector<cv::Rect>&);
-    ov::Tensor merge_saliency_maps(const std::vector<std::unique_ptr<Scene>>&,
+    std::vector<cv::Mat> merge_saliency_maps(const std::vector<std::unique_ptr<Scene>>&,
                                    const cv::Size&,
                                    const std::vector<cv::Rect>&);
 
diff --git a/src/cpp/tilers/src/detection.cpp b/src/cpp/tilers/src/detection.cpp
index 72052e39..9d9eceaa 100644
--- a/src/cpp/tilers/src/detection.cpp
+++ b/src/cpp/tilers/src/detection.cpp
@@ -16,12 +16,16 @@ namespace {
 
 cv::Mat non_linear_normalization(cv::Mat& class_map) {
     double min_soft_score, max_soft_score;
-    cv::minMaxLoc(class_map, &min_soft_score);
-    cv::pow(class_map - min_soft_score, 1.5, class_map);
+    cv::Mat tmp;
 
-    cv::minMaxLoc(class_map, &min_soft_score, &max_soft_score);
-    class_map = 255.0 / (max_soft_score + 1e-12) * class_map;
+    class_map.convertTo(tmp, CV_32F);
+    cv::minMaxLoc(tmp, &min_soft_score);
+    cv::pow(tmp - min_soft_score, 1.5, tmp);
 
+    cv::minMaxLoc(tmp, &min_soft_score, &max_soft_score);
+    tmp = 255.0 / (max_soft_score + 1e-12) * tmp;
+
+    tmp.convertTo(class_map, class_map.type());
     return class_map;
 }
 
@@ -44,129 +48,99 @@ DetectionTiler::DetectionTiler(const std::shared_ptr<BaseModel>& _model,
 
 std::unique_ptr<Scene> DetectionTiler::postprocess_tile(std::unique_ptr<Scene> tile_result,
                                                              const cv::Rect& coord) {
-    auto& det_res = tile_result->detection_result;
-    for (auto& det : det_res->objects) {
-        det.x += coord.x;
-        det.y += coord.y;
-    }
-
-    if (det_res->feature_vector) {
-        auto tmp_feature_vector =
-            ov::Tensor(det_res->feature_vector.get_element_type(), det_res->feature_vector.get_shape());
-        det_res->feature_vector.copy_to(tmp_feature_vector);
-        det_res->feature_vector = tmp_feature_vector;
-    }
-
-    if (det_res->saliency_map) {
-        auto tmp_saliency_map = ov::Tensor(det_res->saliency_map.get_element_type(), det_res->saliency_map.get_shape());
-        det_res->saliency_map.copy_to(tmp_saliency_map);
-        det_res->saliency_map = tmp_saliency_map;
+    for (auto& det : tile_result->boxes) {
+        det.shape.x += coord.x;
+        det.shape.y += coord.y;
     }
-
     return tile_result;
 }
 
 std::unique_ptr<Scene> DetectionTiler::merge_results(const std::vector<std::unique_ptr<Scene>>& tiles_results,
                                                           const cv::Size& image_size,
                                                           const std::vector<cv::Rect>& tile_coords) {
-    auto result = std::make_unique<DetectionResult>();
     auto scene = std::make_unique<Scene>();
 
     std::vector<AnchorLabeled> all_detections;
-    std::vector<std::reference_wrapper<DetectedObject>> all_detections_refs;
+    std::vector<std::reference_wrapper<Box>> all_detections_refs;
     std::vector<float> all_scores;
 
     for (const auto& result : tiles_results) {
-        for (auto& det : result->detection_result->objects) {
-            all_detections.emplace_back(det.x, det.y, det.x + det.width, det.y + det.height, det.labelID);
-            all_scores.push_back(det.confidence);
+        for (auto& det : result->boxes) {
+            size_t id = 0;  // sscanf leaves id untouched for a non-numeric label id; default it instead of reading garbage
+            sscanf(det.labels[0].id.c_str(), "%zu", &id);
+            all_detections.emplace_back(det.shape.x, det.shape.y, det.shape.x + det.shape.width, det.shape.y + det.shape.height, id);
+            all_scores.push_back(det.labels[0].score);
             all_detections_refs.push_back(det);
         }
     }
 
     auto keep_idx = multiclass_nms(all_detections, all_scores, iou_threshold, false, max_pred_number);
 
-    result->objects.reserve(keep_idx.size());
+    scene->boxes.reserve(keep_idx.size());
     for (auto idx : keep_idx) {
-        result->objects.push_back(all_detections_refs[idx]);
+        scene->boxes.push_back(all_detections_refs[idx]);
     }
 
-    if (tiles_results.size()) {
-        auto& det_res = tiles_results.begin()->get()->detection_result;
-        if (det_res->feature_vector) {
-            result->feature_vector =
-                ov::Tensor(det_res->feature_vector.get_element_type(), det_res->feature_vector.get_shape());
-        }
-        if (det_res->saliency_map) {
-            result->saliency_map = merge_saliency_maps(tiles_results, image_size, tile_coords);
-        }
-    }
+    if (!tiles_results.empty()) {
+        auto& feature_vectors = tiles_results.begin()->get()->feature_vectors;
+        if (!feature_vectors.empty()) {
+            auto tensor = ov::Tensor(feature_vectors[0].get_element_type(), feature_vectors[0].get_shape());
 
-    if (result->feature_vector) {
-        float* feature_ptr = result->feature_vector.data<float>();
-        size_t feature_size = result->feature_vector.get_size();
+            float* feature_ptr = tensor.data<float>();
+            size_t feature_size = tensor.get_size();
 
-        std::fill(feature_ptr, feature_ptr + feature_size, 0.f);
+            std::fill(feature_ptr, feature_ptr + feature_size, 0.f);
 
-        for (const auto& result : tiles_results) {
-            const float* current_feature_ptr = result->detection_result->feature_vector.data<float>();
+            for (const auto& result : tiles_results) {
+                const float* current_feature_ptr = result->feature_vectors[0].data<float>();
+
+                for (size_t i = 0; i < feature_size; ++i) {
+                    feature_ptr[i] += current_feature_ptr[i];
+                }
+            }
 
             for (size_t i = 0; i < feature_size; ++i) {
-                feature_ptr[i] += current_feature_ptr[i];
+                feature_ptr[i] /= tiles_results.size();
             }
-        }
 
-        for (size_t i = 0; i < feature_size; ++i) {
-            feature_ptr[i] /= tiles_results.size();
+            scene->feature_vectors.push_back(tensor);
         }
-    }
 
-    scene->detection_result = std::move(result);
+        scene->saliency_maps = merge_saliency_maps(tiles_results, image_size, tile_coords);
+    }
 
     return scene;
 }
 
-ov::Tensor DetectionTiler::merge_saliency_maps(const std::vector<std::unique_ptr<Scene>>& tiles_results,
+std::vector<cv::Mat> DetectionTiler::merge_saliency_maps(const std::vector<std::unique_ptr<Scene>>& tiles_results,
                                                const cv::Size& image_size,
                                                const std::vector<cv::Rect>& tile_coords) {
-    std::vector<ov::Tensor> all_saliency_maps;
-    all_saliency_maps.reserve(tiles_results.size());
-    for (const auto& result : tiles_results) {
-        all_saliency_maps.push_back(result->detection_result->saliency_map);
-    }
 
-    ov::Tensor image_saliency_map;
-    if (all_saliency_maps.size()) {
-        image_saliency_map = all_saliency_maps[0];
-    }
-
-    if ((image_saliency_map.get_size() == 1) || (all_saliency_maps.size() == 1)) {
-        return image_saliency_map;
-    }
+    // Without saliency maps there is nothing to merge, and saliency_maps[0] below would be out of range.
+    if (tiles_results.empty() || tiles_results[0]->saliency_maps.empty())
+        return {};
+    auto map_size = tiles_results[0]->saliency_maps[0].size();
 
-    size_t shape_shift = (image_saliency_map.get_shape().size() > 3) ? 1 : 0;
-    size_t num_classes = image_saliency_map.get_shape()[shape_shift];
-    size_t map_h = image_saliency_map.get_shape()[shape_shift + 1];
-    size_t map_w = image_saliency_map.get_shape()[shape_shift + 2];
+    auto dtype = tiles_results[0]->saliency_maps[0].type();
+    auto num_classes = tiles_results[0]->saliency_maps.size();
+    size_t map_h = map_size.height;
+    size_t map_w = map_size.width;
 
     float ratio_h = static_cast<float>(map_h) / std::min(tile_size, static_cast<size_t>(image_size.height));
     float ratio_w = static_cast<float>(map_w) / std::min(tile_size, static_cast<size_t>(image_size.width));
 
     size_t image_map_h = static_cast<size_t>(image_size.height * ratio_h);
     size_t image_map_w = static_cast<size_t>(image_size.width * ratio_w);
 
-    std::vector<cv::Mat_<float>> merged_map_mat(num_classes);
-    for (auto& class_map : merged_map_mat) {
-        class_map = cv::Mat_<float>(cv::Size{int(image_map_w), int(image_map_h)}, 0.f);
-    }
+    cv::Size merged_map_size(image_map_w, image_map_h);
 
-    size_t start_idx = tile_with_full_img ? 1 : 0;
-    for (size_t i = start_idx; i < all_saliency_maps.size(); ++i) {
-        for (size_t class_idx = 0; class_idx < num_classes; ++class_idx) {
-            auto current_cls_map_mat = wrap_saliency_map_tensor_to_mat(all_saliency_maps[i], shape_shift, class_idx);
-            cv::Mat current_cls_map_mat_float;
-            current_cls_map_mat.convertTo(current_cls_map_mat_float, CV_32F);
+    std::vector<cv::Mat> saliency_maps(num_classes);
 
+    size_t start_index = (tile_with_full_img ? 1 : 0);
+    for (size_t class_index = 0; class_index < saliency_maps.size(); class_index++) {
+        saliency_maps[class_index] = cv::Mat(merged_map_size, dtype, 0.f);
+
+        for (size_t i = start_index; i < tiles_results.size(); i++) {
             cv::Rect map_location(
                 static_cast<int>(tile_coords[i].x * ratio_w),
                 static_cast<int>(tile_coords[i].y * ratio_h),
@@ -174,48 +148,29 @@ ov::Tensor DetectionTiler::merge_saliency_maps(const std::vector<std::unique_ptr
                                  static_cast<int>(tile_coords[i].x * ratio_w)),
                 static_cast<int>(static_cast<int>(tile_coords[i].height + tile_coords[i].y) * ratio_h -
                                  static_cast<int>(tile_coords[i].y * ratio_h)));
-
-            if (current_cls_map_mat.rows > map_location.height && map_location.height > 0 &&
-                current_cls_map_mat.cols > map_location.width && map_location.width > 0) {
-                cv::resize(current_cls_map_mat_float,
-                           current_cls_map_mat_float,
-                           cv::Size(map_location.width, map_location.height));
-            }
-
-            auto class_map_roi = cv::Mat(merged_map_mat[class_idx], map_location);
-            for (int row_i = 0; row_i < map_location.height; ++row_i) {
-                for (int col_i = 0; col_i < map_location.width; ++col_i) {
-                    float merged_mixel = class_map_roi.at<float>(row_i, col_i);
-                    if (merged_mixel > 0) {
-                        class_map_roi.at<float>(row_i, col_i) =
-                            0.5f * (merged_mixel + current_cls_map_mat_float.at<float>(row_i, col_i));
-                    } else {
-                        class_map_roi.at<float>(row_i, col_i) = current_cls_map_mat_float.at<float>(row_i, col_i);
-                    }
-                }
-            }
+            // Mat::operator()(Rect) returns a temporary header, so assigning another Mat to it only
+            // rebinds that temporary and never writes into the merged map; copy the pixels instead.
+            const cv::Mat& tile_map = tiles_results[i]->saliency_maps[class_index];
+            cv::Mat roi = saliency_maps[class_index](map_location);
+            if (roi.empty()) {
+                continue;
+            }
+            if (tile_map.size() == roi.size()) {
+                tile_map.copyTo(roi);
+            } else {
+                // The tile map does not match its target region; scale it to fit the ROI.
+                cv::resize(tile_map, roi, roi.size());
+            }
         }
-    }
-
-    ov::Tensor merged_map;
-    if (shape_shift) {
-        merged_map = ov::Tensor(ov::element::Type("u8"), {1, num_classes, image_map_h, image_map_w});
-    } else {
-        merged_map = ov::Tensor(ov::element::Type("u8"), {num_classes, image_map_h, image_map_w});
-    }
 
-    for (size_t class_idx = 0; class_idx < num_classes; ++class_idx) {
         if (tile_with_full_img) {
-            auto image_map_cls = wrap_saliency_map_tensor_to_mat(image_saliency_map, shape_shift, class_idx);
+            auto image_map_cls = tiles_results[0]->saliency_maps[class_index];
             cv::resize(image_map_cls, image_map_cls, cv::Size(image_map_w, image_map_h));
-            cv::addWeighted(merged_map_mat[class_idx], 1.0, image_map_cls, 0.5, 0., merged_map_mat[class_idx]);
+            cv::addWeighted(saliency_maps[class_index], 1.0, image_map_cls, 0.5, 0., saliency_maps[class_index]);
+            non_linear_normalization(saliency_maps[class_index]);
         }
-        merged_map_mat[class_idx] = non_linear_normalization(merged_map_mat[class_idx]);
-        auto merged_cls_map_mat = wrap_saliency_map_tensor_to_mat(merged_map, shape_shift, class_idx);
-        merged_map_mat[class_idx].convertTo(merged_cls_map_mat, merged_cls_map_mat.type());
     }
-
-    return merged_map;
+    return saliency_maps;
 }
 
 std::unique_ptr<Scene> DetectionTiler::run(const ImageInputData& inputData) {
diff --git a/tests/cpp/accuracy/test_YOLOv8.cpp b/tests/cpp/accuracy/test_YOLOv8.cpp
index 881812f9..c4ee90bd 100644
--- a/tests/cpp/accuracy/test_YOLOv8.cpp
+++ b/tests/cpp/accuracy/test_YOLOv8.cpp
@@ -60,7 +60,7 @@ TEST_P(AccuracySuit, TestDetector) {
     EXPECT_EQ(ss.str(),
               string{*cached_model(param.model_name)
                           ->infer(cv::imread(data() + "/coco128/images/train2017/" + param.refpath.stem().string() +
-                                             ".jpg"))->detection_result});
+                                             ".jpg"))});
 }
 
 INSTANTIATE_TEST_SUITE_P(YOLOv8, AccuracySuit, testing::ValuesIn([] {
diff --git a/tests/cpp/accuracy/test_accuracy.cpp b/tests/cpp/accuracy/test_accuracy.cpp
index ed5a3a17..4d1d1ae3 100644
--- a/tests/cpp/accuracy/test_accuracy.cpp
+++ b/tests/cpp/accuracy/test_accuracy.cpp
@@ -169,7 +169,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                     } else {
                         result = model->infer(image);
                     }
-                    EXPECT_EQ(std::string{*result->detection_result}, modelData.testData[i].reference[0]);
+                    EXPECT_EQ(std::string{*result}, modelData.testData[i].reference[0]);
                 }
             }
         } else if (modelData.type == "ClassificationModel") {

From e14275831fbc18f47c148230a025c34d1c0547d1 Mon Sep 17 00:00:00 2001
From: Ronald Hecker <ronald.hecker@intel.com>
Date: Fri, 25 Apr 2025 10:50:36 +0200
Subject: [PATCH 03/16] Implement Classification part: reference adjusted

Due to changes in the structure it was not possible to keep the same
reference. This means that the Python string output also needs to be changed
slightly. This can be done without changing the Python structure.
---
 src/cpp/models/include/models/results.h     | 26 ++++++++--
 src/cpp/models/src/classification_model.cpp | 56 ++++++++++++---------
 tests/cpp/accuracy/test_accuracy.cpp        |  2 +-
 tests/python/accuracy/public_scope.json     | 20 ++++----
 4 files changed, 63 insertions(+), 41 deletions(-)

diff --git a/src/cpp/models/include/models/results.h b/src/cpp/models/include/models/results.h
index 6000698d..bfa4da4f 100644
--- a/src/cpp/models/include/models/results.h
+++ b/src/cpp/models/include/models/results.h
@@ -388,7 +388,7 @@ class Label {
     float score;
 
     friend std::ostream& operator<< (std::ostream& os, const Label& label) {
-        return os << label.id << " (" << label.name << "): " << std::fixed << std::setprecision(3) << label.score << "; ";
+        return os << label.id << " (" << label.name << "): " << std::fixed << std::setprecision(3) << label.score;
     }
 };
 
@@ -402,10 +402,16 @@ class Box {
 
         os << int(box.shape.x) << ", " << int(box.shape.y) << ", " << int(box.shape.x + box.shape.width) << ", "
                   << int(box.shape.y + box.shape.height) << ", ";
-        for (auto& label: box.labels) {
-            os << label;
+        for (size_t i = 0; i < box.labels.size(); i++) {
+            os << box.labels[i];
+            if (i == box.labels.size() - 1)  {
+                os << "; ";
+            } else {
+                os << ", ";
+            }
         }
 
+
         return os;
     }
 
@@ -425,7 +431,7 @@ class Scene {
     int64_t frameId;
     std::shared_ptr<MetaData> metaData;
 
-    std::unique_ptr<ClassificationResult> classification_result;
+    //std::unique_ptr<ClassificationResult> classification_result;
     std::unique_ptr<KeypointDetectionResult> keypoint_detection_result;
     std::unique_ptr<AnomalyResult> anomaly_result;
     std::unique_ptr<InstanceSegmentationResult> instance_segmentation_result;
@@ -435,6 +441,8 @@ class Scene {
     std::vector<cv::Mat> saliency_maps;
     std::vector<ov::Tensor> feature_vectors;
 
+    std::map<std::string, ov::Tensor> additional_tensors;
+
     friend std::ostream& operator<<(std::ostream& os, const Scene& scene) {
         for (auto& box: scene.boxes) {
             os << box;
@@ -449,8 +457,16 @@ class Scene {
         if (scene.feature_vectors.empty()){
             os << "[0]";
         } else {
-            os << scene.feature_vectors[0].get_shape();
+            for (auto& feature_vector: scene.feature_vectors){
+                os << feature_vector.get_shape();
+            }
         }
+
+        for (auto& v: scene.additional_tensors) {
+            os << ", " << v.second.get_shape();
+        }
+
+
         return os;
     }
 
diff --git a/src/cpp/models/src/classification_model.cpp b/src/cpp/models/src/classification_model.cpp
index 5ca38cfa..60c2c2ea 100644
--- a/src/cpp/models/src/classification_model.cpp
+++ b/src/cpp/models/src/classification_model.cpp
@@ -295,6 +295,7 @@ std::unique_ptr<ClassificationModel> ClassificationModel::create_model(std::shar
 
 std::unique_ptr<Scene> ClassificationModel::postprocess(InferenceResult& infResult) {
     std::unique_ptr<Scene> result;
+
     if (multilabel) {
         result = get_multilabel_predictions(infResult, output_raw_scores);
     } else if (hierarchical) {
@@ -303,15 +304,18 @@ std::unique_ptr<Scene> ClassificationModel::postprocess(InferenceResult& infResu
         result = get_multiclass_predictions(infResult, output_raw_scores);
     }
 
-    auto& cls_res = result->classification_result;
     auto saliency_map_iter = infResult.outputsData.find(saliency_map_name);
     if (saliency_map_iter != infResult.outputsData.end()) {
-        cls_res->saliency_map = std::move(saliency_map_iter->second);
-        cls_res->saliency_map = reorder_saliency_maps(cls_res->saliency_map);
+        size_t shape_shift = (saliency_map_iter->second.get_shape().size() > 3) ? 1 : 0;
+        auto tensor = reorder_saliency_maps(saliency_map_iter->second);
+        for (size_t i = 0; i < labels.size(); i++){
+            result->saliency_maps.push_back(wrap_saliency_map_tensor_to_mat(tensor, shape_shift, i).clone());
+
+        }
     }
     auto feature_vector_iter = infResult.outputsData.find(feature_vector_name);
     if (feature_vector_iter != infResult.outputsData.end()) {
-        cls_res->feature_vector = std::move(feature_vector_iter->second);
+        result->feature_vectors.push_back(std::move(feature_vector_iter->second));
     }
 
     return result;
@@ -323,38 +327,37 @@ std::unique_ptr<Scene> ClassificationModel::get_multilabel_predictions(Inference
     const float* logitsPtr = logitsTensor.data<float>();
 
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
-    auto result = std::make_unique<ClassificationResult>(infResult.frameId, infResult.metaData);
-
     auto raw_scores = ov::Tensor();
+    std::vector<Label> result;
     float* raw_scoresPtr = nullptr;
     if (add_raw_scores) {
         raw_scores = ov::Tensor(logitsTensor.get_element_type(), logitsTensor.get_shape());
         raw_scoresPtr = raw_scores.data<float>();
-        result->raw_scores = raw_scores;
+        scene->additional_tensors["raw_scores"] = raw_scores;
     }
 
-    result->topLabels.reserve(labels.size());
     for (size_t i = 0; i < labels.size(); ++i) {
         float score = sigmoid(logitsPtr[i]);
         if (score > confidence_threshold) {
-            result->topLabels.emplace_back(i, labels[i], score);
+            result.emplace_back(std::to_string(i), labels[i], score);
         }
         if (add_raw_scores) {
             raw_scoresPtr[i] = score;
         }
     }
 
-    scene->classification_result = std::move(result);
+    const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
+    cv::Rect shape(0, 0, internalData.inputImgWidth, internalData.inputImgHeight);
+    scene->boxes.push_back(Box(shape, result));
     return scene;
 }
 
 std::unique_ptr<Scene> ClassificationModel::get_hierarchical_predictions(InferenceResult& infResult,
                                                                               bool add_raw_scores) {
-    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
-    auto result = std::make_unique<ClassificationResult>(infResult.frameId, infResult.metaData);
 
     const ov::Tensor& logitsTensor = infResult.outputsData.find(outputNames[0])->second;
     float* logitsPtr = logitsTensor.data<float>();
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
 
     auto raw_scores = ov::Tensor();
     float* raw_scoresPtr = nullptr;
@@ -362,7 +365,7 @@ std::unique_ptr<Scene> ClassificationModel::get_hierarchical_predictions(Inferen
         raw_scores = ov::Tensor(logitsTensor.get_element_type(), logitsTensor.get_shape());
         logitsTensor.copy_to(raw_scores);
         raw_scoresPtr = raw_scores.data<float>();
-        result->raw_scores = raw_scores;
+        scene->additional_tensors["raw_scores"] = raw_scores;
     }
 
     std::vector<std::reference_wrapper<std::string>> predicted_labels;
@@ -398,13 +401,13 @@ std::unique_ptr<Scene> ClassificationModel::get_hierarchical_predictions(Inferen
     }
 
     auto resolved_labels = resolver->resolve_labels(predicted_labels, predicted_scores);
-
-    result->topLabels.reserve(resolved_labels.size());
+    std::vector<Label> result;
     for (const auto& label : resolved_labels) {
-        result->topLabels.emplace_back(hierarchical_info.label_to_idx[label.first], label.first, label.second);
+        result.push_back(Label(std::to_string(hierarchical_info.label_to_idx[label.first]), label.first, label.second));
     }
-
-    scene->classification_result = std::move(result);
+    const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
+    cv::Rect shape(0, 0, internalData.inputImgWidth, internalData.inputImgHeight);
+    scene->boxes.push_back(Box(shape, result));
     return scene;
 }
 
@@ -431,30 +434,33 @@ ov::Tensor ClassificationModel::reorder_saliency_maps(const ov::Tensor& source_m
 
 std::unique_ptr<Scene> ClassificationModel::get_multiclass_predictions(InferenceResult& infResult,
                                                                             bool add_raw_scores) {
+
     const ov::Tensor& indicesTensor = infResult.outputsData.find(indices_name)->second;
     const int* indicesPtr = indicesTensor.data<int>();
     const ov::Tensor& scoresTensor = infResult.outputsData.find(scores_name)->second;
     const float* scoresPtr = scoresTensor.data<float>();
 
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
-    auto result = std::make_unique<ClassificationResult>(infResult.frameId, infResult.metaData);
     if (add_raw_scores) {
         const ov::Tensor& logitsTensor = infResult.outputsData.find(raw_scores_name)->second;
-        result->raw_scores = ov::Tensor(logitsTensor.get_element_type(), logitsTensor.get_shape());
-        logitsTensor.copy_to(result->raw_scores);
-        result->raw_scores.set_shape(ov::Shape({result->raw_scores.get_size()}));
+        auto raw_scores = ov::Tensor(logitsTensor.get_element_type(), logitsTensor.get_shape());
+        logitsTensor.copy_to(raw_scores);
+        raw_scores.set_shape(ov::Shape({raw_scores.get_size()}));
+        scene->additional_tensors["raw_scores"] = raw_scores;
     }
 
-    result->topLabels.reserve(scoresTensor.get_size());
+    std::vector<Label> result;
     for (size_t i = 0; i < scoresTensor.get_size(); ++i) {
         int ind = indicesPtr[i];
         if (ind < 0 || ind >= static_cast<int>(labels.size())) {
             throw std::runtime_error("Invalid index for the class label is found during postprocessing");
         }
-        result->topLabels.emplace_back(ind, labels[ind], scoresPtr[i]);
+        result.emplace_back(std::to_string(ind), labels[ind], scoresPtr[i]);
     }
 
-    scene->classification_result = std::move(result);
+    const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
+    cv::Rect shape(0, 0, internalData.inputImgWidth, internalData.inputImgHeight);
+    scene->boxes.push_back(Box(shape, result));
     return scene;
 }
 
diff --git a/tests/cpp/accuracy/test_accuracy.cpp b/tests/cpp/accuracy/test_accuracy.cpp
index 4d1d1ae3..72bd376b 100644
--- a/tests/cpp/accuracy/test_accuracy.cpp
+++ b/tests/cpp/accuracy/test_accuracy.cpp
@@ -183,7 +183,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         throw std::runtime_error{"Failed to read the image"};
                     }
                     auto result = model->infer(image);
-                    EXPECT_EQ(std::string{*result->classification_result}, modelData.testData[i].reference[0]);
+                    EXPECT_EQ(std::string{*result}, modelData.testData[i].reference[0]);
                 }
             }
         } else if (modelData.type == "SegmentationModel") {
diff --git a/tests/python/accuracy/public_scope.json b/tests/python/accuracy/public_scope.json
index e244ece1..671d6bc1 100644
--- a/tests/python/accuracy/public_scope.json
+++ b/tests/python/accuracy/public_scope.json
@@ -115,7 +115,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000081.jpg",
-        "reference": ["0 (aeroplane): 0.943, [0], [0], [0]"]
+        "reference": ["0, 0, 640, 425, 0 (aeroplane): 0.943; [0]; [0]"]
       }
     ]
   },
@@ -126,7 +126,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "1 (bicycle): 0.768, 11 (dog): 0.876, 14 (person): 0.922, [0], [0], [0]"
+          "0, 0, 640, 426, 1 (bicycle): 0.768, 11 (dog): 0.876, 14 (person): 0.922; [0]; [0]"
         ]
       }
     ]
@@ -138,7 +138,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "1 (bicycle): 0.825, 11 (dog): 0.873, 14 (person): 0.824, [0], [0], [0]"
+          "0, 0, 640, 426, 1 (bicycle): 0.825, 11 (dog): 0.873, 14 (person): 0.824; [0]; [0]"
         ]
       }
     ]
@@ -150,7 +150,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000471.jpg",
-        "reference": ["105 (194): 0.456, [0], [0], [0]"]
+        "reference": ["0, 0, 640, 427, 105 (194): 0.456; [0]; [0]"]
       }
     ]
   },
@@ -160,7 +160,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000471.jpg",
-        "reference": ["105 (194): 0.456, [0], [0], [196]"]
+        "reference": ["0, 0, 640, 427, 105 (194): 0.456; [0]; [0]"]
       }
     ]
   },
@@ -181,7 +181,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000471.jpg",
-        "reference": ["0 (1): 0.838, [0], [0], [0]"]
+        "reference": ["0, 0, 640, 427, 0 (1): 0.838; [0]; [0]"]
       }
     ]
   },
@@ -192,7 +192,7 @@
       {
         "image": "coco128/images/train2017/000000000471.jpg",
         "reference": [
-          "4 (Circle): 0.943, 5 (Lion): 0.969, 3 (Non-Rigid): 0.503, 6 (Panda): 0.988, [1,7,7,7], [1,7], [0]"
+          "0, 0, 640, 427, 4 (Circle): 0.943, 5 (Lion): 0.969, 3 (Non-Rigid): 0.503, 6 (Panda): 0.988; [1,7,7,7]; [1,7]"
         ]
       }
     ]
@@ -203,7 +203,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000471.jpg",
-        "reference": ["0 (1): 0.849, [0], [0], [0]"]
+        "reference": ["0, 0, 640, 427, 0 (1): 0.849; [0]; [0]"]
       }
     ]
   },
@@ -274,7 +274,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000081.jpg",
-        "reference": ["0 (horse): 0.543, [1,4,7,7], [1,1280,1,1], [0]"]
+        "reference": ["0, 0, 640, 425, 0 (horse): 0.543; [1,4,7,7]; [1,1280,1,1]"]
       }
     ]
   },
@@ -374,7 +374,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000074.jpg",
-        "reference": ["3 (cat): 0.648, [0], [0], [10]"]
+        "reference": ["0, 0, 640, 426, 3 (cat): 0.648; [0]; [0], [10]"]
       }
     ]
   },

From 483f524fc4f5ab37225e5281b89a6a9aa5c45087 Mon Sep 17 00:00:00 2001
From: "Hecker, Ronald" <ronald.hecker@intel.com>
Date: Thu, 1 May 2025 09:29:23 +0200
Subject: [PATCH 04/16] Transfer anomaly model to new data structure

The test references have not been regenerated yet.
---
 src/cpp/models/include/models/results.h |  1 +
 src/cpp/models/src/anomaly_model.cpp    | 36 +++++++++++++++++--------
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/src/cpp/models/include/models/results.h b/src/cpp/models/include/models/results.h
index bfa4da4f..8963e112 100644
--- a/src/cpp/models/include/models/results.h
+++ b/src/cpp/models/include/models/results.h
@@ -442,6 +442,7 @@ class Scene {
     std::vector<ov::Tensor> feature_vectors;
 
     std::map<std::string, ov::Tensor> additional_tensors;
+    std::map<std::string, cv::Mat> masks;
 
     friend std::ostream& operator<<(std::ostream& os, const Scene& scene) {
         for (auto& box: scene.boxes) {
diff --git a/src/cpp/models/src/anomaly_model.cpp b/src/cpp/models/src/anomaly_model.cpp
index 1cd1a9e8..ec826cae 100644
--- a/src/cpp/models/src/anomaly_model.cpp
+++ b/src/cpp/models/src/anomaly_model.cpp
@@ -14,7 +14,6 @@
 #include "models/input_data.h"
 #include "models/internal_model_data.h"
 #include "models/results.h"
-#include "utils/slog.hpp"
 
 std::string AnomalyModel::ModelType = "AnomalyDetection";
 
@@ -67,7 +66,8 @@ std::unique_ptr<Scene> AnomalyModel::postprocess(InferenceResult& infResult) {
         // find the max predicted score
         cv::minMaxLoc(anomaly_map, NULL, &pred_score);
     }
-    pred_label = labels[pred_score > imageThreshold ? 1 : 0];
+    auto label_id = pred_score > imageThreshold ? 1 : 0;
+    pred_label = labels[label_id];
 
     pred_mask = anomaly_map >= pixelThreshold;
     pred_mask.convertTo(pred_mask, CV_8UC1, 1 / 255.);
@@ -83,19 +83,33 @@ std::unique_ptr<Scene> AnomalyModel::postprocess(InferenceResult& infResult) {
     if (!anomaly_map.empty()) {
         cv::resize(anomaly_map, anomaly_map, cv::Size{inputImgSize.inputImgWidth, inputImgSize.inputImgHeight});
     }
+    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+
+    scene->saliency_maps.push_back(anomaly_map);
+    scene->masks["pred_mask"] = pred_mask;  // shallow copy (not a move): pred_mask is still read by getBoxes() below
+    scene->boxes.push_back(
+        Box(
+            cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight),
+            {Label(std::to_string(label_id), pred_label, pred_score)}
+        )
+    );
+
     if (task == "detection") {
         pred_boxes = getBoxes(pred_mask);
+
+        for (auto& rect: pred_boxes) {
+            double box_score;
+            cv::minMaxLoc(anomaly_map(rect), NULL, &box_score);
+            scene->boxes.push_back(
+                Box(
+                    rect,
+                    {Label(std::to_string(label_id), pred_label, box_score)}
+                )
+            );
+
+        }
     }
 
-    auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
-    auto result = std::make_unique<AnomalyResult>(infResult.frameId, infResult.metaData);
-    result->anomaly_map = std::move(anomaly_map);
-    result->pred_score = pred_score;
-    result->pred_label = std::move(pred_label);
-    result->pred_mask = std::move(pred_mask);
-    result->pred_boxes = std::move(pred_boxes);
-
-    scene->anomaly_result = std::move(result);
     return scene;
 }
 

From f6c14bcd461c54915b822b7a5a6a93cd8a4684f3 Mon Sep 17 00:00:00 2001
From: "Hecker, Ronald" <ronald.hecker@intel.com>
Date: Thu, 1 May 2025 09:30:39 +0200
Subject: [PATCH 05/16] Update examples to work again

---
 examples/cpp/asynchronous_api/main.cpp | 11 +++++------
 examples/cpp/synchronous_api/main.cpp  | 10 ++--------
 2 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/examples/cpp/asynchronous_api/main.cpp b/examples/cpp/asynchronous_api/main.cpp
index f8edc2cc..0ed44d53 100644
--- a/examples/cpp/asynchronous_api/main.cpp
+++ b/examples/cpp/asynchronous_api/main.cpp
@@ -53,8 +53,8 @@ int main(int argc, char* argv[]) try {
 
     std::cout << "Batch mode inference results:\n";
     for (const auto& result : results) {
-        for (auto& obj : result->objects) {
-            std::cout << " " << std::left << std::setw(9) << obj.confidence << " " << obj.label << "\n";
+        for (auto& obj : result->boxes) {
+            std::cout << obj << "\n";
         }
         std::cout << std::string(10, '-') << "\n";
     }
@@ -62,15 +62,14 @@ int main(int argc, char* argv[]) try {
     std::cout << "Async mode inference results:\n";
 
     // Set callback to grab results once the inference is done
-    model->setCallback([](std::unique_ptr<ResultBase> result, const ov::AnyMap& callback_args) {
-        auto det_result = std::unique_ptr<DetectionResult>(static_cast<DetectionResult*>(result.release()));
+    model->setCallback([](std::unique_ptr<Scene> result, const ov::AnyMap& callback_args) {
 
         // callback_args can contain arbitrary data
         size_t id = callback_args.find("id")->second.as<size_t>();
 
         std::cout << "Request with id " << id << " is finished\n";
-        for (auto& obj : det_result->objects) {
-            std::cout << " " << std::left << std::setw(9) << obj.confidence << " " << obj.label << "\n";
+        for (auto& obj : result->boxes) {
+            std::cout << " " << obj << "\n";
         }
         std::cout << std::string(10, '-') << "\n";
     });
diff --git a/examples/cpp/synchronous_api/main.cpp b/examples/cpp/synchronous_api/main.cpp
index 1f79a035..fc7d2122 100644
--- a/examples/cpp/synchronous_api/main.cpp
+++ b/examples/cpp/synchronous_api/main.cpp
@@ -6,11 +6,7 @@
 #include <models/detection_model.h>
 #include <models/input_data.h>
 #include <models/results.h>
-#include <stddef.h>
-
-#include <cstdint>
 #include <exception>
-#include <iomanip>
 #include <iostream>
 #include <opencv2/core.hpp>
 #include <opencv2/imgcodecs.hpp>
@@ -37,10 +33,8 @@ int main(int argc, char* argv[]) try {
     auto result = model->infer(image);
 
     // Process detections
-    for (auto& obj : result->objects) {
-        std::cout << " " << std::left << std::setw(9) << obj.label << " | " << std::setw(10) << obj.confidence << " | "
-                  << std::setw(4) << int(obj.x) << " | " << std::setw(4) << int(obj.y) << " | " << std::setw(4)
-                  << int(obj.x + obj.width) << " | " << std::setw(4) << int(obj.y + obj.height) << "\n";
+    for (auto& obj : result->boxes) {
+        std::cout << obj << "\n";
     }
 } catch (const std::exception& error) {
     std::cerr << error.what() << '\n';

From 67d8b9bd8ffee8b45e6b8190d4edcfa7ad0db79e Mon Sep 17 00:00:00 2001
From: Ronald Hecker <ronald.hecker@intel.com>
Date: Tue, 6 May 2025 06:46:04 +0200
Subject: [PATCH 06/16] Update anomaly public scope

---
 src/cpp/models/include/models/results.h | 9 ++++++++-
 tests/cpp/accuracy/test_accuracy.cpp    | 2 +-
 tests/python/accuracy/public_scope.json | 4 ++--
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/cpp/models/include/models/results.h b/src/cpp/models/include/models/results.h
index 8963e112..0fbe5b6d 100644
--- a/src/cpp/models/include/models/results.h
+++ b/src/cpp/models/include/models/results.h
@@ -433,7 +433,6 @@ class Scene {
 
     //std::unique_ptr<ClassificationResult> classification_result;
     std::unique_ptr<KeypointDetectionResult> keypoint_detection_result;
-    std::unique_ptr<AnomalyResult> anomaly_result;
     std::unique_ptr<InstanceSegmentationResult> instance_segmentation_result;
     std::unique_ptr<ImageResult> image_result;
 
@@ -455,6 +454,12 @@ class Scene {
             os << "[1," << scene.saliency_maps.size() << "," << scene.saliency_maps[0].rows << "," << scene.saliency_maps[0].cols << "]; ";
         }
 
+        for (auto& m: scene.masks) {
+            double min_mask, max_mask;
+            cv::minMaxLoc(m.second, &min_mask, &max_mask);
+            os << m.first << " min:" << min_mask << " max:" << max_mask << ";";
+        }
+
         if (scene.feature_vectors.empty()){
             os << "[0]";
         } else {
@@ -468,6 +473,8 @@ class Scene {
         }
 
 
+
+
         return os;
     }
 
diff --git a/tests/cpp/accuracy/test_accuracy.cpp b/tests/cpp/accuracy/test_accuracy.cpp
index 72bd376b..857cf507 100644
--- a/tests/cpp/accuracy/test_accuracy.cpp
+++ b/tests/cpp/accuracy/test_accuracy.cpp
@@ -285,7 +285,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         throw std::runtime_error{"Failed to read the image"};
                     }
                     auto result = model->infer(image);
-                    EXPECT_EQ(std::string{*result->anomaly_result}, modelData.testData[i].reference[0]);
+                    EXPECT_EQ(std::string{*result}, modelData.testData[i].reference[0]);
                 }
             }
         } else if (modelData.type == "KeypointDetectionModel") {
diff --git a/tests/python/accuracy/public_scope.json b/tests/python/accuracy/public_scope.json
index 671d6bc1..537d4ac6 100644
--- a/tests/python/accuracy/public_scope.json
+++ b/tests/python/accuracy/public_scope.json
@@ -351,7 +351,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "anomaly_map min:151 max:255;pred_score:1.0;pred_label:Anomaly;pred_mask min:1 max:1;"
+          "0, 0, 640, 426, 1 (Anomaly): 0.854; [1,1,426,640]; pred_mask min:0.000 max:1.000;[0]"
         ]
       }
     ]
@@ -363,7 +363,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "anomaly_map min:124 max:225;pred_score:0.9;pred_label:Anomaly;pred_mask min:0 max:1;"
+          "0, 0, 640, 426, 1 (Anomaly): 0.854; [1,1,426,640]; pred_mask min:0.000 max:1.000;[0]"
         ]
       }
     ]

From 41051e387d51dd446f9bc95655204084a2f0ed21 Mon Sep 17 00:00:00 2001
From: Ronald Hecker <ronald.hecker@intel.com>
Date: Tue, 6 May 2025 06:51:52 +0200
Subject: [PATCH 07/16] Include keypoint poses into scene

---
 src/cpp/models/include/models/results.h   | 7 ++++++-
 src/cpp/models/src/keypoint_detection.cpp | 4 +---
 tests/cpp/accuracy/test_accuracy.cpp      | 2 +-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/cpp/models/include/models/results.h b/src/cpp/models/include/models/results.h
index 0fbe5b6d..4711627f 100644
--- a/src/cpp/models/include/models/results.h
+++ b/src/cpp/models/include/models/results.h
@@ -432,11 +432,12 @@ class Scene {
     std::shared_ptr<MetaData> metaData;
 
     //std::unique_ptr<ClassificationResult> classification_result;
-    std::unique_ptr<KeypointDetectionResult> keypoint_detection_result;
     std::unique_ptr<InstanceSegmentationResult> instance_segmentation_result;
     std::unique_ptr<ImageResult> image_result;
 
     std::vector<Box> boxes;
+    std::vector<DetectedKeypoints> poses;
+
     std::vector<cv::Mat> saliency_maps;
     std::vector<ov::Tensor> feature_vectors;
 
@@ -448,6 +449,10 @@ class Scene {
             os << box;
         }
 
+        for (auto& pose: scene.poses) {
+            os << pose;
+        }
+
         if (scene.saliency_maps.empty()){
             os << "[0]; ";
         } else {
diff --git a/src/cpp/models/src/keypoint_detection.cpp b/src/cpp/models/src/keypoint_detection.cpp
index 9554cbb4..2fb481bb 100644
--- a/src/cpp/models/src/keypoint_detection.cpp
+++ b/src/cpp/models/src/keypoint_detection.cpp
@@ -209,7 +209,6 @@ void KeypointDetectionModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& mo
 
 std::unique_ptr<Scene> KeypointDetectionModel::postprocess(InferenceResult& infResult) {
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
-    auto result = std::make_unique<KeypointDetectionResult>(infResult.frameId, infResult.metaData);
 
     const ov::Tensor& pred_x_tensor = infResult.outputsData.find(outputNames[0])->second;
     size_t shape_offset = pred_x_tensor.get_shape().size() == 3 ? 1 : 0;
@@ -244,10 +243,9 @@ std::unique_ptr<Scene> KeypointDetectionModel::postprocess(InferenceResult& infR
         }
     }
 
-    result->poses.emplace_back(
+    scene->poses.emplace_back(
         decode_simcc(pred_x_mat, pred_y_mat, {inverted_scale_x, inverted_scale_y}, {pad_left, pad_top}, apply_softmax));
 
-    scene->keypoint_detection_result = std::move(result);
     return scene;
 }
 
diff --git a/tests/cpp/accuracy/test_accuracy.cpp b/tests/cpp/accuracy/test_accuracy.cpp
index 857cf507..eee95e17 100644
--- a/tests/cpp/accuracy/test_accuracy.cpp
+++ b/tests/cpp/accuracy/test_accuracy.cpp
@@ -303,7 +303,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         throw std::runtime_error{"Failed to read the image"};
                     }
                     auto result = model->infer(image);
-                    EXPECT_EQ(std::string{(*result->keypoint_detection_result).poses[0]}, modelData.testData[i].reference[0]);
+                    EXPECT_EQ(std::string{(*result).poses[0]}, modelData.testData[i].reference[0]);
                 }
             }
         }

From 413b865d167cd199a2be44bc678dc2bb27c5a2ea Mon Sep 17 00:00:00 2001
From: Ronald Hecker <ronald.hecker@intel.com>
Date: Wed, 7 May 2025 11:11:54 +0200
Subject: [PATCH 08/16] Rework semantic segmentation to new scene result

---
 src/cpp/models/include/models/results.h       |  1 -
 .../include/models/segmentation_model.h       |  2 +-
 src/cpp/models/src/segmentation_model.cpp     | 33 ++++++++-----------
 src/cpp/tilers/src/semantic_segmentation.cpp  | 19 ++++-------
 tests/cpp/accuracy/test_accuracy.cpp          |  9 +++--
 tests/python/accuracy/public_scope.json       | 16 ++++-----
 6 files changed, 34 insertions(+), 46 deletions(-)

diff --git a/src/cpp/models/include/models/results.h b/src/cpp/models/include/models/results.h
index 4711627f..c1d4cf19 100644
--- a/src/cpp/models/include/models/results.h
+++ b/src/cpp/models/include/models/results.h
@@ -433,7 +433,6 @@ class Scene {
 
     //std::unique_ptr<ClassificationResult> classification_result;
     std::unique_ptr<InstanceSegmentationResult> instance_segmentation_result;
-    std::unique_ptr<ImageResult> image_result;
 
     std::vector<Box> boxes;
     std::vector<DetectedKeypoints> poses;
diff --git a/src/cpp/models/include/models/segmentation_model.h b/src/cpp/models/include/models/segmentation_model.h
index d802e026..121cbca1 100644
--- a/src/cpp/models/include/models/segmentation_model.h
+++ b/src/cpp/models/include/models/segmentation_model.h
@@ -37,7 +37,7 @@ class SegmentationModel : public BaseModel {
     virtual std::vector<std::unique_ptr<Scene>> inferBatch(const std::vector<ImageInputData>& inputImgs);
 
     static std::string ModelType;
-    std::vector<Contour> getContours(const ImageResultWithSoftPrediction& imageResult);
+    std::vector<Contour> getContours(const std::unique_ptr<Scene>& scene);
 
 protected:
     void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
diff --git a/src/cpp/models/src/segmentation_model.cpp b/src/cpp/models/src/segmentation_model.cpp
index 19ad5158..cda33d53 100644
--- a/src/cpp/models/src/segmentation_model.cpp
+++ b/src/cpp/models/src/segmentation_model.cpp
@@ -260,42 +260,37 @@ std::unique_ptr<Scene> SegmentationModel::postprocess(InferenceResult& infResult
                cv::INTER_NEAREST);
 
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
+    scene->masks["hard_prediction"] = hard_prediction;
     if (return_soft_prediction) {
-        ImageResultWithSoftPrediction* result =
-            new ImageResultWithSoftPrediction(infResult.frameId, infResult.metaData);
-        result->resultImage = hard_prediction;
         cv::resize(soft_prediction,
                    soft_prediction,
                    {inputImgSize.inputImgWidth, inputImgSize.inputImgHeight},
                    0.0,
                    0.0,
                    cv::INTER_NEAREST);
-        result->soft_prediction = soft_prediction;
+        scene->masks["soft_prediction"] = soft_prediction;
         auto iter = infResult.outputsData.find(feature_vector_name);
         if (infResult.outputsData.end() != iter) {
-            result->saliency_map = get_activation_map(soft_prediction);
-            result->feature_vector = iter->second;
+            scene->saliency_maps.push_back(get_activation_map(soft_prediction));
+            scene->feature_vectors.push_back(iter->second);
         }
-        scene->image_result = std::unique_ptr<ImageResult>(result);
-    } else {
-        auto result = std::make_unique<ImageResult>(infResult.frameId, infResult.metaData);
-        result->resultImage = hard_prediction;
-        scene->image_result = std::move(result);
     }
     return scene;
 }
 
-std::vector<Contour> SegmentationModel::getContours(const ImageResultWithSoftPrediction& imageResult) {
-    if (imageResult.soft_prediction.channels() == 1) {
+std::vector<Contour> SegmentationModel::getContours(const std::unique_ptr<Scene>& scene) {
+    auto soft_prediction = scene->masks["soft_prediction"];
+    auto hard_prediction = scene->masks["hard_prediction"];
+    if (soft_prediction.channels() == 1) {
         throw std::runtime_error{"Cannot get contours from soft prediction with 1 layer"};
     }
 
     std::vector<Contour> combined_contours = {};
     cv::Mat label_index_map;
     cv::Mat current_label_soft_prediction;
-    for (int index = 1; index < imageResult.soft_prediction.channels(); index++) {
-        cv::extractChannel(imageResult.soft_prediction, current_label_soft_prediction, index);
-        cv::inRange(imageResult.resultImage,
+    for (int index = 1; index < soft_prediction.channels(); index++) {
+        cv::extractChannel(soft_prediction, current_label_soft_prediction, index);
+        cv::inRange(hard_prediction,
                     cv::Scalar(index, index, index),
                     cv::Scalar(index, index, index),
                     label_index_map);
@@ -305,9 +300,9 @@ std::vector<Contour> SegmentationModel::getContours(const ImageResultWithSoftPre
         std::string label = getLabelName(index - 1);
 
         for (unsigned int i = 0; i < contours.size(); i++) {
-            cv::Mat mask = cv::Mat::zeros(imageResult.resultImage.rows,
-                                          imageResult.resultImage.cols,
-                                          imageResult.resultImage.type());
+            cv::Mat mask = cv::Mat::zeros(hard_prediction.rows,
+                                          hard_prediction.cols,
+                                          hard_prediction.type());
             cv::drawContours(mask, contours, i, 255, -1);
             float probability = (float)cv::mean(current_label_soft_prediction, mask)[0];
             combined_contours.push_back({label, probability, contours[i]});
diff --git a/src/cpp/tilers/src/semantic_segmentation.cpp b/src/cpp/tilers/src/semantic_segmentation.cpp
index ac7ff5b7..846f247d 100644
--- a/src/cpp/tilers/src/semantic_segmentation.cpp
+++ b/src/cpp/tilers/src/semantic_segmentation.cpp
@@ -56,8 +56,7 @@ std::unique_ptr<Scene> SemanticSegmentationTiler::run(const ImageInputData& inpu
 
 std::unique_ptr<Scene> SemanticSegmentationTiler::postprocess_tile(std::unique_ptr<Scene> tile_result,
                                                                         const cv::Rect&) {
-    ImageResultWithSoftPrediction* soft = dynamic_cast<ImageResultWithSoftPrediction*>(tile_result->image_result.get());
-    if (!soft) {
+    if (tile_result->masks.find("soft_prediction") == tile_result->masks.end()){
         throw std::runtime_error(
             "SemanticSegmentationTiler requires the underlying model to return ImageResultWithSoftPrediction");
     }
@@ -73,15 +72,15 @@ std::unique_ptr<Scene> SemanticSegmentationTiler::merge_results(
     }
 
     cv::Mat voting_mask(cv::Size(image_size.width, image_size.height), CV_32SC1, cv::Scalar(0));
-    auto* sseg_res = static_cast<ImageResultWithSoftPrediction*>(tiles_results[0]->image_result.get());
+    auto first_soft_prediction = tiles_results[0]->masks["soft_prediction"];
     cv::Mat merged_soft_prediction(cv::Size(image_size.width, image_size.height),
-                                   CV_32FC(sseg_res->soft_prediction.channels()),
+                                   CV_32FC(first_soft_prediction.channels()),
                                    cv::Scalar(0));
 
     for (size_t i = 0; i < tiles_results.size(); ++i) {
-        auto* sseg_res = static_cast<ImageResultWithSoftPrediction*>(tiles_results[i]->image_result.get());
+        auto soft_prediction = tiles_results[i]->masks["soft_prediction"];
         voting_mask(tile_coords[i]) += 1;
-        merged_soft_prediction(tile_coords[i]) += sseg_res->soft_prediction;
+        merged_soft_prediction(tile_coords[i]) += soft_prediction;
     }
 
     normalize_soft_prediction(merged_soft_prediction, voting_mask);
@@ -90,11 +89,7 @@ std::unique_ptr<Scene> SemanticSegmentationTiler::merge_results(
         create_hard_prediction_from_soft_prediction(merged_soft_prediction, soft_threshold, blur_strength);
 
     auto scene = std::make_unique<Scene>();
-    auto result = std::make_unique<ImageResultWithSoftPrediction>();
-    result->resultImage = hard_prediction;
-    if (return_soft_prediction) {
-        result->soft_prediction = merged_soft_prediction;
-    }
-    scene->image_result = std::move(result);
+    scene->masks["hard_prediction"] = hard_prediction;
+    scene->masks["soft_prediction"] = merged_soft_prediction;
     return scene;
 }
diff --git a/tests/cpp/accuracy/test_accuracy.cpp b/tests/cpp/accuracy/test_accuracy.cpp
index eee95e17..24a655bd 100644
--- a/tests/cpp/accuracy/test_accuracy.cpp
+++ b/tests/cpp/accuracy/test_accuracy.cpp
@@ -208,17 +208,16 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         pred = model->infer(image);
                     }
 
-                    ImageResultWithSoftPrediction* soft = dynamic_cast<ImageResultWithSoftPrediction*>(pred->image_result.get());
-                    if (soft) {
-                        const std::vector<Contour>& contours = model->getContours(*soft);
+                    if (pred->masks.find("soft_prediction") != pred->masks.end()) {
+                        cv::Mat soft = pred->masks["soft_prediction"];
+                        const std::vector<Contour>& contours = model->getContours(pred);
                         std::stringstream ss;
-                        ss << *soft << "; ";
                         for (const Contour& contour : contours) {
                             ss << contour << ", ";
                         }
                         ASSERT_EQ(ss.str(), modelData.testData[i].reference[0]);
                     } else {
-                        ASSERT_EQ(std::string{*pred->image_result}, modelData.testData[i].reference[0]);
+                        ASSERT_EQ(std::string{*pred}, modelData.testData[i].reference[0]);
                     }
                 }
             }
diff --git a/tests/python/accuracy/public_scope.json b/tests/python/accuracy/public_scope.json
index 537d4ac6..6e99ed23 100644
--- a/tests/python/accuracy/public_scope.json
+++ b/tests/python/accuracy/public_scope.json
@@ -6,7 +6,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "0: 0.537, 1: 0.463, [426,640,2], [0], [0]; object: 0.675, 508, object: 0.527, 65, object: 0.507, 18, object: 0.624, 144, object: 0.538, 67, object: 0.507, 15, object: 0.518, 41, object: 0.507, 8, object: 0.505, 14, object: 0.885, 2138, "
+          "object: 0.675, 508, object: 0.527, 65, object: 0.507, 18, object: 0.624, 144, object: 0.538, 67, object: 0.507, 15, object: 0.518, 41, object: 0.507, 8, object: 0.505, 14, object: 0.885, 2138, "
         ]
       }
     ]
@@ -18,7 +18,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "0: 0.992, 1: 0.008, [426,640,2], [0], [0]; object: 0.555, 112, object: 0.506, 17, object: 0.555, 154, object: 0.511, 19, object: 0.514, 52, "
+          "object: 0.555, 112, object: 0.506, 17, object: 0.555, 154, object: 0.511, 19, object: 0.514, 52, "
         ]
       }
     ]
@@ -30,7 +30,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "0: 0.563, 1: 0.437, [426,640,2], [0], [0]; object: 0.520, 26, object: 0.530, 42, object: 0.501, 4, object: 0.507, 27, object: 0.503, 8, object: 0.502, 6, object: 0.505, 18, object: 0.504, 13, object: 0.524, 87, object: 0.521, 89, object: 0.757, 2706, "
+          "object: 0.520, 26, object: 0.530, 42, object: 0.501, 4, object: 0.507, 27, object: 0.503, 8, object: 0.502, 6, object: 0.505, 18, object: 0.504, 13, object: 0.524, 87, object: 0.521, 89, object: 0.757, 2706, "
         ]
       }
     ]
@@ -42,7 +42,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "0: 0.563, 1: 0.437, [426,640,2], [0], [0]; object: 0.520, 26, object: 0.530, 42, object: 0.501, 4, object: 0.507, 27, object: 0.503, 8, object: 0.502, 6, object: 0.505, 18, object: 0.504, 13, object: 0.524, 87, object: 0.521, 89, object: 0.757, 2706, "
+          "object: 0.520, 26, object: 0.530, 42, object: 0.501, 4, object: 0.507, 27, object: 0.503, 8, object: 0.502, 6, object: 0.505, 18, object: 0.504, 13, object: 0.524, 87, object: 0.521, 89, object: 0.757, 2706, "
         ]
       }
     ]
@@ -55,7 +55,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "0: 0.561, 1: 0.439, [426,640,2], [0], [0]; object: 0.519, 26, object: 0.531, 42, object: 0.502, 21, object: 0.505, 9, object: 0.501, 4, object: 0.509, 22, object: 0.524, 85, object: 0.520, 93, object: 0.754, 2564, "
+          "object: 0.519, 26, object: 0.531, 42, object: 0.502, 21, object: 0.505, 9, object: 0.501, 4, object: 0.509, 22, object: 0.524, 85, object: 0.520, 93, object: 0.754, 2564, "
         ]
       }
     ]
@@ -67,7 +67,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "0: 0.944, 1: 0.056, [426,640,2], [0], [0]; object: 0.505, 2, object: 0.518, 8, object: 0.512, 5, object: 0.506, 4, object: 0.526, 8, object: 0.529, 21, object: 0.513, 12, object: 0.535, 49, object: 0.505, 2, object: 0.512, 4, object: 0.547, 6, object: 0.511, 6, object: 0.503, 1, object: 0.539, 6, object: 0.543, 39, object: 0.529, 2, object: 0.516, 9, object: 0.565, 157, object: 0.524, 6, object: 0.528, 15, object: 0.521, 18, object: 0.502, 1, object: 0.537, 73, object: 0.513, 4, object: 0.524, 27, object: 0.513, 6, object: 0.538, 65, object: 0.501, 6, object: 0.504, 1, object: 0.507, 4, object: 0.502, 1, object: 0.518, 8, object: 0.530, 11, object: 0.502, 2, object: 0.516, 2, object: 0.506, 1, object: 0.567, 17, object: 0.502, 1, object: 0.512, 7, object: 0.538, 24, object: 0.507, 1, object: 0.534, 12, object: 0.537, 6, object: 0.519, 13, object: 0.505, 2, object: 0.517, 16, object: 0.505, 5, object: 0.506, 20, object: 0.508, 6, object: 0.519, 24, object: 0.507, 4, object: 0.506, 2, object: 0.511, 4, object: 0.556, 47, object: 0.510, 10, object: 0.500, 1, object: 0.504, 5, object: 0.501, 1, object: 0.510, 6, object: 0.549, 13, object: 0.509, 2, object: 0.510, 3, object: 0.514, 1, object: 0.529, 15, object: 0.551, 110, object: 0.504, 2, object: 0.503, 3, object: 0.518, 16, object: 0.511, 14, object: 0.502, 1, object: 0.523, 1, object: 0.533, 16, object: 0.568, 66, object: 0.582, 1793, "
+          "object: 0.505, 2, object: 0.518, 8, object: 0.512, 5, object: 0.506, 4, object: 0.526, 8, object: 0.529, 21, object: 0.513, 12, object: 0.535, 49, object: 0.505, 2, object: 0.512, 4, object: 0.547, 6, object: 0.511, 6, object: 0.503, 1, object: 0.539, 6, object: 0.543, 39, object: 0.529, 2, object: 0.516, 9, object: 0.565, 157, object: 0.524, 6, object: 0.528, 15, object: 0.521, 18, object: 0.502, 1, object: 0.537, 73, object: 0.513, 4, object: 0.524, 27, object: 0.513, 6, object: 0.538, 65, object: 0.501, 6, object: 0.504, 1, object: 0.507, 4, object: 0.502, 1, object: 0.518, 8, object: 0.530, 11, object: 0.502, 2, object: 0.516, 2, object: 0.506, 1, object: 0.567, 17, object: 0.502, 1, object: 0.512, 7, object: 0.538, 24, object: 0.507, 1, object: 0.534, 12, object: 0.537, 6, object: 0.519, 13, object: 0.505, 2, object: 0.517, 16, object: 0.505, 5, object: 0.506, 20, object: 0.508, 6, object: 0.519, 24, object: 0.507, 4, object: 0.506, 2, object: 0.511, 4, object: 0.556, 47, object: 0.510, 10, object: 0.500, 1, object: 0.504, 5, object: 0.501, 1, object: 0.510, 6, object: 0.549, 13, object: 0.509, 2, object: 0.510, 3, object: 0.514, 1, object: 0.529, 15, object: 0.551, 110, object: 0.504, 2, object: 0.503, 3, object: 0.518, 16, object: 0.511, 14, object: 0.502, 1, object: 0.523, 1, object: 0.533, 16, object: 0.568, 66, object: 0.582, 1793, "
         ]
       }
     ]
@@ -297,7 +297,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "0: 1.000, 1: 0.000, [426,640,3], [426,640,3], [1,600,1,1]; backpack: 0.505, 2, "
+          "backpack: 0.505, 2, "
         ]
       }
     ]
@@ -439,7 +439,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "0: 0.272, 1: 0.728, [3500,3500,5], [0], [0]; background: 1.404, 311, background: 1.397, 44, background: 1.371, 34, background: 1.377, 12, background: 1.356, 155, background: 1.345, 12, background: 1.183, 219, background: 1.524, 8, background: 1.533, 4, background: 1.519, 2, background: 1.524, 4, background: 1.530, 6, background: 1.537, 2, background: 1.514, 4, background: 1.519, 8, background: 1.529, 6, background: 1.550, 6, background: 1.558, 4, background: 1.520, 2, background: 1.529, 4, background: 1.532, 6, background: 1.535, 6, background: 1.530, 2, background: 1.529, 50, background: 1.528, 22, background: 1.527, 38, background: 1.451, 1476, background: 1.345, 2743, background: 1.609, 2987, background: 1.636, 29909, "
+          "background: 1.404, 311, background: 1.397, 44, background: 1.371, 34, background: 1.377, 12, background: 1.356, 155, background: 1.345, 12, background: 1.183, 219, background: 1.524, 8, background: 1.533, 4, background: 1.519, 2, background: 1.524, 4, background: 1.530, 6, background: 1.537, 2, background: 1.514, 4, background: 1.519, 8, background: 1.529, 6, background: 1.550, 6, background: 1.558, 4, background: 1.520, 2, background: 1.529, 4, background: 1.532, 6, background: 1.535, 6, background: 1.530, 2, background: 1.529, 50, background: 1.528, 22, background: 1.527, 38, background: 1.451, 1476, background: 1.345, 2743, background: 1.609, 2987, background: 1.636, 29909, "
         ]
       }
     ]

From f7eb382e05770691738187064bddfd7be728546f Mon Sep 17 00:00:00 2001
From: Ronald Hecker <ronald.hecker@intel.com>
Date: Mon, 12 May 2025 08:05:55 +0200
Subject: [PATCH 09/16] Migrate instance segmentation to the new Scene object

---
 .../include/models/instance_segmentation.h    |  2 +-
 src/cpp/models/include/models/results.h       | 97 +++++++++++++++++--
 src/cpp/models/src/anomaly_model.cpp          |  4 +-
 src/cpp/models/src/classification_model.cpp   | 12 +--
 src/cpp/models/src/detection_model_ssd.cpp    |  6 +-
 src/cpp/models/src/detection_model_yolo.cpp   |  6 +-
 .../src/detection_model_yolov3_onnx.cpp       |  2 +-
 src/cpp/models/src/detection_model_yolox.cpp  |  2 +-
 src/cpp/models/src/instance_segmentation.cpp  | 50 +++++-----
 .../include/tilers/instance_segmentation.h    |  2 +-
 src/cpp/tilers/src/detection.cpp              |  4 +-
 src/cpp/tilers/src/instance_segmentation.cpp  | 68 ++++++-------
 tests/cpp/accuracy/test_accuracy.cpp          | 20 ++--
 tests/python/accuracy/public_scope.json       | 12 +--
 14 files changed, 181 insertions(+), 106 deletions(-)

diff --git a/src/cpp/models/include/models/instance_segmentation.h b/src/cpp/models/include/models/instance_segmentation.h
index 21c94690..48b742dc 100644
--- a/src/cpp/models/include/models/instance_segmentation.h
+++ b/src/cpp/models/include/models/instance_segmentation.h
@@ -49,4 +49,4 @@ class MaskRCNNModel : public BaseModel {
     float confidence_threshold = 0.5f;
 };
 
-cv::Mat segm_postprocess(const SegmentedObject& box, const cv::Mat& unpadded, int im_h, int im_w);
+cv::Mat segm_postprocess(const Mask& box, const cv::Mat& unpadded, int im_h, int im_w);
diff --git a/src/cpp/models/include/models/results.h b/src/cpp/models/include/models/results.h
index c1d4cf19..61510ac8 100644
--- a/src/cpp/models/include/models/results.h
+++ b/src/cpp/models/include/models/results.h
@@ -381,22 +381,58 @@ struct KeypointDetectionResult : public ResultBase {
 
 class Label {
 public:
-    Label(std::string id, std::string name, float score):  id(id), name(name), score(score) {}
+    Label(int id, std::string name):  id(id), name(name) {}
 
-    std::string id;
+    int id;
     std::string name;
-    float score;
 
     friend std::ostream& operator<< (std::ostream& os, const Label& label) {
-        return os << label.id << " (" << label.name << "): " << std::fixed << std::setprecision(3) << label.score;
+        return os << label.id << " (" << label.name << ")";
+    }
+};
+
+class LabelScore {
+public:
+    Label label;  // the predicted class (id and name)
+    float score;  // score attached to that label
+
+    LabelScore(int id, std::string name, float score): label(id, name), score(score) {}
+    LabelScore(Label label, float score): label(label), score(score) {}
+    // Streams the label, then ": ", then the score in fixed notation with 3 decimals.
+    friend std::ostream& operator<< (std::ostream& os, const LabelScore& entry) {
+        return os << entry.label << ": " << std::fixed << std::setprecision(3) << entry.score;
+    }
+};
 
+class Mask {
+public:
+    LabelScore label;  // label and score of the segmented instance
+    cv::Rect roi;      // rectangle the instance occupies
+    cv::Mat mask;      // mask image of the instance
+
+    Mask(LabelScore label, cv::Rect roi, cv::Mat mask): label(label), roi(roi), mask(mask) {}
+};
+
+static inline std::vector<Contour> getContours(const std::vector<Mask>& segmentedObjects) {
+    std::vector<Contour> outlines;
+    std::vector<std::vector<cv::Point>> found;  // scratch buffer reused across iterations
+    for (const Mask& instance : segmentedObjects) {
+        cv::findContours(instance.mask, found, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_NONE);
+        // Every instance is expected to yield exactly one external contour (assumption
+        // taken from OTX); any other count is reported as an error.
+        if (found.size() != 1) {
+            throw std::runtime_error("findContours() must have returned only one contour");
+        }
+        outlines.push_back({instance.label.label.name, instance.label.score, found.front()});
+    }
+    return outlines;
+}
+
 class Box {
 public:
-    Box(cv::Rect shape, std::vector<Label> labels): shape(shape), labels(labels) {}
+    Box(cv::Rect shape, std::vector<LabelScore> labels): shape(shape), labels(labels) {}
     cv::Rect shape;
-    std::vector<Label> labels;
+    std::vector<LabelScore> labels;
 
     friend std::ostream& operator<< (std::ostream& os, const Box& box) {
 
@@ -422,6 +458,51 @@ class Box {
     }
 };
 
+class RotatedRect {
+public:
+    LabelScore label;        // label and score carried over from the source mask
+    cv::RotatedRect shape;   // rotated rectangle geometry
+
+    // Prints "RotatedRect: cx, cy, w, h, angle" (3 decimals) followed directly by the label and "; ".
+    friend std::ostream& operator<< (std::ostream& os, const RotatedRect& box) {
+        const cv::RotatedRect& r = box.shape;
+        os << "RotatedRect: " << std::fixed << std::setprecision(3);
+        os << r.center.x << ", " << r.center.y << ", ";
+        os << r.size.width << ", " << r.size.height << ", " << r.angle;
+        os << box.label << "; ";
+        return os;
+    }
+
+    explicit operator std::string() {
+        std::stringstream out;
+        out << *this;
+        return out.str();
+    }
+};
+
+static inline std::vector<RotatedRect> get_rotated_rects(const std::vector<Mask>& masks) {  // read-only: borrow, don't copy
+    std::vector<RotatedRect> result;
+    result.reserve(masks.size());  // upper bound: masks without any contour point are skipped below
+    for (const Mask& m : masks) {
+        cv::Mat mask;
+        m.mask.convertTo(mask, CV_8UC1);  // findContours is fed an 8-bit single-channel copy of the mask
+        std::vector<std::vector<cv::Point>> contours;
+        cv::findContours(mask, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE);
+        // Pool the points of every external contour so a single rectangle covers the whole instance.
+        std::vector<cv::Point> contour;
+        for (const auto& c : contours) {
+            contour.insert(contour.end(), c.begin(), c.end());
+        }
+        if (!contour.empty()) {
+            std::vector<cv::Point> hull;
+            cv::convexHull(contour, hull);
+
+            result.push_back(RotatedRect{m.label, cv::minAreaRect(hull)});
+        }
+    }
+    return result;
+}
+
 class Scene {
 public:
     Scene(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
@@ -432,7 +513,7 @@ class Scene {
     std::shared_ptr<MetaData> metaData;
 
     //std::unique_ptr<ClassificationResult> classification_result;
-    std::unique_ptr<InstanceSegmentationResult> instance_segmentation_result;
+    //std::unique_ptr<InstanceSegmentationResult> instance_segmentation_result;
 
     std::vector<Box> boxes;
     std::vector<DetectedKeypoints> poses;
@@ -440,6 +521,8 @@ class Scene {
     std::vector<cv::Mat> saliency_maps;
     std::vector<ov::Tensor> feature_vectors;
 
+    std::vector<Mask> new_masks;
+
     std::map<std::string, ov::Tensor> additional_tensors;
     std::map<std::string, cv::Mat> masks;
 
diff --git a/src/cpp/models/src/anomaly_model.cpp b/src/cpp/models/src/anomaly_model.cpp
index ec826cae..35aa2e45 100644
--- a/src/cpp/models/src/anomaly_model.cpp
+++ b/src/cpp/models/src/anomaly_model.cpp
@@ -90,7 +90,7 @@ std::unique_ptr<Scene> AnomalyModel::postprocess(InferenceResult& infResult) {
     scene->boxes.push_back(
         Box(
             cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight),
-            {Label(std::to_string(label_id), pred_label, pred_score)}
+            {LabelScore(label_id, pred_label, pred_score)}
         )
     );
 
@@ -103,7 +103,7 @@ std::unique_ptr<Scene> AnomalyModel::postprocess(InferenceResult& infResult) {
             scene->boxes.push_back(
                 Box(
                     rect,
-                    {Label(std::to_string(label_id), pred_label, box_score)}
+                    {LabelScore(label_id, pred_label, box_score)}
                 )
             );
 
diff --git a/src/cpp/models/src/classification_model.cpp b/src/cpp/models/src/classification_model.cpp
index 60c2c2ea..bf8e3840 100644
--- a/src/cpp/models/src/classification_model.cpp
+++ b/src/cpp/models/src/classification_model.cpp
@@ -328,7 +328,7 @@ std::unique_ptr<Scene> ClassificationModel::get_multilabel_predictions(Inference
 
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
     auto raw_scores = ov::Tensor();
-    std::vector<Label> result;
+    std::vector<LabelScore> result;
     float* raw_scoresPtr = nullptr;
     if (add_raw_scores) {
         raw_scores = ov::Tensor(logitsTensor.get_element_type(), logitsTensor.get_shape());
@@ -339,7 +339,7 @@ std::unique_ptr<Scene> ClassificationModel::get_multilabel_predictions(Inference
     for (size_t i = 0; i < labels.size(); ++i) {
         float score = sigmoid(logitsPtr[i]);
         if (score > confidence_threshold) {
-            result.emplace_back(std::to_string(i), labels[i], score);
+            result.emplace_back(i, labels[i], score);
         }
         if (add_raw_scores) {
             raw_scoresPtr[i] = score;
@@ -401,9 +401,9 @@ std::unique_ptr<Scene> ClassificationModel::get_hierarchical_predictions(Inferen
     }
 
     auto resolved_labels = resolver->resolve_labels(predicted_labels, predicted_scores);
-    std::vector<Label> result;
+    std::vector<LabelScore> result;
     for (const auto& label : resolved_labels) {
-        result.push_back(Label(std::to_string(hierarchical_info.label_to_idx[label.first]), label.first, label.second));
+        result.push_back(LabelScore(hierarchical_info.label_to_idx[label.first], label.first, label.second));
     }
     const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
     cv::Rect shape(0, 0, internalData.inputImgWidth, internalData.inputImgHeight);
@@ -449,13 +449,13 @@ std::unique_ptr<Scene> ClassificationModel::get_multiclass_predictions(Inference
         scene->additional_tensors["raw_scores"] = raw_scores;
     }
 
-    std::vector<Label> result;
+    std::vector<LabelScore> result;
     for (size_t i = 0; i < scoresTensor.get_size(); ++i) {
         int ind = indicesPtr[i];
         if (ind < 0 || ind >= static_cast<int>(labels.size())) {
             throw std::runtime_error("Invalid index for the class label is found during postprocessing");
         }
-        result.emplace_back(std::to_string(ind), labels[ind], scoresPtr[i]);
+        result.emplace_back(ind, labels[ind], scoresPtr[i]);
     }
 
     const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
diff --git a/src/cpp/models/src/detection_model_ssd.cpp b/src/cpp/models/src/detection_model_ssd.cpp
index 3013869a..7a938169 100644
--- a/src/cpp/models/src/detection_model_ssd.cpp
+++ b/src/cpp/models/src/detection_model_ssd.cpp
@@ -108,6 +108,7 @@ std::unique_ptr<Scene> ModelSSD::postprocess(InferenceResult& infResult) {
     if (feature_vector_iter != infResult.outputsData.end()) {
         result->feature_vectors.push_back(std::move(feature_vector_iter->second));
     }
+
     return result;
 }
 
@@ -162,7 +163,7 @@ std::unique_ptr<Scene> ModelSSD::postprocessSingleOutput(InferenceResult& infRes
                         0.f,
                         floatInputImgHeight) - y
                 ),
-                {Label(std::to_string(labelID), getLabelName(labelID), confidence)}
+                {LabelScore(labelID, getLabelName(labelID), confidence)}
             );
             scene->boxes.push_back(box);
         }
@@ -199,6 +200,7 @@ std::unique_ptr<Scene> ModelSSD::postprocessMultipleOutputs(InferenceResult& inf
     float widthScale = scores ? netInputWidth : 1.0f;
     float heightScale = scores ? netInputHeight : 1.0f;
 
+
     for (size_t i = 0; i < numAndStep.detectionsNum; i++) {
         float confidence = scores ? scores[i] : boxes[i * numAndStep.objectSize + 4];
 
@@ -222,7 +224,7 @@ std::unique_ptr<Scene> ModelSSD::postprocessMultipleOutputs(InferenceResult& inf
             if (width * height >= box_area_threshold) {
                 scene->boxes.push_back(Box(
                   cv::Rect(x, y, width, height),
-                  {Label(std::to_string(labels[i]), getLabelName(labels[i]), confidence)}
+                  {LabelScore(labels[i], getLabelName(labels[i]), confidence)}
                 ));
             }
         }
diff --git a/src/cpp/models/src/detection_model_yolo.cpp b/src/cpp/models/src/detection_model_yolo.cpp
index 229c033d..1ca92984 100644
--- a/src/cpp/models/src/detection_model_yolo.cpp
+++ b/src/cpp/models/src/detection_model_yolo.cpp
@@ -282,7 +282,7 @@ std::unique_ptr<Scene> ModelYolo::postprocess(InferenceResult& infResult) {
         for (const auto& obj1 : objects) {
             bool isGoodResult = true;
             for (const auto& obj2 : objects) {
-                if (obj1.labels[0].id == obj2.labels[0].id && obj1.labels[0].score < obj2.labels[0].score &&
+                if (obj1.labels[0].label.id == obj2.labels[0].label.id && obj1.labels[0].score < obj2.labels[0].score &&
                     intersectionOverUnion(obj1, obj2) >= iou_threshold) {  // if obj1 is the same as obj2, condition
                                                                            // expression will evaluate to false anyway
                     isGoodResult = false;
@@ -410,7 +410,7 @@ void ModelYolo::parseYOLOOutput(const std::string& output_name,
 
                     //--- Checking confidence threshold conformance and adding region to the list
                     if (prob >= confidence_threshold) {
-                        objects.push_back(Box(obj, {Label(std::to_string(j), getLabelName(j), prob)}));
+                        objects.push_back(Box(obj, {LabelScore(j, getLabelName(j), prob)}));
                     }
                 }
             }
@@ -635,7 +635,7 @@ std::unique_ptr<Scene> YOLOv5::postprocess(InferenceResult& infResult) {
 
         scene->boxes.push_back(Box(
             cv::Rect(x, y, width, height),
-            {Label(std::to_string(labelID), label, confidence)}
+            {LabelScore(labelID, label, confidence)}
         ));
     }
 
diff --git a/src/cpp/models/src/detection_model_yolov3_onnx.cpp b/src/cpp/models/src/detection_model_yolov3_onnx.cpp
index 348d3073..fe51302a 100644
--- a/src/cpp/models/src/detection_model_yolov3_onnx.cpp
+++ b/src/cpp/models/src/detection_model_yolov3_onnx.cpp
@@ -164,7 +164,7 @@ std::unique_ptr<Scene> ModelYoloV3ONNX::postprocess(InferenceResult& infResult)
             obj.height = clamp(height, 0.f, static_cast<float>(imgHeight));
             obj.width = clamp(width, 0.f, static_cast<float>(imgWidth));
 
-            scene->boxes.push_back(Box(obj, {Label(std::to_string(classInd), getLabelName(classInd), score)}));
+            scene->boxes.push_back(Box(obj, {LabelScore(classInd, getLabelName(classInd), score)}));
         }
     }
     return scene;
diff --git a/src/cpp/models/src/detection_model_yolox.cpp b/src/cpp/models/src/detection_model_yolox.cpp
index 0dd333e4..fcac9cb4 100644
--- a/src/cpp/models/src/detection_model_yolox.cpp
+++ b/src/cpp/models/src/detection_model_yolox.cpp
@@ -195,7 +195,7 @@ std::unique_ptr<Scene> ModelYoloX::postprocess(InferenceResult& infResult) {
         obj.width =
             clamp(validBoxes[index].right - validBoxes[index].left, 0.f, static_cast<float>(scale.inputImgWidth));
         scene->boxes.push_back(
-            Box(obj, {Label(std::to_string(classes[index]), getLabelName(classes[index]), scores[index])})
+            Box(obj, {LabelScore(classes[index], getLabelName(classes[index]), scores[index])})
         );
     }
     return scene;
diff --git a/src/cpp/models/src/instance_segmentation.cpp b/src/cpp/models/src/instance_segmentation.cpp
index 5af1bf3a..2808195a 100644
--- a/src/cpp/models/src/instance_segmentation.cpp
+++ b/src/cpp/models/src/instance_segmentation.cpp
@@ -43,8 +43,8 @@ cv::Rect expand_box(const cv::Rect2f& box, float scale) {
             cv::Point(int(center.x + w_half), int(center.y + h_half))};
 }
 
-std::vector<cv::Mat_<std::uint8_t>> average_and_normalize(const std::vector<std::vector<cv::Mat>>& saliency_maps) {
-    std::vector<cv::Mat_<std::uint8_t>> aggregated;
+std::vector<cv::Mat> average_and_normalize(const std::vector<std::vector<cv::Mat>>& saliency_maps) {
+    std::vector<cv::Mat> aggregated;
     aggregated.reserve(saliency_maps.size());
     for (const std::vector<cv::Mat>& per_object_maps : saliency_maps) {
         if (per_object_maps.empty()) {
@@ -111,11 +111,11 @@ Lbm filterTensors(const std::map<std::string, ov::Tensor>& infResult) {
 }
 }  // namespace
 
-cv::Mat segm_postprocess(const SegmentedObject& box, const cv::Mat& unpadded, int im_h, int im_w) {
+cv::Mat segm_postprocess(const Mask& box, const cv::Mat& unpadded, int im_h, int im_w) {
     // Add zero border to prevent upsampling artifacts on segment borders.
     cv::Mat raw_cls_mask;
     cv::copyMakeBorder(unpadded, raw_cls_mask, 1, 1, 1, 1, cv::BORDER_CONSTANT, {0});
-    cv::Rect extended_box = expand_box(box, float(raw_cls_mask.cols) / (raw_cls_mask.cols - 2));
+    cv::Rect extended_box = expand_box(box.roi, float(raw_cls_mask.cols) / (raw_cls_mask.cols - 2));
 
     int w = std::max(extended_box.width + 1, 1);
     int h = std::max(extended_box.height + 1, 1);
@@ -302,7 +302,6 @@ std::unique_ptr<Scene> MaskRCNNModel::postprocess(InferenceResult& infResult) {
     const cv::Size& masks_size{int(lbm.masks.get_shape()[3]), int(lbm.masks.get_shape()[2])};
 
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
-    auto result = std::make_unique<InstanceSegmentationResult>(infResult.frameId, infResult.metaData);
 
     std::vector<std::vector<cv::Mat>> saliency_maps;
     bool has_feature_vector_name =
@@ -318,47 +317,48 @@ std::unique_ptr<Scene> MaskRCNNModel::postprocess(InferenceResult& infResult) {
         if (confidence <= confidence_threshold && !has_feature_vector_name) {
             continue;
         }
-        SegmentedObject obj;
-
-        obj.confidence = confidence;
-        obj.labelID = labels_tensor_ptr[i] + 1;
-        if (!labels.empty() && obj.labelID >= labels.size()) {
+        size_t labelID = labels_tensor_ptr[i] + 1;
+        if (!labels.empty() && labelID >= labels.size()) {
             continue;
         }
-        obj.label = getLabelName(obj.labelID);
 
-        obj.x = clamp(round((boxes[i * objectSize + 0] - padLeft) * invertedScaleX), 0.f, floatInputImgWidth);
-        obj.y = clamp(round((boxes[i * objectSize + 1] - padTop) * invertedScaleY), 0.f, floatInputImgHeight);
-        obj.width =
-            clamp(round((boxes[i * objectSize + 2] - padLeft) * invertedScaleX - obj.x), 0.f, floatInputImgWidth);
-        obj.height =
-            clamp(round((boxes[i * objectSize + 3] - padTop) * invertedScaleY - obj.y), 0.f, floatInputImgHeight);
+        LabelScore label(labelID, getLabelName(labelID), confidence);
+
+        cv::Rect roi;
 
-        if (obj.height * obj.width <= 1) {
+        roi.x = clamp(round((boxes[i * objectSize + 0] - padLeft) * invertedScaleX), 0.f, floatInputImgWidth);
+        roi.y = clamp(round((boxes[i * objectSize + 1] - padTop) * invertedScaleY), 0.f, floatInputImgHeight);
+        roi.width =
+            clamp(round((boxes[i * objectSize + 2] - padLeft) * invertedScaleX - roi.x), 0.f, floatInputImgWidth);
+        roi.height =
+            clamp(round((boxes[i * objectSize + 3] - padTop) * invertedScaleY - roi.y), 0.f, floatInputImgHeight);
+
+        if (roi.height * roi.width <= 1) {
             continue;
         }
 
+        Mask mask(label, roi, cv::Mat());
+
         cv::Mat raw_cls_mask{masks_size, CV_32F, masks + masks_size.area() * i};
         cv::Mat resized_mask;
         if (postprocess_semantic_masks || has_feature_vector_name) {
-            resized_mask = segm_postprocess(obj, raw_cls_mask, internalData.inputImgHeight, internalData.inputImgWidth);
+            resized_mask = segm_postprocess(mask, raw_cls_mask, internalData.inputImgHeight, internalData.inputImgWidth);
         } else {
             resized_mask = raw_cls_mask;
         }
-        obj.mask = postprocess_semantic_masks ? resized_mask : raw_cls_mask.clone();
+        mask.mask = postprocess_semantic_masks ? resized_mask : raw_cls_mask.clone();
         if (confidence > confidence_threshold) {
-            result->segmentedObjects.push_back(obj);
+            scene->new_masks.push_back(mask);
         }
         if (has_feature_vector_name && confidence > confidence_threshold) {
-            saliency_maps[obj.labelID - 1].push_back(resized_mask);
+            saliency_maps[labelID - 1].push_back(resized_mask);
         }
     }
-    result->saliency_map = average_and_normalize(saliency_maps);
+    scene->saliency_maps = average_and_normalize(saliency_maps);
     if (has_feature_vector_name) {
-        result->feature_vector = std::move(infResult.outputsData[feature_vector_name]);
+        scene->feature_vectors.push_back(std::move(infResult.outputsData[feature_vector_name]));
     }
 
-    scene->instance_segmentation_result = std::move(result);
     return scene;
 }
 
diff --git a/src/cpp/tilers/include/tilers/instance_segmentation.h b/src/cpp/tilers/include/tilers/instance_segmentation.h
index 43574e82..7154a980 100644
--- a/src/cpp/tilers/include/tilers/instance_segmentation.h
+++ b/src/cpp/tilers/include/tilers/instance_segmentation.h
@@ -24,7 +24,7 @@ class InstanceSegmentationTiler : public TilerBase {
                                                       const cv::Size&,
                                                       const std::vector<cv::Rect>&);
 
-    std::vector<cv::Mat_<std::uint8_t>> merge_saliency_maps(const std::vector<std::unique_ptr<Scene>>&,
+    std::vector<cv::Mat> merge_saliency_maps(const std::vector<std::unique_ptr<Scene>>&,
                                                             const cv::Size&,
                                                             const std::vector<cv::Rect>&);
 
diff --git a/src/cpp/tilers/src/detection.cpp b/src/cpp/tilers/src/detection.cpp
index 9d9eceaa..00070af5 100644
--- a/src/cpp/tilers/src/detection.cpp
+++ b/src/cpp/tilers/src/detection.cpp
@@ -66,9 +66,7 @@ std::unique_ptr<Scene> DetectionTiler::merge_results(const std::vector<std::uniq
 
     for (const auto& result : tiles_results) {
         for (auto& det : result->boxes) {
-            size_t id;
-            sscanf(det.labels[0].id.c_str(), "%zu", &id);
-            all_detections.emplace_back(det.shape.x, det.shape.y, det.shape.x + det.shape.width, det.shape.y + det.shape.height, id);
+            all_detections.emplace_back(det.shape.x, det.shape.y, det.shape.x + det.shape.width, det.shape.y + det.shape.height, det.labels[0].label.id);
             all_scores.push_back(det.labels[0].score);
             all_detections_refs.push_back(det);
         }
diff --git a/src/cpp/tilers/src/instance_segmentation.cpp b/src/cpp/tilers/src/instance_segmentation.cpp
index 27454d96..911a20c0 100644
--- a/src/cpp/tilers/src/instance_segmentation.cpp
+++ b/src/cpp/tilers/src/instance_segmentation.cpp
@@ -56,17 +56,9 @@ std::unique_ptr<Scene> InstanceSegmentationTiler::run(const ImageInputData& inpu
 
 std::unique_ptr<Scene> InstanceSegmentationTiler::postprocess_tile(std::unique_ptr<Scene> tile_result,
                                                                         const cv::Rect& coord) {
-    auto& iseg_res = tile_result->instance_segmentation_result;
-    for (auto& det : iseg_res->segmentedObjects) {
-        det.x += coord.x;
-        det.y += coord.y;
-    }
-
-    if (iseg_res->feature_vector) {
-        auto tmp_feature_vector =
-            ov::Tensor(iseg_res->feature_vector.get_element_type(), iseg_res->feature_vector.get_shape());
-        iseg_res->feature_vector.copy_to(tmp_feature_vector);
-        iseg_res->feature_vector = tmp_feature_vector;
+    for (auto& det : tile_result->new_masks) {
+        det.roi.x += coord.x;
+        det.roi.y += coord.y;
     }
 
     return tile_result;
@@ -77,23 +69,22 @@ std::unique_ptr<Scene> InstanceSegmentationTiler::merge_results(
     const cv::Size& image_size,
     const std::vector<cv::Rect>& tile_coords) {
     auto scene = std::make_unique<Scene>();
-    auto result = std::make_unique<InstanceSegmentationResult>();
 
     std::vector<AnchorLabeled> all_detections;
-    std::vector<std::reference_wrapper<SegmentedObject>> all_detections_ptrs;
+    std::vector<std::reference_wrapper<Mask>> all_detections_ptrs;
     std::vector<float> all_scores;
 
     for (const auto& result : tiles_results) {
-        for (auto& det : result->instance_segmentation_result->segmentedObjects) {
-            all_detections.emplace_back(det.x, det.y, det.x + det.width, det.y + det.height, det.labelID);
-            all_scores.push_back(det.confidence);
+        for (auto& det : result->new_masks) {
+            all_detections.emplace_back(det.roi.x, det.roi.y, det.roi.x + det.roi.width, det.roi.y + det.roi.height, det.label.label.id);
+            all_scores.push_back(det.label.score);
             all_detections_ptrs.push_back(det);
         }
     }
 
     auto keep_idx = multiclass_nms(all_detections, all_scores, iou_threshold, false, max_pred_number);
 
-    result->segmentedObjects.reserve(keep_idx.size());
+    scene->new_masks.reserve(keep_idx.size());
     for (auto idx : keep_idx) {
         if (postprocess_semantic_masks) {
             all_detections_ptrs[idx].get().mask = segm_postprocess(all_detections_ptrs[idx],
@@ -101,26 +92,25 @@ std::unique_ptr<Scene> InstanceSegmentationTiler::merge_results(
                                                                    image_size.height,
                                                                    image_size.width);
         }
-        result->segmentedObjects.push_back(all_detections_ptrs[idx]);
+        scene->new_masks.push_back(all_detections_ptrs[idx]);
     }
 
     if (tiles_results.size()) {
-        auto& iseg_res = tiles_results.begin()->get()->instance_segmentation_result;
-        if (iseg_res->feature_vector) {
-            result->feature_vector =
-                ov::Tensor(iseg_res->feature_vector.get_element_type(), iseg_res->feature_vector.get_shape());
+        auto& feature_vectors = tiles_results.begin()->get()->feature_vectors;
+        if (!feature_vectors.empty()) {
+            scene->feature_vectors.push_back(ov::Tensor(feature_vectors[0].get_element_type(), feature_vectors[0].get_shape()));
         }
     }
 
-    if (result->feature_vector) {
-        float* feature_ptr = result->feature_vector.data<float>();
-        size_t feature_size = result->feature_vector.get_size();
+    if (!scene->feature_vectors.empty()) {
+        auto feature_vector = scene->feature_vectors[0];
+        float* feature_ptr = feature_vector.data<float>();
+        size_t feature_size = feature_vector.get_size();
 
         std::fill(feature_ptr, feature_ptr + feature_size, 0.f);
 
         for (const auto& result : tiles_results) {
-            auto& iseg_res = result->instance_segmentation_result;
-            const float* current_feature_ptr = iseg_res->feature_vector.data<float>();
+            const float* current_feature_ptr = result->feature_vectors[0].data<float>();
 
             for (size_t i = 0; i < feature_size; ++i) {
                 feature_ptr[i] += current_feature_ptr[i];
@@ -132,24 +122,23 @@ std::unique_ptr<Scene> InstanceSegmentationTiler::merge_results(
         }
     }
 
-    result->saliency_map = merge_saliency_maps(tiles_results, image_size, tile_coords);
-
-    scene->instance_segmentation_result = std::move(result);
+    scene->saliency_maps = merge_saliency_maps(tiles_results, image_size, tile_coords);
     return scene;
 }
 
-std::vector<cv::Mat_<std::uint8_t>> InstanceSegmentationTiler::merge_saliency_maps(
+std::vector<cv::Mat> InstanceSegmentationTiler::merge_saliency_maps(
     const std::vector<std::unique_ptr<Scene>>& tiles_results,
     const cv::Size& image_size,
     const std::vector<cv::Rect>& tile_coords) {
-    std::vector<std::vector<cv::Mat_<std::uint8_t>>> all_saliecy_maps;
+    std::vector<std::vector<cv::Mat>> all_saliecy_maps;
     all_saliecy_maps.reserve(tiles_results.size());
+
+
     for (const auto& result : tiles_results) {
-        auto& det_res = result->instance_segmentation_result;
-        all_saliecy_maps.push_back(det_res->saliency_map);
+        all_saliecy_maps.push_back(result->saliency_maps);
     }
 
-    std::vector<cv::Mat_<std::uint8_t>> image_saliency_map;
+    std::vector<cv::Mat> image_saliency_map;
     if (all_saliecy_maps.size()) {
         image_saliency_map = all_saliecy_maps[0];
     }
@@ -158,10 +147,11 @@ std::vector<cv::Mat_<std::uint8_t>> InstanceSegmentationTiler::merge_saliency_ma
         return image_saliency_map;
     }
 
+
     size_t num_classes = image_saliency_map.size();
-    std::vector<cv::Mat_<std::uint8_t>> merged_map(num_classes);
+    std::vector<cv::Mat> merged_map(num_classes);
     for (auto& map : merged_map) {
-        map = cv::Mat_<std::uint8_t>(image_size, 0);
+        map = cv::Mat::zeros(image_size, CV_8UC1);  // cv::Mat(size, 0) reads 0 as the type and leaves pixels unset
     }
 
     size_t start_idx = tile_with_full_img ? 1 : 0;
@@ -180,10 +170,10 @@ std::vector<cv::Mat_<std::uint8_t>> InstanceSegmentationTiler::merge_saliency_ma
     }
 
     for (size_t class_idx = 0; class_idx < num_classes; ++class_idx) {
-        auto image_map_cls = tile_with_full_img ? image_saliency_map[class_idx] : cv::Mat_<std::uint8_t>();
+        auto image_map_cls = tile_with_full_img ? image_saliency_map[class_idx] : cv::Mat();
         if (image_map_cls.empty()) {
             if (cv::sum(merged_map[class_idx]) == cv::Scalar(0.)) {
-                merged_map[class_idx] = cv::Mat_<std::uint8_t>();
+                merged_map[class_idx] = cv::Mat();
             }
         } else {
             cv::resize(image_map_cls, image_map_cls, image_size);
diff --git a/tests/cpp/accuracy/test_accuracy.cpp b/tests/cpp/accuracy/test_accuracy.cpp
index 24a655bd..cda28aaf 100644
--- a/tests/cpp/accuracy/test_accuracy.cpp
+++ b/tests/cpp/accuracy/test_accuracy.cpp
@@ -168,6 +168,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         result = tiler.run(image);
                     } else {
                         result = model->infer(image);
+
                     }
                     EXPECT_EQ(std::string{*result}, modelData.testData[i].reference[0]);
                 }
@@ -243,29 +244,28 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         result = model->infer(image);
                     }
 
-                    const std::vector<SegmentedObjectWithRects>& withRects =
-                        add_rotated_rects(result->instance_segmentation_result->segmentedObjects);
+                    auto rotated_rects = get_rotated_rects(result->new_masks);
                     std::stringstream ss;
-                    for (const SegmentedObjectWithRects& obj : withRects) {
+                    for (auto& obj : rotated_rects) {
                         ss << obj << "; ";
                     }
                     size_t filled = 0;
-                    for (const cv::Mat_<std::uint8_t>& cls_map : result->instance_segmentation_result->saliency_map) {
+                    for (const cv::Mat& cls_map : result->saliency_maps) {
                         if (cls_map.data) {
                             ++filled;
                         }
                     }
                     ss << filled << "; ";
-                    try {
-                        ss << result->instance_segmentation_result->feature_vector.get_shape();
-                    } catch (ov::Exception&) {
+                    if (result->feature_vectors.empty()) {
                         ss << "[0]";
+                    } else {
+                        ss << result->feature_vectors[0].get_shape();
                     }
                     ss << "; ";
                     try {
                         // getContours() assumes each instance generates only one contour.
                         // That doesn't hold for some models
-                        for (const Contour& contour : getContours(result->instance_segmentation_result->segmentedObjects)) {
+                        for (const Contour& contour : getContours(result->new_masks)) {
                             ss << contour << "; ";
                         }
                     } catch (const std::runtime_error&) {
@@ -313,7 +313,9 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
     }
 }
 
-INSTANTIATE_TEST_SUITE_P(TestAccuracyPublic, ModelParameterizedTest, testing::ValuesIn(GetTestData(PUBLIC_SCOPE_PATH)));
+INSTANTIATE_TEST_SUITE_P(TestAccuracyPublic, ModelParameterizedTest, testing::ValuesIn(GetTestData(PUBLIC_SCOPE_PATH)), [](const testing::TestParamInfo<ModelData>& info) {
+    return std::to_string(info.index) + "_" + info.param.type;
+});
 
 class InputParser {
 public:
diff --git a/tests/python/accuracy/public_scope.json b/tests/python/accuracy/public_scope.json
index 6e99ed23..13da81d0 100644
--- a/tests/python/accuracy/public_scope.json
+++ b/tests/python/accuracy/public_scope.json
@@ -214,7 +214,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "458, 106, 495, 150, 1 (bicycle): 0.818, 852, RotatedRect: 478.119 130.332 28.677 46.408 46.637; 0, 30, 178, 323, 2 (car): 0.753, 26728, RotatedRect: 79.739 177.262 251.785 156.656 87.397; 0; [0]; bicycle: 0.818, 139; car: 0.753, 622; "
+          "RotatedRect: 478.119, 130.332, 28.677, 46.408, 46.6371 (bicycle): 0.818; ; RotatedRect: 79.739, 177.262, 251.785, 156.656, 87.3972 (car): 0.753; ; 0; [0]; bicycle: 0.818, 139; car: 0.753, 622; "
         ]
       }
     ]
@@ -226,7 +226,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "458, 106, 495, 150, 1 (person): 0.818, 852, RotatedRect: 478.119 130.332 28.677 46.408 46.637; 0, 30, 178, 323, 2 (bicycle): 0.753, 26728, RotatedRect: 79.739 177.262 251.785 156.656 87.397; 0; [0]; person: 0.818, 139; bicycle: 0.753, 622; "
+          "RotatedRect: 478.119, 130.332, 28.677, 46.408, 46.6371 (bicycle): 0.818; ; RotatedRect: 79.739, 177.262, 251.785, 156.656, 87.3972 (car): 0.753; ; 0; [0]; bicycle: 0.818, 139; car: 0.753, 622; "
         ]
       }
     ]
@@ -239,7 +239,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "458, 106, 495, 150, 1 (person): 0.816, 851, RotatedRect: 478.119 130.332 28.677 46.408 46.637; 0, 30, 178, 323, 2 (bicycle): 0.754, 26748, RotatedRect: 79.762 177.261 251.785 156.702 87.397; 0; [0]; person: 0.816, 142; bicycle: 0.754, 622; "
+          "RotatedRect: 210.000, 327.500, 101.000, 296.000, 90.00016 (horse): 0.999; ; RotatedRect: 82.086, 163.312, 307.394, 156.997, 89.6692 (car): 0.999; ; RotatedRect: 305.000, 123.500, 59.000, 18.000, 90.0001 (bicycle): 0.985; ; RotatedRect: 332.500, 116.000, 38.000, 13.000, 90.0001 (bicycle): 0.974; ; RotatedRect: 476.052, 126.972, 27.619, 47.834, 16.9281 (bicycle): 0.918; ; RotatedRect: 369.319, 119.891, 54.848, 34.230, 82.4051 (bicycle): 0.807; ; RotatedRect: 284.000, 127.500, 35.000, 10.000, 90.0001 (bicycle): 0.788; ; 0; [0]; horse: 0.999, 668; car: 0.999, 782; bicycle: 0.985, 127; bicycle: 0.974, 87; bicycle: 0.918, 122; bicycle: 0.807, 140; bicycle: 0.788, 79; "
         ]
       }
     ]
@@ -251,7 +251,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "59, 277, 360, 380, 16 (horse): 0.999, 19053, RotatedRect: 210.000 327.500 101.000 296.000 90.000; 2, 9, 162, 318, 2 (car): 0.999, 31153, RotatedRect: 82.086 163.312 307.394 156.997 89.669; 294, 94, 316, 153, 1 (bicycle): 0.985, 840, RotatedRect: 305.000 123.500 59.000 18.000 90.000; 326, 97, 341, 136, 1 (bicycle): 0.974, 397, RotatedRect: 332.500 116.000 38.000 13.000 90.000; 461, 105, 493, 150, 1 (bicycle): 0.918, 846, RotatedRect: 476.052 126.972 27.619 47.834 16.928; 350, 92, 386, 149, 1 (bicycle): 0.807, 1458, RotatedRect: 369.319 119.891 54.848 34.230 82.405; 279, 110, 291, 146, 1 (bicycle): 0.788, 312, RotatedRect: 284.000 127.500 35.000 10.000 90.000; 0; [0]; horse: 0.999, 668; car: 0.999, 782; bicycle: 0.985, 127; bicycle: 0.974, 87; bicycle: 0.918, 122; bicycle: 0.807, 140; bicycle: 0.788, 79; "
+          "RotatedRect: 210.000, 327.500, 101.000, 296.000, 90.00016 (horse): 0.999; ; RotatedRect: 82.086, 163.312, 307.394, 156.997, 89.6692 (car): 0.999; ; RotatedRect: 305.000, 123.500, 59.000, 18.000, 90.0001 (bicycle): 0.985; ; RotatedRect: 332.500, 116.000, 38.000, 13.000, 90.0001 (bicycle): 0.974; ; RotatedRect: 476.052, 126.972, 27.619, 47.834, 16.9281 (bicycle): 0.918; ; RotatedRect: 369.319, 119.891, 54.848, 34.230, 82.4051 (bicycle): 0.807; ; RotatedRect: 284.000, 127.500, 35.000, 10.000, 90.0001 (bicycle): 0.788; ; 0; [0]; horse: 0.999, 668; car: 0.999, 782; bicycle: 0.985, 127; bicycle: 0.974, 87; bicycle: 0.918, 122; bicycle: 0.807, 140; bicycle: 0.788, 79; "
         ]
       }
     ]
@@ -309,7 +309,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "61, 277, 358, 382, 17 (horse): 0.998, 18312, RotatedRect: 212.000 327.000 290.000 100.000 0.000; 1, 14, 162, 321, 2 (car): 0.994, 25867, RotatedRect: 54.067 173.034 285.208 156.889 61.996; 327, 96, 341, 134, 1 (bicycle): 0.930, 279, RotatedRect: 333.500 114.000 36.000 13.000 90.000; 460, 106, 493, 148, 1 (bicycle): 0.898, 786, RotatedRect: 476.284 126.621 27.308 45.993 19.179; 294, 93, 315, 153, 1 (bicycle): 0.869, 789, RotatedRect: 304.000 124.000 58.000 18.000 90.000; 278, 109, 290, 152, 1 (bicycle): 0.817, 355, RotatedRect: 283.500 130.000 42.000 11.000 90.000; 4, 4, 102, 191, 2 (car): 0.701, 9658, RotatedRect: 51.806 97.259 184.445 95.281 89.246; 270, 93, 290, 152, 1 (bicycle): 0.660, 723, RotatedRect: 280.500 122.500 17.000 59.000 0.000; 322, 114, 343, 152, 18 (sheep): 0.520, 298, RotatedRect: 332.000 133.000 34.000 14.000 90.000; 4; [1,1280,1,1]; "
+          "RotatedRect: 212.000, 327.000, 290.000, 100.000, 0.00017 (horse): 0.998; ; RotatedRect: 54.067, 173.034, 285.208, 156.889, 61.9962 (car): 0.994; ; RotatedRect: 333.500, 114.000, 36.000, 13.000, 90.0001 (bicycle): 0.930; ; RotatedRect: 476.284, 126.621, 27.308, 45.993, 19.1791 (bicycle): 0.898; ; RotatedRect: 304.000, 124.000, 58.000, 18.000, 90.0001 (bicycle): 0.869; ; RotatedRect: 283.500, 130.000, 42.000, 11.000, 90.0001 (bicycle): 0.817; ; RotatedRect: 51.806, 97.259, 184.445, 95.281, 89.2462 (car): 0.701; ; RotatedRect: 280.500, 122.500, 17.000, 59.000, 0.0001 (bicycle): 0.660; ; RotatedRect: 332.000, 133.000, 34.000, 14.000, 90.00018 (sheep): 0.520; ; 4; [1,1280,1,1]; "
         ]
       }
     ]
@@ -325,7 +325,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "1535, 585, 1662, 697, 2 (ellipse): 0.643, 9822, RotatedRect: 1598.500 641.500 111.000 109.000 90.000; 3091, 3097, 3105, 3112, 1 (rectangle): 0.483, 197, RotatedRect: 3097.500 3104.000 14.000 13.000 90.000; 2734, 60, 2867, 324, 1 (rectangle): 0.401, 30622, RotatedRect: 2800.000 188.500 255.000 132.000 90.000; 2; [1,1280,1,1]; ellipse: 0.643, 331; rectangle: 0.483, 48; rectangle: 0.401, 758; "
+          "RotatedRect: 1598.500, 641.500, 111.000, 109.000, 90.0002 (ellipse): 0.643; ; RotatedRect: 3097.500, 3104.000, 14.000, 13.000, 90.0001 (rectangle): 0.483; ; RotatedRect: 2800.000, 188.500, 255.000, 132.000, 90.0001 (rectangle): 0.401; ; 2; [1,1280,1,1]; ellipse: 0.643, 331; rectangle: 0.483, 48; rectangle: 0.401, 758; "
         ]
       }
     ]

From 8897be649c569f0a6ab804158301bf60e69dcc03 Mon Sep 17 00:00:00 2001
From: Ronald Hecker <ronald.hecker@intel.com>
Date: Mon, 12 May 2025 08:43:08 +0200
Subject: [PATCH 10/16] Migrate Scene masks map to the "new_masks" vector

---
 src/cpp/models/include/models/results.h      | 22 +++++++++++++-------
 src/cpp/models/src/anomaly_model.cpp         | 11 ++++------
 src/cpp/models/src/segmentation_model.cpp    | 14 +++++++++----
 src/cpp/tilers/src/semantic_segmentation.cpp | 11 +++++-----
 tests/cpp/accuracy/test_accuracy.cpp         |  4 ++--
 tests/python/accuracy/public_scope.json      |  2 +-
 6 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/src/cpp/models/include/models/results.h b/src/cpp/models/include/models/results.h
index 61510ac8..bfcaf777 100644
--- a/src/cpp/models/include/models/results.h
+++ b/src/cpp/models/include/models/results.h
@@ -381,6 +381,7 @@ struct KeypointDetectionResult : public ResultBase {
 
 class Label {
 public:
+    Label() {}
     Label(int id, std::string name):  id(id), name(name) {}
 
     int id;
@@ -393,6 +394,7 @@ class Label {
 
 class LabelScore {
 public:
+    LabelScore() {}
     LabelScore(int id, std::string name, float score): label(Label(id, name)), score(score) {}
     LabelScore(Label label, float score):  label(label), score(score) {}
 
@@ -411,6 +413,15 @@ class Mask {
     LabelScore label;
     cv::Rect roi;
     cv::Mat mask;
+
+    friend std::ostream& operator<< (std::ostream& os, const Mask& mask) {
+
+        double min_mask, max_mask;
+        cv::minMaxLoc(mask.mask, &min_mask, &max_mask);
+        os << mask.label << mask.roi << " min:" << min_mask << " max:" << max_mask << ";";
+        return os;
+    }
+
 };
 
 static inline std::vector<Contour> getContours(const std::vector<Mask>& segmentedObjects) {
@@ -512,9 +523,6 @@ class Scene {
     int64_t frameId;
     std::shared_ptr<MetaData> metaData;
 
-    //std::unique_ptr<ClassificationResult> classification_result;
-    //std::unique_ptr<InstanceSegmentationResult> instance_segmentation_result;
-
     std::vector<Box> boxes;
     std::vector<DetectedKeypoints> poses;
 
@@ -524,7 +532,7 @@ class Scene {
     std::vector<Mask> new_masks;
 
     std::map<std::string, ov::Tensor> additional_tensors;
-    std::map<std::string, cv::Mat> masks;
+    //std::map<std::string, cv::Mat> masks;
 
     friend std::ostream& operator<<(std::ostream& os, const Scene& scene) {
         for (auto& box: scene.boxes) {
@@ -541,10 +549,8 @@ class Scene {
             os << "[1," << scene.saliency_maps.size() << "," << scene.saliency_maps[0].rows << "," << scene.saliency_maps[0].cols << "]; ";
         }
 
-        for (auto& m: scene.masks) {
-            double min_mask, max_mask;
-            cv::minMaxLoc(m.second, &min_mask, &max_mask);
-            os << m.first << " min:" << min_mask << " max:" << max_mask << ";";
+        for (auto& m: scene.new_masks) {
+            os << m;
         }
 
         if (scene.feature_vectors.empty()){
diff --git a/src/cpp/models/src/anomaly_model.cpp b/src/cpp/models/src/anomaly_model.cpp
index 35aa2e45..8fc6a921 100644
--- a/src/cpp/models/src/anomaly_model.cpp
+++ b/src/cpp/models/src/anomaly_model.cpp
@@ -86,13 +86,10 @@ std::unique_ptr<Scene> AnomalyModel::postprocess(InferenceResult& infResult) {
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
 
     scene->saliency_maps.push_back(anomaly_map);
-    scene->masks["pred_mask"] = std::move(pred_mask);
-    scene->boxes.push_back(
-        Box(
-            cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight),
-            {LabelScore(label_id, pred_label, pred_score)}
-        )
-    );
+    auto label = LabelScore(label_id, pred_label, pred_score);
+    auto roi = cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight);
+    scene->new_masks.push_back(Mask(label, roi, pred_mask));
+    scene->boxes.push_back(Box(roi, {label}));
 
     if (task == "detection") {
         pred_boxes = getBoxes(pred_mask);
diff --git a/src/cpp/models/src/segmentation_model.cpp b/src/cpp/models/src/segmentation_model.cpp
index cda33d53..53b92799 100644
--- a/src/cpp/models/src/segmentation_model.cpp
+++ b/src/cpp/models/src/segmentation_model.cpp
@@ -260,7 +260,8 @@ std::unique_ptr<Scene> SegmentationModel::postprocess(InferenceResult& infResult
                cv::INTER_NEAREST);
 
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
-    scene->masks["hard_prediction"] = hard_prediction;
+    auto roi = cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight);
+    scene->new_masks.push_back(Mask(LabelScore(0, "hard_prediction", 0), roi, hard_prediction));
     if (return_soft_prediction) {
         cv::resize(soft_prediction,
                    soft_prediction,
@@ -268,19 +269,24 @@ std::unique_ptr<Scene> SegmentationModel::postprocess(InferenceResult& infResult
                    0.0,
                    0.0,
                    cv::INTER_NEAREST);
-        scene->masks["soft_prediction"] = soft_prediction;
+
+        scene->new_masks.push_back(Mask(LabelScore(1, "soft_prediction", 0), roi, soft_prediction));
         auto iter = infResult.outputsData.find(feature_vector_name);
         if (infResult.outputsData.end() != iter) {
             scene->saliency_maps.push_back(get_activation_map(soft_prediction));
             scene->feature_vectors.push_back(iter->second);
         }
     }
+
+    if (return_contours) {
+
+    }
     return scene;
 }
 
 std::vector<Contour> SegmentationModel::getContours(const std::unique_ptr<Scene>& scene) {
-    auto soft_prediction = scene->masks["soft_prediction"];
-    auto hard_prediction = scene->masks["hard_prediction"];
+    auto hard_prediction = scene->new_masks[0].mask;
+    auto soft_prediction = scene->new_masks[1].mask;
     if (soft_prediction.channels() == 1) {
         throw std::runtime_error{"Cannot get contours from soft prediction with 1 layer"};
     }
diff --git a/src/cpp/tilers/src/semantic_segmentation.cpp b/src/cpp/tilers/src/semantic_segmentation.cpp
index 846f247d..6b169aff 100644
--- a/src/cpp/tilers/src/semantic_segmentation.cpp
+++ b/src/cpp/tilers/src/semantic_segmentation.cpp
@@ -56,7 +56,7 @@ std::unique_ptr<Scene> SemanticSegmentationTiler::run(const ImageInputData& inpu
 
 std::unique_ptr<Scene> SemanticSegmentationTiler::postprocess_tile(std::unique_ptr<Scene> tile_result,
                                                                         const cv::Rect&) {
-    if (tile_result->masks.find("soft_prediction") == tile_result->masks.end()){
+    if (tile_result->new_masks.size() < 2){
         throw std::runtime_error(
             "SemanticSegmentationTiler requires the underlying model to return ImageResultWithSoftPrediction");
     }
@@ -72,13 +72,13 @@ std::unique_ptr<Scene> SemanticSegmentationTiler::merge_results(
     }
 
     cv::Mat voting_mask(cv::Size(image_size.width, image_size.height), CV_32SC1, cv::Scalar(0));
-    auto first_soft_prediction = tiles_results[0]->masks["soft_prediction"];
+    auto first_soft_prediction = tiles_results[0]->new_masks[1].mask;
     cv::Mat merged_soft_prediction(cv::Size(image_size.width, image_size.height),
                                    CV_32FC(first_soft_prediction.channels()),
                                    cv::Scalar(0));
 
     for (size_t i = 0; i < tiles_results.size(); ++i) {
-        auto soft_prediction = tiles_results[i]->masks["soft_prediction"];
+        auto soft_prediction = tiles_results[i]->new_masks[1].mask;
         voting_mask(tile_coords[i]) += 1;
         merged_soft_prediction(tile_coords[i]) += soft_prediction;
     }
@@ -89,7 +89,8 @@ std::unique_ptr<Scene> SemanticSegmentationTiler::merge_results(
         create_hard_prediction_from_soft_prediction(merged_soft_prediction, soft_threshold, blur_strength);
 
     auto scene = std::make_unique<Scene>();
-    scene->masks["hard_prediction"] = hard_prediction;
-    scene->masks["soft_prediction"] = merged_soft_prediction;
+    auto roi = cv::Rect(0, 0, image_size.width, image_size.height);
+    scene->new_masks.push_back(Mask(LabelScore(0, "hard_prediction", 0), roi, hard_prediction));
+    scene->new_masks.push_back(Mask(LabelScore(0, "soft_prediction", 0), roi, merged_soft_prediction));
     return scene;
 }
diff --git a/tests/cpp/accuracy/test_accuracy.cpp b/tests/cpp/accuracy/test_accuracy.cpp
index cda28aaf..408486dd 100644
--- a/tests/cpp/accuracy/test_accuracy.cpp
+++ b/tests/cpp/accuracy/test_accuracy.cpp
@@ -209,8 +209,8 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         pred = model->infer(image);
                     }
 
-                    if (pred->masks.find("soft_prediction") != pred->masks.end()) {
-                        cv::Mat soft = pred->masks["soft_prediction"];
+                    if (pred->new_masks.size() == 2) {
+                        cv::Mat soft = pred->new_masks[1].mask;
                         const std::vector<Contour>& contours = model->getContours(pred);
                         std::stringstream ss;
                         for (const Contour& contour : contours) {
diff --git a/tests/python/accuracy/public_scope.json b/tests/python/accuracy/public_scope.json
index 13da81d0..6ec278e6 100644
--- a/tests/python/accuracy/public_scope.json
+++ b/tests/python/accuracy/public_scope.json
@@ -363,7 +363,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "0, 0, 640, 426, 1 (Anomaly): 0.854; [1,1,426,640]; pred_mask min:0.000 max:1.000;[0]"
+          "0, 0, 640, 426, 1 (Anomaly): 0.854; [1,1,426,640]; 1 (Anomaly): 0.854[640 x 426 from (0, 0)] min:0.000 max:1.000;[0]"
         ]
       }
     ]

From 223f924f7ce37166380903764edd7b2353b2e125 Mon Sep 17 00:00:00 2001
From: Ronald Hecker <ronald.hecker@intel.com>
Date: Mon, 12 May 2025 08:47:43 +0200
Subject: [PATCH 11/16] Rename new masks to masks

---
 src/cpp/models/include/models/results.h      |  5 ++---
 src/cpp/models/src/anomaly_model.cpp         |  2 +-
 src/cpp/models/src/instance_segmentation.cpp |  2 +-
 src/cpp/models/src/segmentation_model.cpp    |  8 ++++----
 src/cpp/tilers/src/instance_segmentation.cpp |  8 ++++----
 src/cpp/tilers/src/semantic_segmentation.cpp | 10 +++++-----
 tests/cpp/accuracy/test_accuracy.cpp         |  8 ++++----
 7 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/src/cpp/models/include/models/results.h b/src/cpp/models/include/models/results.h
index bfcaf777..cdf31e23 100644
--- a/src/cpp/models/include/models/results.h
+++ b/src/cpp/models/include/models/results.h
@@ -529,10 +529,9 @@ class Scene {
     std::vector<cv::Mat> saliency_maps;
     std::vector<ov::Tensor> feature_vectors;
 
-    std::vector<Mask> new_masks;
+    std::vector<Mask> masks;
 
     std::map<std::string, ov::Tensor> additional_tensors;
-    //std::map<std::string, cv::Mat> masks;
 
     friend std::ostream& operator<<(std::ostream& os, const Scene& scene) {
         for (auto& box: scene.boxes) {
@@ -549,7 +548,7 @@ class Scene {
             os << "[1," << scene.saliency_maps.size() << "," << scene.saliency_maps[0].rows << "," << scene.saliency_maps[0].cols << "]; ";
         }
 
-        for (auto& m: scene.new_masks) {
+        for (auto& m: scene.masks) {
             os << m;
         }
 
diff --git a/src/cpp/models/src/anomaly_model.cpp b/src/cpp/models/src/anomaly_model.cpp
index 8fc6a921..e1c61a92 100644
--- a/src/cpp/models/src/anomaly_model.cpp
+++ b/src/cpp/models/src/anomaly_model.cpp
@@ -88,7 +88,7 @@ std::unique_ptr<Scene> AnomalyModel::postprocess(InferenceResult& infResult) {
     scene->saliency_maps.push_back(anomaly_map);
     auto label = LabelScore(label_id, pred_label, pred_score);
     auto roi = cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight);
-    scene->new_masks.push_back(Mask(label, roi, pred_mask));
+    scene->masks.push_back(Mask(label, roi, pred_mask));
     scene->boxes.push_back(Box(roi, {label}));
 
     if (task == "detection") {
diff --git a/src/cpp/models/src/instance_segmentation.cpp b/src/cpp/models/src/instance_segmentation.cpp
index 2808195a..a5ebfdc9 100644
--- a/src/cpp/models/src/instance_segmentation.cpp
+++ b/src/cpp/models/src/instance_segmentation.cpp
@@ -348,7 +348,7 @@ std::unique_ptr<Scene> MaskRCNNModel::postprocess(InferenceResult& infResult) {
         }
         mask.mask = postprocess_semantic_masks ? resized_mask : raw_cls_mask.clone();
         if (confidence > confidence_threshold) {
-            scene->new_masks.push_back(mask);
+            scene->masks.push_back(mask);
         }
         if (has_feature_vector_name && confidence > confidence_threshold) {
             saliency_maps[labelID - 1].push_back(resized_mask);
diff --git a/src/cpp/models/src/segmentation_model.cpp b/src/cpp/models/src/segmentation_model.cpp
index 53b92799..88208a98 100644
--- a/src/cpp/models/src/segmentation_model.cpp
+++ b/src/cpp/models/src/segmentation_model.cpp
@@ -261,7 +261,7 @@ std::unique_ptr<Scene> SegmentationModel::postprocess(InferenceResult& infResult
 
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
     auto roi = cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight);
-    scene->new_masks.push_back(Mask(LabelScore(0, "hard_prediction", 0), roi, hard_prediction));
+    scene->masks.push_back(Mask(LabelScore(0, "hard_prediction", 0), roi, hard_prediction));
     if (return_soft_prediction) {
         cv::resize(soft_prediction,
                    soft_prediction,
@@ -270,7 +270,7 @@ std::unique_ptr<Scene> SegmentationModel::postprocess(InferenceResult& infResult
                    0.0,
                    cv::INTER_NEAREST);
 
-        scene->new_masks.push_back(Mask(LabelScore(1, "soft_prediction", 0), roi, soft_prediction));
+        scene->masks.push_back(Mask(LabelScore(1, "soft_prediction", 0), roi, soft_prediction));
         auto iter = infResult.outputsData.find(feature_vector_name);
         if (infResult.outputsData.end() != iter) {
             scene->saliency_maps.push_back(get_activation_map(soft_prediction));
@@ -285,8 +285,8 @@ std::unique_ptr<Scene> SegmentationModel::postprocess(InferenceResult& infResult
 }
 
 std::vector<Contour> SegmentationModel::getContours(const std::unique_ptr<Scene>& scene) {
-    auto hard_prediction = scene->new_masks[0].mask;
-    auto soft_prediction = scene->new_masks[1].mask;
+    auto hard_prediction = scene->masks[0].mask;
+    auto soft_prediction = scene->masks[1].mask;
     if (soft_prediction.channels() == 1) {
         throw std::runtime_error{"Cannot get contours from soft prediction with 1 layer"};
     }
diff --git a/src/cpp/tilers/src/instance_segmentation.cpp b/src/cpp/tilers/src/instance_segmentation.cpp
index 911a20c0..4777b465 100644
--- a/src/cpp/tilers/src/instance_segmentation.cpp
+++ b/src/cpp/tilers/src/instance_segmentation.cpp
@@ -56,7 +56,7 @@ std::unique_ptr<Scene> InstanceSegmentationTiler::run(const ImageInputData& inpu
 
 std::unique_ptr<Scene> InstanceSegmentationTiler::postprocess_tile(std::unique_ptr<Scene> tile_result,
                                                                         const cv::Rect& coord) {
-    for (auto& det : tile_result->new_masks) {
+    for (auto& det : tile_result->masks) {
         det.roi.x += coord.x;
         det.roi.y += coord.y;
     }
@@ -75,7 +75,7 @@ std::unique_ptr<Scene> InstanceSegmentationTiler::merge_results(
     std::vector<float> all_scores;
 
     for (const auto& result : tiles_results) {
-        for (auto& det : result->new_masks) {
+        for (auto& det : result->masks) {
             all_detections.emplace_back(det.roi.x, det.roi.y, det.roi.x + det.roi.width, det.roi.y + det.roi.height, det.label.label.id);
             all_scores.push_back(det.label.score);
             all_detections_ptrs.push_back(det);
@@ -84,7 +84,7 @@ std::unique_ptr<Scene> InstanceSegmentationTiler::merge_results(
 
     auto keep_idx = multiclass_nms(all_detections, all_scores, iou_threshold, false, max_pred_number);
 
-    scene->new_masks.reserve(keep_idx.size());
+    scene->masks.reserve(keep_idx.size());
     for (auto idx : keep_idx) {
         if (postprocess_semantic_masks) {
             all_detections_ptrs[idx].get().mask = segm_postprocess(all_detections_ptrs[idx],
@@ -92,7 +92,7 @@ std::unique_ptr<Scene> InstanceSegmentationTiler::merge_results(
                                                                    image_size.height,
                                                                    image_size.width);
         }
-        scene->new_masks.push_back(all_detections_ptrs[idx]);
+        scene->masks.push_back(all_detections_ptrs[idx]);
     }
 
     if (tiles_results.size()) {
diff --git a/src/cpp/tilers/src/semantic_segmentation.cpp b/src/cpp/tilers/src/semantic_segmentation.cpp
index 6b169aff..3a866bf1 100644
--- a/src/cpp/tilers/src/semantic_segmentation.cpp
+++ b/src/cpp/tilers/src/semantic_segmentation.cpp
@@ -56,7 +56,7 @@ std::unique_ptr<Scene> SemanticSegmentationTiler::run(const ImageInputData& inpu
 
 std::unique_ptr<Scene> SemanticSegmentationTiler::postprocess_tile(std::unique_ptr<Scene> tile_result,
                                                                         const cv::Rect&) {
-    if (tile_result->new_masks.size() < 2){
+    if (tile_result->masks.size() < 2){
         throw std::runtime_error(
             "SemanticSegmentationTiler requires the underlying model to return ImageResultWithSoftPrediction");
     }
@@ -72,13 +72,13 @@ std::unique_ptr<Scene> SemanticSegmentationTiler::merge_results(
     }
 
     cv::Mat voting_mask(cv::Size(image_size.width, image_size.height), CV_32SC1, cv::Scalar(0));
-    auto first_soft_prediction = tiles_results[0]->new_masks[1].mask;
+    auto first_soft_prediction = tiles_results[0]->masks[1].mask;
     cv::Mat merged_soft_prediction(cv::Size(image_size.width, image_size.height),
                                    CV_32FC(first_soft_prediction.channels()),
                                    cv::Scalar(0));
 
     for (size_t i = 0; i < tiles_results.size(); ++i) {
-        auto soft_prediction = tiles_results[i]->new_masks[1].mask;
+        auto soft_prediction = tiles_results[i]->masks[1].mask;
         voting_mask(tile_coords[i]) += 1;
         merged_soft_prediction(tile_coords[i]) += soft_prediction;
     }
@@ -90,7 +90,7 @@ std::unique_ptr<Scene> SemanticSegmentationTiler::merge_results(
 
     auto scene = std::make_unique<Scene>();
     auto roi = cv::Rect(0, 0, image_size.width, image_size.height);
-    scene->new_masks.push_back(Mask(LabelScore(0, "hard_prediction", 0), roi, hard_prediction));
-    scene->new_masks.push_back(Mask(LabelScore(0, "soft_prediction", 0), roi, merged_soft_prediction));
+    scene->masks.push_back(Mask(LabelScore(0, "hard_prediction", 0), roi, hard_prediction));
+    scene->masks.push_back(Mask(LabelScore(0, "soft_prediction", 0), roi, merged_soft_prediction));
     return scene;
 }
diff --git a/tests/cpp/accuracy/test_accuracy.cpp b/tests/cpp/accuracy/test_accuracy.cpp
index 408486dd..911491a8 100644
--- a/tests/cpp/accuracy/test_accuracy.cpp
+++ b/tests/cpp/accuracy/test_accuracy.cpp
@@ -209,8 +209,8 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         pred = model->infer(image);
                     }
 
-                    if (pred->new_masks.size() == 2) {
-                        cv::Mat soft = pred->new_masks[1].mask;
+                    if (pred->masks.size() == 2) {
+                        cv::Mat soft = pred->masks[1].mask;
                         const std::vector<Contour>& contours = model->getContours(pred);
                         std::stringstream ss;
                         for (const Contour& contour : contours) {
@@ -244,7 +244,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                         result = model->infer(image);
                     }
 
-                    auto rotated_rects = get_rotated_rects(result->new_masks);
+                    auto rotated_rects = get_rotated_rects(result->masks);
                     std::stringstream ss;
                     for (auto& obj : rotated_rects) {
                         ss << obj << "; ";
@@ -265,7 +265,7 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
                     try {
                         // getContours() assumes each instance generates only one contour.
                         // That doesn't hold for some models
-                        for (const Contour& contour : getContours(result->new_masks)) {
+                        for (const Contour& contour : getContours(result->masks)) {
                             ss << contour << "; ";
                         }
                     } catch (const std::runtime_error&) {

From 5bd921d57517abe03cdb8555cab3fb0cd6d5986a Mon Sep 17 00:00:00 2001
From: Ronald Hecker <ronald.hecker@intel.com>
Date: Wed, 14 May 2025 11:25:29 +0200
Subject: [PATCH 12/16] Remove empty return_contours block committed by accident

---
 src/cpp/models/src/segmentation_model.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/cpp/models/src/segmentation_model.cpp b/src/cpp/models/src/segmentation_model.cpp
index 88208a98..197c923b 100644
--- a/src/cpp/models/src/segmentation_model.cpp
+++ b/src/cpp/models/src/segmentation_model.cpp
@@ -278,9 +278,6 @@ std::unique_ptr<Scene> SegmentationModel::postprocess(InferenceResult& infResult
         }
     }
 
-    if (return_contours) {
-
-    }
     return scene;
 }
 

From bed642ea4b331a2b9d4397bf895d4e39e3e7be54 Mon Sep 17 00:00:00 2001
From: Ronald Hecker <ronald.hecker@intel.com>
Date: Wed, 14 May 2025 12:01:50 +0200
Subject: [PATCH 13/16] Clean up results.h

---
 src/cpp/models/include/models/results.h       | 307 +-----------------
 src/cpp/models/src/detection_model_ssd.cpp    |   2 -
 .../src/detection_model_yolov3_onnx.cpp       |   1 -
 3 files changed, 11 insertions(+), 299 deletions(-)

diff --git a/src/cpp/models/include/models/results.h b/src/cpp/models/include/models/results.h
index cdf31e23..883efd9f 100644
--- a/src/cpp/models/include/models/results.h
+++ b/src/cpp/models/include/models/results.h
@@ -41,42 +41,6 @@ struct ResultBase {
     }
 };
 
-struct AnomalyResult : public ResultBase {
-    AnomalyResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
-        : ResultBase(frameId, metaData) {}
-    cv::Mat anomaly_map;
-    std::vector<cv::Rect> pred_boxes;
-    std::string pred_label;
-    cv::Mat pred_mask;
-    double pred_score;
-
-    friend std::ostream& operator<<(std::ostream& os, const AnomalyResult& prediction) {
-        double min_anomaly_map, max_anomaly_map;
-        cv::minMaxLoc(prediction.anomaly_map, &min_anomaly_map, &max_anomaly_map);
-        double min_pred_mask, max_pred_mask;
-        cv::minMaxLoc(prediction.pred_mask, &min_pred_mask, &max_pred_mask);
-        os << "anomaly_map min:" << min_anomaly_map << " max:" << max_anomaly_map << ";";
-        os << "pred_score:" << std::fixed << std::setprecision(1) << prediction.pred_score << ";";
-        os << "pred_label:" << prediction.pred_label << ";";
-        os << std::fixed << std::setprecision(0) << "pred_mask min:" << min_pred_mask << " max:" << max_pred_mask
-           << ";";
-
-        if (!prediction.pred_boxes.empty()) {
-            os << "pred_boxes:";
-            for (const cv::Rect& box : prediction.pred_boxes) {
-                os << box << ",";
-            }
-        }
-
-        return os;
-    }
-    explicit operator std::string() {
-        std::stringstream ss;
-        ss << *this;
-        return ss.str();
-    }
-};
-
 struct InferenceResult : public ResultBase {
     std::shared_ptr<InternalModelData> internalModelData;
     std::map<std::string, ov::Tensor> outputsData;
@@ -98,257 +62,6 @@ struct InferenceResult : public ResultBase {
     }
 };
 
-struct ClassificationResult : public ResultBase {
-    ClassificationResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
-        : ResultBase(frameId, metaData) {}
-
-    friend std::ostream& operator<<(std::ostream& os, const ClassificationResult& prediction) {
-        for (const ClassificationResult::Classification& classification : prediction.topLabels) {
-            os << classification << ", ";
-        }
-        try {
-            os << prediction.saliency_map.get_shape() << ", ";
-        } catch (ov::Exception&) {
-            os << "[0], ";
-        }
-        try {
-            os << prediction.feature_vector.get_shape() << ", ";
-        } catch (ov::Exception&) {
-            os << "[0], ";
-        }
-        try {
-            os << prediction.raw_scores.get_shape();
-        } catch (ov::Exception&) {
-            os << "[0]";
-        }
-        return os;
-    }
-
-    explicit operator std::string() {
-        std::stringstream ss;
-        ss << *this;
-        return ss.str();
-    }
-
-    struct Classification {
-        size_t id;
-        std::string label;
-        float score;
-
-        Classification(size_t id, const std::string& label, float score) : id(id), label(label), score(score) {}
-
-        friend std::ostream& operator<<(std::ostream& os, const Classification& prediction) {
-            return os << prediction.id << " (" << prediction.label << "): " << std::fixed << std::setprecision(3)
-                      << prediction.score;
-        }
-    };
-
-    std::vector<Classification> topLabels;
-    ov::Tensor saliency_map, feature_vector,
-        raw_scores;  // Contains "raw_scores", "saliency_map" and "feature_vector" model outputs if such exist
-};
-
-struct DetectedObject : public cv::Rect2f {
-    size_t labelID;
-    std::string label;
-    float confidence;
-
-    friend std::ostream& operator<<(std::ostream& os, const DetectedObject& detection) {
-        return os << int(detection.x) << ", " << int(detection.y) << ", " << int(detection.x + detection.width) << ", "
-                  << int(detection.y + detection.height) << ", " << detection.labelID << " (" << detection.label
-                  << "): " << std::fixed << std::setprecision(3) << detection.confidence;
-    }
-};
-
-struct DetectionResult : public ResultBase {
-    DetectionResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
-        : ResultBase(frameId, metaData) {}
-    std::vector<DetectedObject> objects;
-    ov::Tensor saliency_map, feature_vector;  // Contan "saliency_map" and "feature_vector" model outputs if such exist
-
-    friend std::ostream& operator<<(std::ostream& os, const DetectionResult& prediction) {
-        for (const DetectedObject& obj : prediction.objects) {
-            os << obj << "; ";
-        }
-        try {
-            os << prediction.saliency_map.get_shape() << "; ";
-        } catch (ov::Exception&) {
-            os << "[0]; ";
-        }
-        try {
-            os << prediction.feature_vector.get_shape();
-        } catch (ov::Exception&) {
-            os << "[0]";
-        }
-        return os;
-    }
-
-    explicit operator std::string() {
-        std::stringstream ss;
-        ss << *this;
-        return ss.str();
-    }
-};
-
-struct RetinaFaceDetectionResult : public DetectionResult {
-    RetinaFaceDetectionResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
-        : DetectionResult(frameId, metaData) {}
-    std::vector<cv::Point2f> landmarks;
-};
-
-struct SegmentedObject : DetectedObject {
-    cv::Mat mask;
-
-    friend std::ostream& operator<<(std::ostream& os, const SegmentedObject& prediction) {
-        return os << static_cast<const DetectedObject&>(prediction) << ", " << cv::countNonZero(prediction.mask > 0.5);
-    }
-};
-
-struct SegmentedObjectWithRects : SegmentedObject {
-    cv::RotatedRect rotated_rect;
-
-    SegmentedObjectWithRects(const SegmentedObject& segmented_object) : SegmentedObject(segmented_object) {}
-
-    friend std::ostream& operator<<(std::ostream& os, const SegmentedObjectWithRects& prediction) {
-        os << static_cast<const SegmentedObject&>(prediction) << std::fixed << std::setprecision(3);
-        auto rect = prediction.rotated_rect;
-        os << ", RotatedRect: " << rect.center.x << ' ' << rect.center.y << ' ' << rect.size.width << ' '
-           << rect.size.height << ' ' << rect.angle;
-        return os;
-    }
-};
-
-static inline std::vector<SegmentedObjectWithRects> add_rotated_rects(std::vector<SegmentedObject> segmented_objects) {
-    std::vector<SegmentedObjectWithRects> objects_with_rects;
-    objects_with_rects.reserve(segmented_objects.size());
-    for (const SegmentedObject& segmented_object : segmented_objects) {
-        objects_with_rects.push_back(SegmentedObjectWithRects{segmented_object});
-        cv::Mat mask;
-        segmented_object.mask.convertTo(mask, CV_8UC1);
-        std::vector<std::vector<cv::Point>> contours;
-        cv::findContours(mask, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE);
-
-        std::vector<cv::Point> contour = {};
-        for (size_t i = 0; i < contours.size(); i++) {
-            contour.insert(contour.end(), contours[i].begin(), contours[i].end());
-        }
-        if (contour.size() > 0) {
-            std::vector<cv::Point> hull;
-            cv::convexHull(contour, hull);
-            objects_with_rects.back().rotated_rect = cv::minAreaRect(hull);
-        }
-    }
-    return objects_with_rects;
-}
-
-struct InstanceSegmentationResult : ResultBase {
-    InstanceSegmentationResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
-        : ResultBase(frameId, metaData) {}
-    std::vector<SegmentedObject> segmentedObjects;
-    // Contains per class saliency_maps and "feature_vector" model output if feature_vector exists
-    std::vector<cv::Mat_<std::uint8_t>> saliency_map;
-    ov::Tensor feature_vector;
-};
-
-struct ImageResult : public ResultBase {
-    ImageResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
-        : ResultBase(frameId, metaData) {}
-    cv::Mat resultImage;
-    friend std::ostream& operator<<(std::ostream& os, const ImageResult& prediction) {
-        cv::Mat predicted_mask[] = {prediction.resultImage};
-        int nimages = 1;
-        int* channels = nullptr;
-        cv::Mat mask;
-        cv::Mat outHist;
-        int dims = 1;
-        int histSize[] = {256};
-        float range[] = {0, 256};
-        const float* ranges[] = {range};
-        cv::calcHist(predicted_mask, nimages, channels, mask, outHist, dims, histSize, ranges);
-
-        os << std::fixed << std::setprecision(3);
-        for (int i = 0; i < range[1]; ++i) {
-            const float count = outHist.at<float>(i);
-            if (count > 0) {
-                os << i << ": " << count / prediction.resultImage.total() << ", ";
-            }
-        }
-        return os;
-    }
-    explicit operator std::string() {
-        std::stringstream ss;
-        ss << *this;
-        return ss.str();
-    }
-};
-
-struct ImageResultWithSoftPrediction : public ImageResult {
-    ImageResultWithSoftPrediction(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
-        : ImageResult(frameId, metaData) {}
-    cv::Mat soft_prediction;
-    // Contain per class saliency_maps and "feature_vector" model output if feature_vector exists
-    cv::Mat saliency_map;  // Requires return_soft_prediction==true
-    ov::Tensor feature_vector;
-    friend std::ostream& operator<<(std::ostream& os, const ImageResultWithSoftPrediction& prediction) {
-        os << static_cast<const ImageResult&>(prediction) << '[';
-        for (int i = 0; i < prediction.soft_prediction.dims; ++i) {
-            os << prediction.soft_prediction.size[i] << ',';
-        }
-        os << prediction.soft_prediction.channels() << "], [";
-        if (prediction.saliency_map.data) {
-            for (int i = 0; i < prediction.saliency_map.dims; ++i) {
-                os << prediction.saliency_map.size[i] << ',';
-            }
-            os << prediction.saliency_map.channels() << "], ";
-        } else {
-            os << "0], ";
-        }
-        try {
-            os << prediction.feature_vector.get_shape();
-        } catch (ov::Exception&) {
-            os << "[0]";
-        }
-        return os;
-    }
-};
-
-struct Contour {
-    std::string label;
-    float probability;
-    std::vector<cv::Point> shape;
-
-    friend std::ostream& operator<<(std::ostream& os, const Contour& contour) {
-        return os << contour.label << ": " << std::fixed << std::setprecision(3) << contour.probability << ", "
-                  << contour.shape.size();
-    }
-};
-
-static inline std::vector<Contour> getContours(const std::vector<SegmentedObject>& segmentedObjects) {
-    std::vector<Contour> combined_contours;
-    std::vector<std::vector<cv::Point>> contours;
-    for (const SegmentedObject& obj : segmentedObjects) {
-        cv::findContours(obj.mask, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_NONE);
-        // Assuming one contour output for findContours. Based on OTX this is a safe
-        // assumption
-        if (contours.size() != 1) {
-            throw std::runtime_error("findContours() must have returned only one contour");
-        }
-        combined_contours.push_back({obj.label, obj.confidence, contours[0]});
-    }
-    return combined_contours;
-}
-
-struct HumanPose {
-    std::vector<cv::Point2f> keypoints;
-    float score;
-};
-
-struct HumanPoseResult : public ResultBase {
-    HumanPoseResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
-        : ResultBase(frameId, metaData) {}
-    std::vector<HumanPose> poses;
-};
-
 struct DetectedKeypoints {
     std::vector<cv::Point2f> keypoints;
     std::vector<float> scores;
@@ -373,12 +86,6 @@ struct DetectedKeypoints {
     }
 };
 
-struct KeypointDetectionResult : public ResultBase {
-    KeypointDetectionResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
-        : ResultBase(frameId, metaData) {}
-    std::vector<DetectedKeypoints> poses;
-};
-
 class Label {
 public:
     Label() {}
@@ -424,6 +131,17 @@ class Mask {
 
 };
 
+struct Contour {
+    std::string label;
+    float probability;
+    std::vector<cv::Point> shape;
+
+    friend std::ostream& operator<<(std::ostream& os, const Contour& contour) {
+        return os << contour.label << ": " << std::fixed << std::setprecision(3) << contour.probability << ", "
+                  << contour.shape.size();
+    }
+};
+
 static inline std::vector<Contour> getContours(const std::vector<Mask>& segmentedObjects) {
     std::vector<Contour> combined_contours;
     std::vector<std::vector<cv::Point>> contours;
@@ -564,9 +282,6 @@ class Scene {
             os << ", " << v.second.get_shape();
         }
 
-
-
-
         return os;
     }
 
diff --git a/src/cpp/models/src/detection_model_ssd.cpp b/src/cpp/models/src/detection_model_ssd.cpp
index 7a938169..69299997 100644
--- a/src/cpp/models/src/detection_model_ssd.cpp
+++ b/src/cpp/models/src/detection_model_ssd.cpp
@@ -180,7 +180,6 @@ std::unique_ptr<Scene> ModelSSD::postprocessMultipleOutputs(InferenceResult& inf
         namesWithoutXai.size() > 2 ? infResult.outputsData[namesWithoutXai[2]].data<float>() : nullptr;
 
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
-    auto result = std::make_unique<DetectionResult>(infResult.frameId, infResult.metaData);
 
     const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
     float floatInputImgWidth = float(internalData.inputImgWidth),
@@ -230,7 +229,6 @@ std::unique_ptr<Scene> ModelSSD::postprocessMultipleOutputs(InferenceResult& inf
         }
     }
 
-    //scene->detection_result = std::move(result);
     return scene;
 }
 
diff --git a/src/cpp/models/src/detection_model_yolov3_onnx.cpp b/src/cpp/models/src/detection_model_yolov3_onnx.cpp
index fe51302a..7827e2b9 100644
--- a/src/cpp/models/src/detection_model_yolov3_onnx.cpp
+++ b/src/cpp/models/src/detection_model_yolov3_onnx.cpp
@@ -134,7 +134,6 @@ std::unique_ptr<Scene> ModelYoloV3ONNX::postprocess(InferenceResult& infResult)
 
     // Generate detection results
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
-    auto result = std::make_unique<DetectionResult>(infResult.frameId, infResult.metaData);
     size_t numberOfBoxes = indicesShape.size() == 3 ? indicesShape[1] : indicesShape[0];
     size_t indicesStride = indicesShape.size() == 3 ? indicesShape[2] : indicesShape[1];
 

From 476ffa11b81620ec926823b487a5b1df61305ef6 Mon Sep 17 00:00:00 2001
From: Ronald Hecker <ronald.hecker@intel.com>
Date: Wed, 14 May 2025 12:06:28 +0200
Subject: [PATCH 14/16] Move public_scope changes to cpp folder

The public scope reference output is different now.
Until the Python output matches the C++ output, the reference outputs are kept in separate files.
---
 .github/workflows/test_accuracy.yml     |   2 +-
 tests/cpp/accuracy/public_scope.json    | 447 ++++++++++++++++++++++++
 tests/python/accuracy/public_scope.json |  52 +--
 3 files changed, 474 insertions(+), 27 deletions(-)
 create mode 100644 tests/cpp/accuracy/public_scope.json

diff --git a/.github/workflows/test_accuracy.yml b/.github/workflows/test_accuracy.yml
index 61c877ca..f94157b4 100644
--- a/.github/workflows/test_accuracy.yml
+++ b/.github/workflows/test_accuracy.yml
@@ -46,4 +46,4 @@ jobs:
           make -j
       - name: Run CPP Test
         run: |
-          build/test_accuracy -d data -p tests/python/accuracy/public_scope.json
+          build/test_accuracy -d data -p tests/cpp/accuracy/public_scope.json
diff --git a/tests/cpp/accuracy/public_scope.json b/tests/cpp/accuracy/public_scope.json
new file mode 100644
index 00000000..6ec278e6
--- /dev/null
+++ b/tests/cpp/accuracy/public_scope.json
@@ -0,0 +1,447 @@
+[
+  {
+    "name": "otx_models/Lite-hrnet-18.xml",
+    "type": "SegmentationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "object: 0.675, 508, object: 0.527, 65, object: 0.507, 18, object: 0.624, 144, object: 0.538, 67, object: 0.507, 15, object: 0.518, 41, object: 0.507, 8, object: 0.505, 14, object: 0.885, 2138, "
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/Lite-hrnet-18_mod2.xml",
+    "type": "SegmentationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "object: 0.555, 112, object: 0.506, 17, object: 0.555, 154, object: 0.511, 19, object: 0.514, 52, "
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/Lite-hrnet-s_mod2.xml",
+    "type": "SegmentationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "object: 0.520, 26, object: 0.530, 42, object: 0.501, 4, object: 0.507, 27, object: 0.503, 8, object: 0.502, 6, object: 0.505, 18, object: 0.504, 13, object: 0.524, 87, object: 0.521, 89, object: 0.757, 2706, "
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/Lite-hrnet-s_mod2.onnx",
+    "type": "SegmentationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "object: 0.520, 26, object: 0.530, 42, object: 0.501, 4, object: 0.507, 27, object: 0.503, 8, object: 0.502, 6, object: 0.505, 18, object: 0.504, 13, object: 0.524, 87, object: 0.521, 89, object: 0.757, 2706, "
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/Lite-hrnet-s_mod2.onnx",
+    "type": "SegmentationModel",
+    "force_ort": "True",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "object: 0.519, 26, object: 0.531, 42, object: 0.502, 21, object: 0.505, 9, object: 0.501, 4, object: 0.509, 22, object: 0.524, 85, object: 0.520, 93, object: 0.754, 2564, "
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/Lite-hrnet-x-mod3.xml",
+    "type": "SegmentationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "object: 0.505, 2, object: 0.518, 8, object: 0.512, 5, object: 0.506, 4, object: 0.526, 8, object: 0.529, 21, object: 0.513, 12, object: 0.535, 49, object: 0.505, 2, object: 0.512, 4, object: 0.547, 6, object: 0.511, 6, object: 0.503, 1, object: 0.539, 6, object: 0.543, 39, object: 0.529, 2, object: 0.516, 9, object: 0.565, 157, object: 0.524, 6, object: 0.528, 15, object: 0.521, 18, object: 0.502, 1, object: 0.537, 73, object: 0.513, 4, object: 0.524, 27, object: 0.513, 6, object: 0.538, 65, object: 0.501, 6, object: 0.504, 1, object: 0.507, 4, object: 0.502, 1, object: 0.518, 8, object: 0.530, 11, object: 0.502, 2, object: 0.516, 2, object: 0.506, 1, object: 0.567, 17, object: 0.502, 1, object: 0.512, 7, object: 0.538, 24, object: 0.507, 1, object: 0.534, 12, object: 0.537, 6, object: 0.519, 13, object: 0.505, 2, object: 0.517, 16, object: 0.505, 5, object: 0.506, 20, object: 0.508, 6, object: 0.519, 24, object: 0.507, 4, object: 0.506, 2, object: 0.511, 4, object: 0.556, 47, object: 0.510, 10, object: 0.500, 1, object: 0.504, 5, object: 0.501, 1, object: 0.510, 6, object: 0.549, 13, object: 0.509, 2, object: 0.510, 3, object: 0.514, 1, object: 0.529, 15, object: 0.551, 110, object: 0.504, 2, object: 0.503, 3, object: 0.518, 16, object: 0.511, 14, object: 0.502, 1, object: 0.523, 1, object: 0.533, 16, object: 0.568, 66, object: 0.582, 1793, "
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/det_mobilenetv2_atss_bccd.xml",
+    "type": "DetectionModel",
+    "test_data": [
+      {
+        "image": "BloodImage_00007.jpg",
+        "reference": [
+          "494, 159, 637, 308, 2 (WBC): 0.697; 28, 139, 135, 228, 1 (RBC): 0.628; 535, 375, 638, 479, 1 (RBC): 0.524; 513, 8, 633, 152, 1 (RBC): 0.430; 21, 291, 143, 399, 1 (RBC): 0.422; 196, 86, 410, 286, 1 (RBC): 0.422; [0]; [0]"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/det_mobilenetv2_atss_bccd_onnx.onnx",
+    "type": "DetectionModel",
+    "test_data": [
+      {
+        "image": "BloodImage_00007.jpg",
+        "reference": [
+          "494, 159, 637, 308, 2 (WBC): 0.697; 28, 139, 135, 228, 1 (RBC): 0.628; 535, 375, 638, 479, 1 (RBC): 0.524; 513, 8, 633, 152, 1 (RBC): 0.430; 21, 291, 143, 399, 1 (RBC): 0.422; 196, 86, 410, 286, 1 (RBC): 0.422; [0]; [0]"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/det_mobilenetv2_atss_bccd_onnx.onnx",
+    "type": "DetectionModel",
+    "force_ort": "True",
+    "test_data": [
+      {
+        "image": "BloodImage_00007.jpg",
+        "reference": [
+          "494, 159, 637, 308, 2 (WBC): 0.700; 28, 139, 136, 228, 1 (RBC): 0.623; 535, 374, 638, 479, 1 (RBC): 0.535; 513, 8, 633, 151, 1 (RBC): 0.454; 197, 86, 412, 286, 1 (RBC): 0.405; 21, 291, 143, 399, 1 (RBC): 0.401; [0]; [0]"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/mlc_mobilenetv3_large_voc.xml",
+    "type": "ClassificationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000081.jpg",
+        "reference": ["0, 0, 640, 425, 0 (aeroplane): 0.943; [0]; [0]"]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/mlc_efficient_b0_voc.xml",
+    "type": "ClassificationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "0, 0, 640, 426, 1 (bicycle): 0.768, 11 (dog): 0.876, 14 (person): 0.922; [0]; [0]"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/mlc_efficient_v2s_voc.xml",
+    "type": "ClassificationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "0, 0, 640, 426, 1 (bicycle): 0.825, 11 (dog): 0.873, 14 (person): 0.824; [0]; [0]"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/cls_mobilenetv3_large_cars.xml",
+    "type": "ClassificationModel",
+    "check_extra_rt_info": "True",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000471.jpg",
+        "reference": ["0, 0, 640, 427, 105 (194): 0.456; [0]; [0]"]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/cls_mobilenetv3_large_cars.onnx",
+    "type": "ClassificationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000471.jpg",
+        "reference": ["0, 0, 640, 427, 105 (194): 0.456; [0]; [0]"]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/cls_mobilenetv3_large_cars.onnx",
+    "type": "ClassificationModel",
+    "force_ort": "True",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000471.jpg",
+        "reference": ["105 (194): 0.062, [0], [0], [196]"]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/cls_efficient_b0_cars.xml",
+    "type": "ClassificationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000471.jpg",
+        "reference": ["0, 0, 640, 427, 0 (1): 0.838; [0]; [0]"]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/cls_efficient_b0_shuffled_outputs.xml",
+    "type": "ClassificationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000471.jpg",
+        "reference": [
+          "0, 0, 640, 427, 4 (Circle): 0.943, 5 (Lion): 0.969, 3 (Non-Rigid): 0.503, 6 (Panda): 0.988; [1,7,7,7]; [1,7]"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/cls_efficient_v2s_cars.xml",
+    "type": "ClassificationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000471.jpg",
+        "reference": ["0, 0, 640, 427, 0 (1): 0.849; [0]; [0]"]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/is_efficientnetb2b_maskrcnn_coco_reduced.xml",
+    "type": "MaskRCNNModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "RotatedRect: 478.119, 130.332, 28.677, 46.408, 46.6371 (bicycle): 0.818; ; RotatedRect: 79.739, 177.262, 251.785, 156.656, 87.3972 (car): 0.753; ; 0; [0]; bicycle: 0.818, 139; car: 0.753, 622; "
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/is_efficientnetb2b_maskrcnn_coco_reduced_onnx.onnx",
+    "type": "MaskRCNNModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "RotatedRect: 478.119, 130.332, 28.677, 46.408, 46.6371 (bicycle): 0.818; ; RotatedRect: 79.739, 177.262, 251.785, 156.656, 87.3972 (car): 0.753; ; 0; [0]; bicycle: 0.818, 139; car: 0.753, 622; "
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/is_efficientnetb2b_maskrcnn_coco_reduced_onnx.onnx",
+    "type": "MaskRCNNModel",
+    "force_ort": "True",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "RotatedRect: 210.000, 327.500, 101.000, 296.000, 90.00016 (horse): 0.999; ; RotatedRect: 82.086, 163.312, 307.394, 156.997, 89.6692 (car): 0.999; ; RotatedRect: 305.000, 123.500, 59.000, 18.000, 90.0001 (bicycle): 0.985; ; RotatedRect: 332.500, 116.000, 38.000, 13.000, 90.0001 (bicycle): 0.974; ; RotatedRect: 476.052, 126.972, 27.619, 47.834, 16.9281 (bicycle): 0.918; ; RotatedRect: 369.319, 119.891, 54.848, 34.230, 82.4051 (bicycle): 0.807; ; RotatedRect: 284.000, 127.500, 35.000, 10.000, 90.0001 (bicycle): 0.788; ; 0; [0]; horse: 0.999, 668; car: 0.999, 782; bicycle: 0.985, 127; bicycle: 0.974, 87; bicycle: 0.918, 122; bicycle: 0.807, 140; bicycle: 0.788, 79; "
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/is_resnet50_maskrcnn_coco_reduced.xml",
+    "type": "MaskRCNNModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "RotatedRect: 210.000, 327.500, 101.000, 296.000, 90.00016 (horse): 0.999; ; RotatedRect: 82.086, 163.312, 307.394, 156.997, 89.6692 (car): 0.999; ; RotatedRect: 305.000, 123.500, 59.000, 18.000, 90.0001 (bicycle): 0.985; ; RotatedRect: 332.500, 116.000, 38.000, 13.000, 90.0001 (bicycle): 0.974; ; RotatedRect: 476.052, 126.972, 27.619, 47.834, 16.9281 (bicycle): 0.918; ; RotatedRect: 369.319, 119.891, 54.848, 34.230, 82.4051 (bicycle): 0.807; ; RotatedRect: 284.000, 127.500, 35.000, 10.000, 90.0001 (bicycle): 0.788; ; 0; [0]; horse: 0.999, 668; car: 0.999, 782; bicycle: 0.985, 127; bicycle: 0.974, 87; bicycle: 0.918, 122; bicycle: 0.807, 140; bicycle: 0.788, 79; "
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/mobilenet_v3_large_hc_cf.xml",
+    "type": "ClassificationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000081.jpg",
+        "reference": [
+          "3 (equilateral): 0.596, 1 (multi a): 0.922, 2 (multi b): 0.696, 5 (triangle): 0.993, [0], [0], [0]"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/classification_model_with_xai_head.xml",
+    "type": "ClassificationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000081.jpg",
+        "reference": ["0, 0, 640, 425, 0 (horse): 0.543; [1,4,7,7]; [1,1280,1,1]"]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/detection_model_with_xai_head.xml",
+    "type": "DetectionModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "61, 277, 355, 379, 1 (person): 0.364; 461, 105, 495, 149, 1 (person): 0.305; [1,2,6,8]; [1,320,1,1]"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/segmentation_model_with_xai_head.xml",
+    "type": "SegmentationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "backpack: 0.505, 2, "
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/maskrcnn_model_with_xai_head.xml",
+    "type": "MaskRCNNModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "RotatedRect: 212.000, 327.000, 290.000, 100.000, 0.00017 (horse): 0.998; ; RotatedRect: 54.067, 173.034, 285.208, 156.889, 61.9962 (car): 0.994; ; RotatedRect: 333.500, 114.000, 36.000, 13.000, 90.0001 (bicycle): 0.930; ; RotatedRect: 476.284, 126.621, 27.308, 45.993, 19.1791 (bicycle): 0.898; ; RotatedRect: 304.000, 124.000, 58.000, 18.000, 90.0001 (bicycle): 0.869; ; RotatedRect: 283.500, 130.000, 42.000, 11.000, 90.0001 (bicycle): 0.817; ; RotatedRect: 51.806, 97.259, 184.445, 95.281, 89.2462 (car): 0.701; ; RotatedRect: 280.500, 122.500, 17.000, 59.000, 0.0001 (bicycle): 0.660; ; RotatedRect: 332.000, 133.000, 34.000, 14.000, 90.00018 (sheep): 0.520; ; 4; [1,1280,1,1]; "
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/maskrcnn_xai_tiling.xml",
+    "type": "MaskRCNNModel",
+    "tiler": "InstanceSegmentationTiler",
+    "extra_model": "otx_models/tile_classifier.xml",
+    "extra_type": "ImageModel",
+    "input_res": "(3500,3500)",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "RotatedRect: 1598.500, 641.500, 111.000, 109.000, 90.0002 (ellipse): 0.643; ; RotatedRect: 3097.500, 3104.000, 14.000, 13.000, 90.0001 (rectangle): 0.483; ; RotatedRect: 2800.000, 188.500, 255.000, 132.000, 90.0001 (rectangle): 0.401; ; 2; [1,1280,1,1]; ellipse: 0.643, 331; rectangle: 0.483, 48; rectangle: 0.401, 758; "
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/detection_model_with_xai_head.xml",
+    "type": "DetectionModel",
+    "tiler": "DetectionTiler",
+    "input_res": "(3500,3500)",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "336, 2275, 1944, 3114, 1 (person): 0.361; 2523, 862, 2709, 1224, 1 (person): 0.313; [1,2,35,46]; [1,320,1,1]"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/anomaly_padim_bottle_mvtec.xml",
+    "type": "AnomalyDetection",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "0, 0, 640, 426, 1 (Anomaly): 0.854; [1,1,426,640]; pred_mask min:0.000 max:1.000;[0]"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/anomaly_stfpm_bottle_mvtec.xml",
+    "type": "AnomalyDetection",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "0, 0, 640, 426, 1 (Anomaly): 0.854; [1,1,426,640]; 1 (Anomaly): 0.854[640 x 426 from (0, 0)] min:0.000 max:1.000;[0]"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/deit-tiny.xml",
+    "type": "ClassificationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": ["0, 0, 640, 426, 3 (cat): 0.648; [0]; [0], [10]"]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/action_cls_xd3_kinetic.xml",
+    "type": "ActionClassificationModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000471.jpg",
+        "reference": ["38 (WritingOnBoard): 0.096, [0], [0], [0]"]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/sam_vit_b_zsl_decoder.xml",
+    "type": "SAMDecoder",
+    "prompter": "SAMLearnableVisualPrompter",
+    "encoder": "otx_models/sam_vit_b_zsl_encoder.xml",
+    "encoder_type": "SAMImageEncoder",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000471.jpg",
+        "reference": [
+          "mask sum: 108565; [385.0, 315.0] iou: 0.930 [335.0, 414.0] iou: 0.763 [44.0, 205.0] iou: 0.665 [605.0, 224.0] iou: 0.653, mask sum: 73931; [175.0, 215.0] iou: 0.781 [124.0, 165.0] iou: 0.651"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/sam_vit_b_zsl_decoder.xml",
+    "type": "SAMDecoder",
+    "prompter": "SAMVisualPrompter",
+    "encoder": "otx_models/sam_vit_b_zsl_encoder.xml",
+    "encoder_type": "SAMImageEncoder",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000471.jpg",
+        "reference": [
+          "upscaled_masks min:-25.907 max:11.185;hard_predictions shape:(4, 427, 640);"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/rtmpose_tiny.xml",
+    "type": "KeypointDetectionModel",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000471.jpg",
+        "reference": [
+          "keypoints: (17, 2), keypoints_x_sum: 2930.000, scores: (17,) 14.061"
+        ]
+      }
+    ]
+  },
+  {
+    "name": "otx_models/segnext_t_tiling.xml",
+    "type": "SegmentationModel",
+    "tiler": "SemanticSegmentationTiler",
+    "input_res": "(3500,3500)",
+    "test_data": [
+      {
+        "image": "coco128/images/train2017/000000000074.jpg",
+        "reference": [
+          "background: 1.404, 311, background: 1.397, 44, background: 1.371, 34, background: 1.377, 12, background: 1.356, 155, background: 1.345, 12, background: 1.183, 219, background: 1.524, 8, background: 1.533, 4, background: 1.519, 2, background: 1.524, 4, background: 1.530, 6, background: 1.537, 2, background: 1.514, 4, background: 1.519, 8, background: 1.529, 6, background: 1.550, 6, background: 1.558, 4, background: 1.520, 2, background: 1.529, 4, background: 1.532, 6, background: 1.535, 6, background: 1.530, 2, background: 1.529, 50, background: 1.528, 22, background: 1.527, 38, background: 1.451, 1476, background: 1.345, 2743, background: 1.609, 2987, background: 1.636, 29909, "
+        ]
+      }
+    ]
+  }
+]
diff --git a/tests/python/accuracy/public_scope.json b/tests/python/accuracy/public_scope.json
index 6ec278e6..e244ece1 100644
--- a/tests/python/accuracy/public_scope.json
+++ b/tests/python/accuracy/public_scope.json
@@ -6,7 +6,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "object: 0.675, 508, object: 0.527, 65, object: 0.507, 18, object: 0.624, 144, object: 0.538, 67, object: 0.507, 15, object: 0.518, 41, object: 0.507, 8, object: 0.505, 14, object: 0.885, 2138, "
+          "0: 0.537, 1: 0.463, [426,640,2], [0], [0]; object: 0.675, 508, object: 0.527, 65, object: 0.507, 18, object: 0.624, 144, object: 0.538, 67, object: 0.507, 15, object: 0.518, 41, object: 0.507, 8, object: 0.505, 14, object: 0.885, 2138, "
         ]
       }
     ]
@@ -18,7 +18,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "object: 0.555, 112, object: 0.506, 17, object: 0.555, 154, object: 0.511, 19, object: 0.514, 52, "
+          "0: 0.992, 1: 0.008, [426,640,2], [0], [0]; object: 0.555, 112, object: 0.506, 17, object: 0.555, 154, object: 0.511, 19, object: 0.514, 52, "
         ]
       }
     ]
@@ -30,7 +30,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "object: 0.520, 26, object: 0.530, 42, object: 0.501, 4, object: 0.507, 27, object: 0.503, 8, object: 0.502, 6, object: 0.505, 18, object: 0.504, 13, object: 0.524, 87, object: 0.521, 89, object: 0.757, 2706, "
+          "0: 0.563, 1: 0.437, [426,640,2], [0], [0]; object: 0.520, 26, object: 0.530, 42, object: 0.501, 4, object: 0.507, 27, object: 0.503, 8, object: 0.502, 6, object: 0.505, 18, object: 0.504, 13, object: 0.524, 87, object: 0.521, 89, object: 0.757, 2706, "
         ]
       }
     ]
@@ -42,7 +42,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "object: 0.520, 26, object: 0.530, 42, object: 0.501, 4, object: 0.507, 27, object: 0.503, 8, object: 0.502, 6, object: 0.505, 18, object: 0.504, 13, object: 0.524, 87, object: 0.521, 89, object: 0.757, 2706, "
+          "0: 0.563, 1: 0.437, [426,640,2], [0], [0]; object: 0.520, 26, object: 0.530, 42, object: 0.501, 4, object: 0.507, 27, object: 0.503, 8, object: 0.502, 6, object: 0.505, 18, object: 0.504, 13, object: 0.524, 87, object: 0.521, 89, object: 0.757, 2706, "
         ]
       }
     ]
@@ -55,7 +55,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "object: 0.519, 26, object: 0.531, 42, object: 0.502, 21, object: 0.505, 9, object: 0.501, 4, object: 0.509, 22, object: 0.524, 85, object: 0.520, 93, object: 0.754, 2564, "
+          "0: 0.561, 1: 0.439, [426,640,2], [0], [0]; object: 0.519, 26, object: 0.531, 42, object: 0.502, 21, object: 0.505, 9, object: 0.501, 4, object: 0.509, 22, object: 0.524, 85, object: 0.520, 93, object: 0.754, 2564, "
         ]
       }
     ]
@@ -67,7 +67,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "object: 0.505, 2, object: 0.518, 8, object: 0.512, 5, object: 0.506, 4, object: 0.526, 8, object: 0.529, 21, object: 0.513, 12, object: 0.535, 49, object: 0.505, 2, object: 0.512, 4, object: 0.547, 6, object: 0.511, 6, object: 0.503, 1, object: 0.539, 6, object: 0.543, 39, object: 0.529, 2, object: 0.516, 9, object: 0.565, 157, object: 0.524, 6, object: 0.528, 15, object: 0.521, 18, object: 0.502, 1, object: 0.537, 73, object: 0.513, 4, object: 0.524, 27, object: 0.513, 6, object: 0.538, 65, object: 0.501, 6, object: 0.504, 1, object: 0.507, 4, object: 0.502, 1, object: 0.518, 8, object: 0.530, 11, object: 0.502, 2, object: 0.516, 2, object: 0.506, 1, object: 0.567, 17, object: 0.502, 1, object: 0.512, 7, object: 0.538, 24, object: 0.507, 1, object: 0.534, 12, object: 0.537, 6, object: 0.519, 13, object: 0.505, 2, object: 0.517, 16, object: 0.505, 5, object: 0.506, 20, object: 0.508, 6, object: 0.519, 24, object: 0.507, 4, object: 0.506, 2, object: 0.511, 4, object: 0.556, 47, object: 0.510, 10, object: 0.500, 1, object: 0.504, 5, object: 0.501, 1, object: 0.510, 6, object: 0.549, 13, object: 0.509, 2, object: 0.510, 3, object: 0.514, 1, object: 0.529, 15, object: 0.551, 110, object: 0.504, 2, object: 0.503, 3, object: 0.518, 16, object: 0.511, 14, object: 0.502, 1, object: 0.523, 1, object: 0.533, 16, object: 0.568, 66, object: 0.582, 1793, "
+          "0: 0.944, 1: 0.056, [426,640,2], [0], [0]; object: 0.505, 2, object: 0.518, 8, object: 0.512, 5, object: 0.506, 4, object: 0.526, 8, object: 0.529, 21, object: 0.513, 12, object: 0.535, 49, object: 0.505, 2, object: 0.512, 4, object: 0.547, 6, object: 0.511, 6, object: 0.503, 1, object: 0.539, 6, object: 0.543, 39, object: 0.529, 2, object: 0.516, 9, object: 0.565, 157, object: 0.524, 6, object: 0.528, 15, object: 0.521, 18, object: 0.502, 1, object: 0.537, 73, object: 0.513, 4, object: 0.524, 27, object: 0.513, 6, object: 0.538, 65, object: 0.501, 6, object: 0.504, 1, object: 0.507, 4, object: 0.502, 1, object: 0.518, 8, object: 0.530, 11, object: 0.502, 2, object: 0.516, 2, object: 0.506, 1, object: 0.567, 17, object: 0.502, 1, object: 0.512, 7, object: 0.538, 24, object: 0.507, 1, object: 0.534, 12, object: 0.537, 6, object: 0.519, 13, object: 0.505, 2, object: 0.517, 16, object: 0.505, 5, object: 0.506, 20, object: 0.508, 6, object: 0.519, 24, object: 0.507, 4, object: 0.506, 2, object: 0.511, 4, object: 0.556, 47, object: 0.510, 10, object: 0.500, 1, object: 0.504, 5, object: 0.501, 1, object: 0.510, 6, object: 0.549, 13, object: 0.509, 2, object: 0.510, 3, object: 0.514, 1, object: 0.529, 15, object: 0.551, 110, object: 0.504, 2, object: 0.503, 3, object: 0.518, 16, object: 0.511, 14, object: 0.502, 1, object: 0.523, 1, object: 0.533, 16, object: 0.568, 66, object: 0.582, 1793, "
         ]
       }
     ]
@@ -115,7 +115,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000081.jpg",
-        "reference": ["0, 0, 640, 425, 0 (aeroplane): 0.943; [0]; [0]"]
+        "reference": ["0 (aeroplane): 0.943, [0], [0], [0]"]
       }
     ]
   },
@@ -126,7 +126,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "0, 0, 640, 426, 1 (bicycle): 0.768, 11 (dog): 0.876, 14 (person): 0.922; [0]; [0]"
+          "1 (bicycle): 0.768, 11 (dog): 0.876, 14 (person): 0.922, [0], [0], [0]"
         ]
       }
     ]
@@ -138,7 +138,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "0, 0, 640, 426, 1 (bicycle): 0.825, 11 (dog): 0.873, 14 (person): 0.824; [0]; [0]"
+          "1 (bicycle): 0.825, 11 (dog): 0.873, 14 (person): 0.824, [0], [0], [0]"
         ]
       }
     ]
@@ -150,7 +150,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000471.jpg",
-        "reference": ["0, 0, 640, 427, 105 (194): 0.456; [0]; [0]"]
+        "reference": ["105 (194): 0.456, [0], [0], [0]"]
       }
     ]
   },
@@ -160,7 +160,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000471.jpg",
-        "reference": ["0, 0, 640, 427, 105 (194): 0.456; [0]; [0]"]
+        "reference": ["105 (194): 0.456, [0], [0], [196]"]
       }
     ]
   },
@@ -181,7 +181,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000471.jpg",
-        "reference": ["0, 0, 640, 427, 0 (1): 0.838; [0]; [0]"]
+        "reference": ["0 (1): 0.838, [0], [0], [0]"]
       }
     ]
   },
@@ -192,7 +192,7 @@
       {
         "image": "coco128/images/train2017/000000000471.jpg",
         "reference": [
-          "0, 0, 640, 427, 4 (Circle): 0.943, 5 (Lion): 0.969, 3 (Non-Rigid): 0.503, 6 (Panda): 0.988; [1,7,7,7]; [1,7]"
+          "4 (Circle): 0.943, 5 (Lion): 0.969, 3 (Non-Rigid): 0.503, 6 (Panda): 0.988, [1,7,7,7], [1,7], [0]"
         ]
       }
     ]
@@ -203,7 +203,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000471.jpg",
-        "reference": ["0, 0, 640, 427, 0 (1): 0.849; [0]; [0]"]
+        "reference": ["0 (1): 0.849, [0], [0], [0]"]
       }
     ]
   },
@@ -214,7 +214,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "RotatedRect: 478.119, 130.332, 28.677, 46.408, 46.6371 (bicycle): 0.818; ; RotatedRect: 79.739, 177.262, 251.785, 156.656, 87.3972 (car): 0.753; ; 0; [0]; bicycle: 0.818, 139; car: 0.753, 622; "
+          "458, 106, 495, 150, 1 (bicycle): 0.818, 852, RotatedRect: 478.119 130.332 28.677 46.408 46.637; 0, 30, 178, 323, 2 (car): 0.753, 26728, RotatedRect: 79.739 177.262 251.785 156.656 87.397; 0; [0]; bicycle: 0.818, 139; car: 0.753, 622; "
         ]
       }
     ]
@@ -226,7 +226,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "RotatedRect: 478.119, 130.332, 28.677, 46.408, 46.6371 (bicycle): 0.818; ; RotatedRect: 79.739, 177.262, 251.785, 156.656, 87.3972 (car): 0.753; ; 0; [0]; bicycle: 0.818, 139; car: 0.753, 622; "
+          "458, 106, 495, 150, 1 (person): 0.818, 852, RotatedRect: 478.119 130.332 28.677 46.408 46.637; 0, 30, 178, 323, 2 (bicycle): 0.753, 26728, RotatedRect: 79.739 177.262 251.785 156.656 87.397; 0; [0]; person: 0.818, 139; bicycle: 0.753, 622; "
         ]
       }
     ]
@@ -239,7 +239,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "RotatedRect: 210.000, 327.500, 101.000, 296.000, 90.00016 (horse): 0.999; ; RotatedRect: 82.086, 163.312, 307.394, 156.997, 89.6692 (car): 0.999; ; RotatedRect: 305.000, 123.500, 59.000, 18.000, 90.0001 (bicycle): 0.985; ; RotatedRect: 332.500, 116.000, 38.000, 13.000, 90.0001 (bicycle): 0.974; ; RotatedRect: 476.052, 126.972, 27.619, 47.834, 16.9281 (bicycle): 0.918; ; RotatedRect: 369.319, 119.891, 54.848, 34.230, 82.4051 (bicycle): 0.807; ; RotatedRect: 284.000, 127.500, 35.000, 10.000, 90.0001 (bicycle): 0.788; ; 0; [0]; horse: 0.999, 668; car: 0.999, 782; bicycle: 0.985, 127; bicycle: 0.974, 87; bicycle: 0.918, 122; bicycle: 0.807, 140; bicycle: 0.788, 79; "
+          "458, 106, 495, 150, 1 (person): 0.816, 851, RotatedRect: 478.119 130.332 28.677 46.408 46.637; 0, 30, 178, 323, 2 (bicycle): 0.754, 26748, RotatedRect: 79.762 177.261 251.785 156.702 87.397; 0; [0]; person: 0.816, 142; bicycle: 0.754, 622; "
         ]
       }
     ]
@@ -251,7 +251,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "RotatedRect: 210.000, 327.500, 101.000, 296.000, 90.00016 (horse): 0.999; ; RotatedRect: 82.086, 163.312, 307.394, 156.997, 89.6692 (car): 0.999; ; RotatedRect: 305.000, 123.500, 59.000, 18.000, 90.0001 (bicycle): 0.985; ; RotatedRect: 332.500, 116.000, 38.000, 13.000, 90.0001 (bicycle): 0.974; ; RotatedRect: 476.052, 126.972, 27.619, 47.834, 16.9281 (bicycle): 0.918; ; RotatedRect: 369.319, 119.891, 54.848, 34.230, 82.4051 (bicycle): 0.807; ; RotatedRect: 284.000, 127.500, 35.000, 10.000, 90.0001 (bicycle): 0.788; ; 0; [0]; horse: 0.999, 668; car: 0.999, 782; bicycle: 0.985, 127; bicycle: 0.974, 87; bicycle: 0.918, 122; bicycle: 0.807, 140; bicycle: 0.788, 79; "
+          "59, 277, 360, 380, 16 (horse): 0.999, 19053, RotatedRect: 210.000 327.500 101.000 296.000 90.000; 2, 9, 162, 318, 2 (car): 0.999, 31153, RotatedRect: 82.086 163.312 307.394 156.997 89.669; 294, 94, 316, 153, 1 (bicycle): 0.985, 840, RotatedRect: 305.000 123.500 59.000 18.000 90.000; 326, 97, 341, 136, 1 (bicycle): 0.974, 397, RotatedRect: 332.500 116.000 38.000 13.000 90.000; 461, 105, 493, 150, 1 (bicycle): 0.918, 846, RotatedRect: 476.052 126.972 27.619 47.834 16.928; 350, 92, 386, 149, 1 (bicycle): 0.807, 1458, RotatedRect: 369.319 119.891 54.848 34.230 82.405; 279, 110, 291, 146, 1 (bicycle): 0.788, 312, RotatedRect: 284.000 127.500 35.000 10.000 90.000; 0; [0]; horse: 0.999, 668; car: 0.999, 782; bicycle: 0.985, 127; bicycle: 0.974, 87; bicycle: 0.918, 122; bicycle: 0.807, 140; bicycle: 0.788, 79; "
         ]
       }
     ]
@@ -274,7 +274,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000081.jpg",
-        "reference": ["0, 0, 640, 425, 0 (horse): 0.543; [1,4,7,7]; [1,1280,1,1]"]
+        "reference": ["0 (horse): 0.543, [1,4,7,7], [1,1280,1,1], [0]"]
       }
     ]
   },
@@ -297,7 +297,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "backpack: 0.505, 2, "
+          "0: 1.000, 1: 0.000, [426,640,3], [426,640,3], [1,600,1,1]; backpack: 0.505, 2, "
         ]
       }
     ]
@@ -309,7 +309,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "RotatedRect: 212.000, 327.000, 290.000, 100.000, 0.00017 (horse): 0.998; ; RotatedRect: 54.067, 173.034, 285.208, 156.889, 61.9962 (car): 0.994; ; RotatedRect: 333.500, 114.000, 36.000, 13.000, 90.0001 (bicycle): 0.930; ; RotatedRect: 476.284, 126.621, 27.308, 45.993, 19.1791 (bicycle): 0.898; ; RotatedRect: 304.000, 124.000, 58.000, 18.000, 90.0001 (bicycle): 0.869; ; RotatedRect: 283.500, 130.000, 42.000, 11.000, 90.0001 (bicycle): 0.817; ; RotatedRect: 51.806, 97.259, 184.445, 95.281, 89.2462 (car): 0.701; ; RotatedRect: 280.500, 122.500, 17.000, 59.000, 0.0001 (bicycle): 0.660; ; RotatedRect: 332.000, 133.000, 34.000, 14.000, 90.00018 (sheep): 0.520; ; 4; [1,1280,1,1]; "
+          "61, 277, 358, 382, 17 (horse): 0.998, 18312, RotatedRect: 212.000 327.000 290.000 100.000 0.000; 1, 14, 162, 321, 2 (car): 0.994, 25867, RotatedRect: 54.067 173.034 285.208 156.889 61.996; 327, 96, 341, 134, 1 (bicycle): 0.930, 279, RotatedRect: 333.500 114.000 36.000 13.000 90.000; 460, 106, 493, 148, 1 (bicycle): 0.898, 786, RotatedRect: 476.284 126.621 27.308 45.993 19.179; 294, 93, 315, 153, 1 (bicycle): 0.869, 789, RotatedRect: 304.000 124.000 58.000 18.000 90.000; 278, 109, 290, 152, 1 (bicycle): 0.817, 355, RotatedRect: 283.500 130.000 42.000 11.000 90.000; 4, 4, 102, 191, 2 (car): 0.701, 9658, RotatedRect: 51.806 97.259 184.445 95.281 89.246; 270, 93, 290, 152, 1 (bicycle): 0.660, 723, RotatedRect: 280.500 122.500 17.000 59.000 0.000; 322, 114, 343, 152, 18 (sheep): 0.520, 298, RotatedRect: 332.000 133.000 34.000 14.000 90.000; 4; [1,1280,1,1]; "
         ]
       }
     ]
@@ -325,7 +325,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "RotatedRect: 1598.500, 641.500, 111.000, 109.000, 90.0002 (ellipse): 0.643; ; RotatedRect: 3097.500, 3104.000, 14.000, 13.000, 90.0001 (rectangle): 0.483; ; RotatedRect: 2800.000, 188.500, 255.000, 132.000, 90.0001 (rectangle): 0.401; ; 2; [1,1280,1,1]; ellipse: 0.643, 331; rectangle: 0.483, 48; rectangle: 0.401, 758; "
+          "1535, 585, 1662, 697, 2 (ellipse): 0.643, 9822, RotatedRect: 1598.500 641.500 111.000 109.000 90.000; 3091, 3097, 3105, 3112, 1 (rectangle): 0.483, 197, RotatedRect: 3097.500 3104.000 14.000 13.000 90.000; 2734, 60, 2867, 324, 1 (rectangle): 0.401, 30622, RotatedRect: 2800.000 188.500 255.000 132.000 90.000; 2; [1,1280,1,1]; ellipse: 0.643, 331; rectangle: 0.483, 48; rectangle: 0.401, 758; "
         ]
       }
     ]
@@ -351,7 +351,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "0, 0, 640, 426, 1 (Anomaly): 0.854; [1,1,426,640]; pred_mask min:0.000 max:1.000;[0]"
+          "anomaly_map min:151 max:255;pred_score:1.0;pred_label:Anomaly;pred_mask min:1 max:1;"
         ]
       }
     ]
@@ -363,7 +363,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "0, 0, 640, 426, 1 (Anomaly): 0.854; [1,1,426,640]; 1 (Anomaly): 0.854[640 x 426 from (0, 0)] min:0.000 max:1.000;[0]"
+          "anomaly_map min:124 max:225;pred_score:0.9;pred_label:Anomaly;pred_mask min:0 max:1;"
         ]
       }
     ]
@@ -374,7 +374,7 @@
     "test_data": [
       {
         "image": "coco128/images/train2017/000000000074.jpg",
-        "reference": ["0, 0, 640, 426, 3 (cat): 0.648; [0]; [0], [10]"]
+        "reference": ["3 (cat): 0.648, [0], [0], [10]"]
       }
     ]
   },
@@ -439,7 +439,7 @@
       {
         "image": "coco128/images/train2017/000000000074.jpg",
         "reference": [
-          "background: 1.404, 311, background: 1.397, 44, background: 1.371, 34, background: 1.377, 12, background: 1.356, 155, background: 1.345, 12, background: 1.183, 219, background: 1.524, 8, background: 1.533, 4, background: 1.519, 2, background: 1.524, 4, background: 1.530, 6, background: 1.537, 2, background: 1.514, 4, background: 1.519, 8, background: 1.529, 6, background: 1.550, 6, background: 1.558, 4, background: 1.520, 2, background: 1.529, 4, background: 1.532, 6, background: 1.535, 6, background: 1.530, 2, background: 1.529, 50, background: 1.528, 22, background: 1.527, 38, background: 1.451, 1476, background: 1.345, 2743, background: 1.609, 2987, background: 1.636, 29909, "
+          "0: 0.272, 1: 0.728, [3500,3500,5], [0], [0]; background: 1.404, 311, background: 1.397, 44, background: 1.371, 34, background: 1.377, 12, background: 1.356, 155, background: 1.345, 12, background: 1.183, 219, background: 1.524, 8, background: 1.533, 4, background: 1.519, 2, background: 1.524, 4, background: 1.530, 6, background: 1.537, 2, background: 1.514, 4, background: 1.519, 8, background: 1.529, 6, background: 1.550, 6, background: 1.558, 4, background: 1.520, 2, background: 1.529, 4, background: 1.532, 6, background: 1.535, 6, background: 1.530, 2, background: 1.529, 50, background: 1.528, 22, background: 1.527, 38, background: 1.451, 1476, background: 1.345, 2743, background: 1.609, 2987, background: 1.636, 29909, "
         ]
       }
     ]

From f0c424cc4e582517aee95558606bb75e5dde194d Mon Sep 17 00:00:00 2001
From: Ronald Hecker <ronald.hecker@intel.com>
Date: Wed, 14 May 2025 12:51:26 +0200
Subject: [PATCH 15/16] Fix precommit tests

---
 tests/cpp/precommit/test_model_config.cpp | 40 +++++++++++------------
 tests/cpp/precommit/test_sanity.cpp       | 10 +++---
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/tests/cpp/precommit/test_model_config.cpp b/tests/cpp/precommit/test_model_config.cpp
index 1f68e37e..5053b670 100644
--- a/tests/cpp/precommit/test_model_config.cpp
+++ b/tests/cpp/precommit/test_model_config.cpp
@@ -122,14 +122,14 @@ TEST_P(ClassificationModelParameterizedTestSaveLoad, TestClassificationCorrectne
     auto ov_model = model->getModel();
     ov::serialize(ov_model, TMP_MODEL_FILE);
 
-    auto result = model->infer(image)->topLabels;
+    auto result = model->infer(image)->boxes;
 
     auto model_restored = ClassificationModel::create_model(TMP_MODEL_FILE, {}, preload, "CPU");
     auto result_data = model_restored->infer(image);
-    auto result_restored = result_data->topLabels;
+    auto result_restored = result_data->boxes;
 
-    EXPECT_EQ(result_restored[0].id, result[0].id);
-    EXPECT_EQ(result_restored[0].score, result[0].score);
+    EXPECT_EQ(result_restored[0].labels[0].label.id, result[0].labels[0].label.id);
+    EXPECT_EQ(result_restored[0].labels[0].score, result[0].labels[0].score);
 }
 
 TEST_P(ClassificationModelParameterizedTestSaveLoad, TestClassificationCorrectnessAfterSaveLoadWithAdapter) {
@@ -143,15 +143,15 @@ TEST_P(ClassificationModelParameterizedTestSaveLoad, TestClassificationCorrectne
     auto model = ClassificationModel::create_model(DATA_DIR + "/" + model_path, {}, preload, "CPU");
     auto ov_model = model->getModel();
     ov::serialize(ov_model, TMP_MODEL_FILE);
-    auto result = model->infer(image)->topLabels;
+    auto result = model->infer(image)->boxes;
 
     std::shared_ptr<InferenceAdapter> adapter = std::make_shared<MockAdapter>(TMP_MODEL_FILE);
     auto model_restored = ClassificationModel::create_model(adapter);
     auto result_data = model_restored->infer(image);
-    auto result_restored = result_data->topLabels;
+    auto result_restored = result_data->boxes;
 
-    EXPECT_EQ(result_restored[0].id, result[0].id);
-    EXPECT_EQ(result_restored[0].score, result[0].score);
+    EXPECT_EQ(result_restored[0].labels[0].label.id, result[0].labels[0].label.id);
+    EXPECT_EQ(result_restored[0].labels[0].score, result[0].labels[0].score);
 }
 
 TEST_P(SSDModelParameterizedTest, TestDetectionDefaultConfig) {
@@ -206,7 +206,7 @@ TEST_P(DetectionModelParameterizedTestSaveLoad, TestDetctionCorrectnessAfterSave
     auto ov_model = model->getModel();
     ov::serialize(ov_model, TMP_MODEL_FILE);
 
-    auto result = model->infer(image)->objects;
+    auto result = model->infer(image)->boxes;
 
     image = cv::imread(DATA_DIR + "/" + IMAGE_PATH);
     if (!image.data) {
@@ -214,15 +214,15 @@ TEST_P(DetectionModelParameterizedTestSaveLoad, TestDetctionCorrectnessAfterSave
     }
     auto model_restored = DetectionModel::create_model(TMP_MODEL_FILE, {}, "", preload, "CPU");
     auto result_data = model_restored->infer(image);
-    auto result_restored = result_data->objects;
+    auto result_restored = result_data->boxes;
 
     ASSERT_EQ(result.size(), result_restored.size());
 
     for (size_t i = 0; i < result.size(); i++) {
-        ASSERT_EQ(result[i].x, result_restored[i].x);
-        ASSERT_EQ(result[i].y, result_restored[i].y);
-        ASSERT_EQ(result[i].width, result_restored[i].width);
-        ASSERT_EQ(result[i].height, result_restored[i].height);
+        ASSERT_EQ(result[i].shape.x, result_restored[i].shape.x);
+        ASSERT_EQ(result[i].shape.y, result_restored[i].shape.y);
+        ASSERT_EQ(result[i].shape.width, result_restored[i].shape.width);
+        ASSERT_EQ(result[i].shape.height, result_restored[i].shape.height);
     }
 }
 
@@ -237,7 +237,7 @@ TEST_P(DetectionModelParameterizedTestSaveLoad, TestDetctionCorrectnessAfterSave
     auto model = DetectionModel::create_model(DATA_DIR + "/" + model_path, {}, "", preload, "CPU");
     auto ov_model = model->getModel();
     ov::serialize(ov_model, TMP_MODEL_FILE);
-    auto result = model->infer(image)->objects;
+    auto result = model->infer(image)->boxes;
 
     image = cv::imread(DATA_DIR + "/" + IMAGE_PATH);
     if (!image.data) {
@@ -247,15 +247,15 @@ TEST_P(DetectionModelParameterizedTestSaveLoad, TestDetctionCorrectnessAfterSave
     std::shared_ptr<InferenceAdapter> adapter = std::make_shared<MockAdapter>(TMP_MODEL_FILE);
     auto model_restored = DetectionModel::create_model(adapter);
     auto result_data = model_restored->infer(image);
-    auto result_restored = result_data->objects;
+    auto result_restored = result_data->boxes;
 
     ASSERT_EQ(result.size(), result_restored.size());
 
     for (size_t i = 0; i < result.size(); i++) {
-        ASSERT_EQ(result[i].x, result_restored[i].x);
-        ASSERT_EQ(result[i].y, result_restored[i].y);
-        ASSERT_EQ(result[i].width, result_restored[i].width);
-        ASSERT_EQ(result[i].height, result_restored[i].height);
+        ASSERT_EQ(result[i].shape.x, result_restored[i].shape.x);
+        ASSERT_EQ(result[i].shape.y, result_restored[i].shape.y);
+        ASSERT_EQ(result[i].shape.width, result_restored[i].shape.width);
+        ASSERT_EQ(result[i].shape.height, result_restored[i].shape.height);
     }
 }
 
diff --git a/tests/cpp/precommit/test_sanity.cpp b/tests/cpp/precommit/test_sanity.cpp
index 9aa5a5c6..1a50e68e 100644
--- a/tests/cpp/precommit/test_sanity.cpp
+++ b/tests/cpp/precommit/test_sanity.cpp
@@ -73,17 +73,17 @@ TEST_P(ModelParameterizedTest, SynchronousInference) {
         bool preload = true;
         auto model = DetectionModel::create_model(DATA_DIR + "/" + model_path, {}, "", preload, "CPU");
         auto result = model->infer(image);
-        EXPECT_GT(result->objects.size(), 0);
+        EXPECT_GT(result->boxes.size(), 0);
     } else if ("ClassificationModel" == GetParam().type) {
         bool preload = true;
         auto model = ClassificationModel::create_model(DATA_DIR + "/" + model_path, {}, preload, "CPU");
-        std::unique_ptr<ClassificationResult> result = model->infer(image);
-        ASSERT_GT(result->topLabels.size(), 0);
-        EXPECT_GT(result->topLabels.front().score, 0.0f);
+        auto result = model->infer(image);
+        ASSERT_GT(result->boxes.size(), 0);
+        EXPECT_GT(result->boxes.front().labels.front().score, 0.0f);
     } else if ("SegmentationModel" == GetParam().type) {
         bool preload = true;
         auto model = SegmentationModel::create_model(DATA_DIR + "/" + model_path, {}, preload, "CPU");
-        auto result = model->infer(image)->asRef<ImageResultWithSoftPrediction>();
+        auto result = model->infer(image);
         ASSERT_GT(model->getContours(result).size(), 0);
     }
 }

From 6b9ccc43ebd439d07da182f346c673c6f96bc537 Mon Sep 17 00:00:00 2001
From: "Hecker, Ronald" <ronald.hecker@intel.com>
Date: Wed, 21 May 2025 13:18:30 +0200
Subject: [PATCH 16/16] Remove ResultBase and clean up label

---
 src/cpp/models/include/models/results.h       | 56 +++++--------------
 src/cpp/models/src/anomaly_model.cpp          |  4 +-
 src/cpp/models/src/classification_model.cpp   |  8 +--
 src/cpp/models/src/detection_model_ssd.cpp    |  4 +-
 src/cpp/models/src/detection_model_yolo.cpp   |  6 +-
 .../src/detection_model_yolov3_onnx.cpp       |  2 +-
 src/cpp/models/src/detection_model_yolox.cpp  |  2 +-
 src/cpp/models/src/instance_segmentation.cpp  |  4 +-
 src/cpp/models/src/segmentation_model.cpp     |  4 +-
 src/cpp/tilers/src/detection.cpp              |  2 +-
 src/cpp/tilers/src/instance_segmentation.cpp  |  4 +-
 src/cpp/tilers/src/semantic_segmentation.cpp  |  4 +-
 12 files changed, 33 insertions(+), 67 deletions(-)

diff --git a/src/cpp/models/include/models/results.h b/src/cpp/models/include/models/results.h
index 883efd9f..fa93b9ee 100644
--- a/src/cpp/models/include/models/results.h
+++ b/src/cpp/models/include/models/results.h
@@ -17,33 +17,16 @@
 
 struct MetaData;
 
-struct ResultBase {
-    ResultBase(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+struct InferenceResult {
+    InferenceResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
         : frameId(frameId),
           metaData(metaData) {}
-    virtual ~ResultBase() {}
 
+    std::shared_ptr<InternalModelData> internalModelData;
+    std::map<std::string, ov::Tensor> outputsData;
     int64_t frameId;
-
     std::shared_ptr<MetaData> metaData;
-    bool IsEmpty() {
-        return frameId < 0;
-    }
-
-    template <class T>
-    T& asRef() {
-        return dynamic_cast<T&>(*this);
-    }
-
-    template <class T>
-    const T& asRef() const {
-        return dynamic_cast<const T&>(*this);
-    }
-};
 
-struct InferenceResult : public ResultBase {
-    std::shared_ptr<InternalModelData> internalModelData;
-    std::map<std::string, ov::Tensor> outputsData;
 
     /// Returns the first output tensor
     /// This function is a useful addition to direct access to outputs list as many models have only one output
@@ -89,35 +72,22 @@ struct DetectedKeypoints {
 class Label {
 public:
     Label() {}
-    Label(int id, std::string name):  id(id), name(name) {}
+    Label(int id, std::string name, float score): id(id), name(name), score(score) {}
 
     int id;
     std::string name;
-
-    friend std::ostream& operator<< (std::ostream& os, const Label& label) {
-        return os << label.id << " (" << label.name << ")";
-    }
-};
-
-class LabelScore {
-public:
-    LabelScore() {}
-    LabelScore(int id, std::string name, float score): label(Label(id, name)), score(score) {}
-    LabelScore(Label label, float score):  label(label), score(score) {}
-
-    Label label;
     float score;
 
-    friend std::ostream& operator<< (std::ostream& os, const LabelScore& label) {
-        return os << label.label << ": " << std::fixed << std::setprecision(3) << label.score;
+    friend std::ostream& operator<< (std::ostream& os, const Label& label) {
+        return os << label.id << " (" << label.name << ")" << ": " << std::fixed << std::setprecision(3) << label.score;
     }
 };
 
 class Mask {
 public:
-    Mask(LabelScore label, cv::Rect roi, cv::Mat mask): label(label), roi(roi), mask(mask) {}
+    Mask(Label label, cv::Rect roi, cv::Mat mask): label(label), roi(roi), mask(mask) {}
 
-    LabelScore label;
+    Label label;
     cv::Rect roi;
     cv::Mat mask;
 
@@ -152,16 +122,16 @@ static inline std::vector<Contour> getContours(const std::vector<Mask>& segmente
         if (contours.size() != 1) {
             throw std::runtime_error("findContours() must have returned only one contour");
         }
-        combined_contours.push_back({obj.label.label.name, obj.label.score, contours[0]});
+        combined_contours.push_back({obj.label.name, obj.label.score, contours[0]});
     }
     return combined_contours;
 }
 
 class Box {
 public:
-    Box(cv::Rect shape, std::vector<LabelScore> labels): shape(shape), labels(labels) {}
+    Box(cv::Rect shape, std::vector<Label> labels): shape(shape), labels(labels) {}
     cv::Rect shape;
-    std::vector<LabelScore> labels;
+    std::vector<Label> labels;
 
     friend std::ostream& operator<< (std::ostream& os, const Box& box) {
 
@@ -189,7 +159,7 @@ class Box {
 
 class RotatedRect {
 public:
-    LabelScore label;
+    Label label;
     cv::RotatedRect shape;
 
     friend std::ostream& operator<< (std::ostream& os, const RotatedRect& box) {
diff --git a/src/cpp/models/src/anomaly_model.cpp b/src/cpp/models/src/anomaly_model.cpp
index e1c61a92..566f00c2 100644
--- a/src/cpp/models/src/anomaly_model.cpp
+++ b/src/cpp/models/src/anomaly_model.cpp
@@ -86,7 +86,7 @@ std::unique_ptr<Scene> AnomalyModel::postprocess(InferenceResult& infResult) {
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
 
     scene->saliency_maps.push_back(anomaly_map);
-    auto label = LabelScore(label_id, pred_label, pred_score);
+    auto label = Label(label_id, pred_label, pred_score);
     auto roi = cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight);
     scene->masks.push_back(Mask(label, roi, pred_mask));
     scene->boxes.push_back(Box(roi, {label}));
@@ -100,7 +100,7 @@ std::unique_ptr<Scene> AnomalyModel::postprocess(InferenceResult& infResult) {
             scene->boxes.push_back(
                 Box(
                     rect,
-                    {LabelScore(label_id, pred_label, box_score)}
+                    {Label(label_id, pred_label, box_score)}
                 )
             );
 
diff --git a/src/cpp/models/src/classification_model.cpp b/src/cpp/models/src/classification_model.cpp
index bf8e3840..65a043a5 100644
--- a/src/cpp/models/src/classification_model.cpp
+++ b/src/cpp/models/src/classification_model.cpp
@@ -328,7 +328,7 @@ std::unique_ptr<Scene> ClassificationModel::get_multilabel_predictions(Inference
 
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
     auto raw_scores = ov::Tensor();
-    std::vector<LabelScore> result;
+    std::vector<Label> result;
     float* raw_scoresPtr = nullptr;
     if (add_raw_scores) {
         raw_scores = ov::Tensor(logitsTensor.get_element_type(), logitsTensor.get_shape());
@@ -401,9 +401,9 @@ std::unique_ptr<Scene> ClassificationModel::get_hierarchical_predictions(Inferen
     }
 
     auto resolved_labels = resolver->resolve_labels(predicted_labels, predicted_scores);
-    std::vector<LabelScore> result;
+    std::vector<Label> result;
     for (const auto& label : resolved_labels) {
-        result.push_back(LabelScore(hierarchical_info.label_to_idx[label.first], label.first, label.second));
+        result.push_back(Label(hierarchical_info.label_to_idx[label.first], label.first, label.second));
     }
     const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
     cv::Rect shape(0, 0, internalData.inputImgWidth, internalData.inputImgHeight);
@@ -449,7 +449,7 @@ std::unique_ptr<Scene> ClassificationModel::get_multiclass_predictions(Inference
         scene->additional_tensors["raw_scores"] = raw_scores;
     }
 
-    std::vector<LabelScore> result;
+    std::vector<Label> result;
     for (size_t i = 0; i < scoresTensor.get_size(); ++i) {
         int ind = indicesPtr[i];
         if (ind < 0 || ind >= static_cast<int>(labels.size())) {
diff --git a/src/cpp/models/src/detection_model_ssd.cpp b/src/cpp/models/src/detection_model_ssd.cpp
index 69299997..a25b0371 100644
--- a/src/cpp/models/src/detection_model_ssd.cpp
+++ b/src/cpp/models/src/detection_model_ssd.cpp
@@ -163,7 +163,7 @@ std::unique_ptr<Scene> ModelSSD::postprocessSingleOutput(InferenceResult& infRes
                         0.f,
                         floatInputImgHeight) - y
                 ),
-                {LabelScore(labelID, getLabelName(labelID), confidence)}
+                {Label(labelID, getLabelName(labelID), confidence)}
             );
             scene->boxes.push_back(box);
         }
@@ -223,7 +223,7 @@ std::unique_ptr<Scene> ModelSSD::postprocessMultipleOutputs(InferenceResult& inf
             if (width * height >= box_area_threshold) {
                 scene->boxes.push_back(Box(
                   cv::Rect(x, y, width, height),
-                  {LabelScore(labels[i], getLabelName(labels[i]), confidence)}
+                  {Label(labels[i], getLabelName(labels[i]), confidence)}
                 ));
             }
         }
diff --git a/src/cpp/models/src/detection_model_yolo.cpp b/src/cpp/models/src/detection_model_yolo.cpp
index 1ca92984..f9e3d2f2 100644
--- a/src/cpp/models/src/detection_model_yolo.cpp
+++ b/src/cpp/models/src/detection_model_yolo.cpp
@@ -282,7 +282,7 @@ std::unique_ptr<Scene> ModelYolo::postprocess(InferenceResult& infResult) {
         for (const auto& obj1 : objects) {
             bool isGoodResult = true;
             for (const auto& obj2 : objects) {
-                if (obj1.labels[0].label.id == obj2.labels[0].label.id && obj1.labels[0].score < obj2.labels[0].score &&
+                if (obj1.labels[0].id == obj2.labels[0].id && obj1.labels[0].score < obj2.labels[0].score &&
                     intersectionOverUnion(obj1, obj2) >= iou_threshold) {  // if obj1 is the same as obj2, condition
                                                                            // expression will evaluate to false anyway
                     isGoodResult = false;
@@ -410,7 +410,7 @@ void ModelYolo::parseYOLOOutput(const std::string& output_name,
 
                     //--- Checking confidence threshold conformance and adding region to the list
                     if (prob >= confidence_threshold) {
-                        objects.push_back(Box(obj, {LabelScore(j, getLabelName(j), prob)}));
+                        objects.push_back(Box(obj, {Label(j, getLabelName(j), prob)}));
                     }
                 }
             }
@@ -635,7 +635,7 @@ std::unique_ptr<Scene> YOLOv5::postprocess(InferenceResult& infResult) {
 
         scene->boxes.push_back(Box(
             cv::Rect(x, y, width, height),
-            {LabelScore(labelID, label, confidence)}
+            {Label(labelID, label, confidence)}
         ));
     }
 
diff --git a/src/cpp/models/src/detection_model_yolov3_onnx.cpp b/src/cpp/models/src/detection_model_yolov3_onnx.cpp
index 7827e2b9..a460a318 100644
--- a/src/cpp/models/src/detection_model_yolov3_onnx.cpp
+++ b/src/cpp/models/src/detection_model_yolov3_onnx.cpp
@@ -163,7 +163,7 @@ std::unique_ptr<Scene> ModelYoloV3ONNX::postprocess(InferenceResult& infResult)
             obj.height = clamp(height, 0.f, static_cast<float>(imgHeight));
             obj.width = clamp(width, 0.f, static_cast<float>(imgWidth));
 
-            scene->boxes.push_back(Box(obj, {LabelScore(classInd, getLabelName(classInd), score)}));
+            scene->boxes.push_back(Box(obj, {Label(classInd, getLabelName(classInd), score)}));
         }
     }
     return scene;
diff --git a/src/cpp/models/src/detection_model_yolox.cpp b/src/cpp/models/src/detection_model_yolox.cpp
index fcac9cb4..4b9e1ec7 100644
--- a/src/cpp/models/src/detection_model_yolox.cpp
+++ b/src/cpp/models/src/detection_model_yolox.cpp
@@ -195,7 +195,7 @@ std::unique_ptr<Scene> ModelYoloX::postprocess(InferenceResult& infResult) {
         obj.width =
             clamp(validBoxes[index].right - validBoxes[index].left, 0.f, static_cast<float>(scale.inputImgWidth));
         scene->boxes.push_back(
-            Box(obj, {LabelScore(classes[index], getLabelName(classes[index]), scores[index])})
+            Box(obj, {Label(classes[index], getLabelName(classes[index]), scores[index])})
         );
     }
     return scene;
diff --git a/src/cpp/models/src/instance_segmentation.cpp b/src/cpp/models/src/instance_segmentation.cpp
index a5ebfdc9..624fadad 100644
--- a/src/cpp/models/src/instance_segmentation.cpp
+++ b/src/cpp/models/src/instance_segmentation.cpp
@@ -8,8 +8,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <fstream>
-#include <limits>
 #include <opencv2/core.hpp>
 #include <opencv2/imgproc.hpp>
 #include <openvino/openvino.hpp>
@@ -322,7 +320,7 @@ std::unique_ptr<Scene> MaskRCNNModel::postprocess(InferenceResult& infResult) {
             continue;
         }
 
-        LabelScore label(labelID, getLabelName(labelID), confidence);
+        Label label(labelID, getLabelName(labelID), confidence);
 
         cv::Rect roi;
 
diff --git a/src/cpp/models/src/segmentation_model.cpp b/src/cpp/models/src/segmentation_model.cpp
index 197c923b..4c722385 100644
--- a/src/cpp/models/src/segmentation_model.cpp
+++ b/src/cpp/models/src/segmentation_model.cpp
@@ -261,7 +261,7 @@ std::unique_ptr<Scene> SegmentationModel::postprocess(InferenceResult& infResult
 
     auto scene = std::make_unique<Scene>(infResult.frameId, infResult.metaData);
     auto roi = cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight);
-    scene->masks.push_back(Mask(LabelScore(0, "hard_prediction", 0), roi, hard_prediction));
+    scene->masks.push_back(Mask(Label(0, "hard_prediction", 0), roi, hard_prediction));
     if (return_soft_prediction) {
         cv::resize(soft_prediction,
                    soft_prediction,
@@ -270,7 +270,7 @@ std::unique_ptr<Scene> SegmentationModel::postprocess(InferenceResult& infResult
                    0.0,
                    cv::INTER_NEAREST);
 
-        scene->masks.push_back(Mask(LabelScore(1, "soft_prediction", 0), roi, soft_prediction));
+        scene->masks.push_back(Mask(Label(1, "soft_prediction", 0), roi, soft_prediction));
         auto iter = infResult.outputsData.find(feature_vector_name);
         if (infResult.outputsData.end() != iter) {
             scene->saliency_maps.push_back(get_activation_map(soft_prediction));
diff --git a/src/cpp/tilers/src/detection.cpp b/src/cpp/tilers/src/detection.cpp
index 00070af5..94fa6827 100644
--- a/src/cpp/tilers/src/detection.cpp
+++ b/src/cpp/tilers/src/detection.cpp
@@ -66,7 +66,7 @@ std::unique_ptr<Scene> DetectionTiler::merge_results(const std::vector<std::uniq
 
     for (const auto& result : tiles_results) {
         for (auto& det : result->boxes) {
-            all_detections.emplace_back(det.shape.x, det.shape.y, det.shape.x + det.shape.width, det.shape.y + det.shape.height, det.labels[0].label.id);
+            all_detections.emplace_back(det.shape.x, det.shape.y, det.shape.x + det.shape.width, det.shape.y + det.shape.height, det.labels[0].id);
             all_scores.push_back(det.labels[0].score);
             all_detections_refs.push_back(det);
         }
diff --git a/src/cpp/tilers/src/instance_segmentation.cpp b/src/cpp/tilers/src/instance_segmentation.cpp
index 4777b465..93b9e439 100644
--- a/src/cpp/tilers/src/instance_segmentation.cpp
+++ b/src/cpp/tilers/src/instance_segmentation.cpp
@@ -13,8 +13,6 @@
 #include <utils/nms.hpp>
 #include <vector>
 
-#include "utils/common.hpp"
-
 namespace {
 class MaskRCNNModelParamsSetter {
 public:
@@ -76,7 +74,7 @@ std::unique_ptr<Scene> InstanceSegmentationTiler::merge_results(
 
     for (const auto& result : tiles_results) {
         for (auto& det : result->masks) {
-            all_detections.emplace_back(det.roi.x, det.roi.y, det.roi.x + det.roi.width, det.roi.y + det.roi.height, det.label.label.id);
+            all_detections.emplace_back(det.roi.x, det.roi.y, det.roi.x + det.roi.width, det.roi.y + det.roi.height, det.label.id);
             all_scores.push_back(det.label.score);
             all_detections_ptrs.push_back(det);
         }
diff --git a/src/cpp/tilers/src/semantic_segmentation.cpp b/src/cpp/tilers/src/semantic_segmentation.cpp
index 3a866bf1..5671d043 100644
--- a/src/cpp/tilers/src/semantic_segmentation.cpp
+++ b/src/cpp/tilers/src/semantic_segmentation.cpp
@@ -90,7 +90,7 @@ std::unique_ptr<Scene> SemanticSegmentationTiler::merge_results(
 
     auto scene = std::make_unique<Scene>();
     auto roi = cv::Rect(0, 0, image_size.width, image_size.height);
-    scene->masks.push_back(Mask(LabelScore(0, "hard_prediction", 0), roi, hard_prediction));
-    scene->masks.push_back(Mask(LabelScore(0, "soft_prediction", 0), roi, merged_soft_prediction));
+    scene->masks.push_back(Mask(Label(0, "hard_prediction", 0), roi, hard_prediction));
+    scene->masks.push_back(Mask(Label(0, "soft_prediction", 0), roi, merged_soft_prediction));
     return scene;
 }