Commit 2261aca: Implement suggestions
Parent: c04833a

13 files changed: +67 additions, −215 deletions

bindings/python/src/pipeline/CommonBindings.cpp

Lines changed: 15 additions & 11 deletions
@@ -100,16 +100,22 @@ void CommonBindings::bind(pybind11::module& m, void* pCallstack) {
     ///////////////////////////////////////////////////////////////////////

     keypoint.def(py::init<>())
-        .def(py::init<Point3f, float, uint32_t>(), py::arg("coordinates"), py::arg("confidence") = 0.f, py::arg("label") = 0)
-        .def(py::init<Point2f, float, uint32_t>(), py::arg("coordinates"), py::arg("confidence") = 0.f, py::arg("label") = 0)
-        .def(py::init<float, float, float, float, uint32_t>(), py::arg("x"), py::arg("y"), py::arg("z"), py::arg("confidence") = 0.f, py::arg("label") = 0)
-        .def_readwrite("imageCoordinates", &Keypoint::imageCoordinates)
-        .def_readwrite("confidence", &Keypoint::confidence)
-        .def_readwrite("label", &Keypoint::label);
+        .def(py::init<Point3f, float, uint32_t>(), py::arg("coordinates"), py::arg("confidence") = 0.f, py::arg("label") = 0, DOC(dai, Keypoint, Keypoint))
+        .def(py::init<Point2f, float, uint32_t>(), py::arg("coordinates"), py::arg("confidence") = 0.f, py::arg("label") = 0, DOC(dai, Keypoint, Keypoint))
+        .def(py::init<float, float, float, float, uint32_t>(),
+             py::arg("x"),
+             py::arg("y"),
+             py::arg("z"),
+             py::arg("confidence") = 0.f,
+             py::arg("label") = 0,
+             DOC(dai, Keypoint, Keypoint))
+        .def_readwrite("imageCoordinates", &Keypoint::imageCoordinates, DOC(dai, Keypoint, imageCoordinates))
+        .def_readwrite("confidence", &Keypoint::confidence, DOC(dai, Keypoint, confidence))
+        .def_readwrite("label", &Keypoint::label, DOC(dai, Keypoint, label));

     keypointsList.def(py::init<>())
-        .def(py::init<std::vector<Keypoint>, std::vector<Edge>>(), py::arg("keypoints"), py::arg("edges"))
-        .def(py::init<std::vector<Keypoint>>(), py::arg("keypoints"))
+        .def(py::init<std::vector<Keypoint>, std::vector<Edge>>(), py::arg("keypoints"), py::arg("edges"), DOC(dai, KeypointsListT, KeypointsListT))
+        .def(py::init<std::vector<Keypoint>>(), py::arg("keypoints"), DOC(dai, KeypointsListT, KeypointsListT))
         .def(
             "setKeypoints",
             [](KeypointsList& self, const std::vector<Keypoint>& kps) { self.Base::setKeypoints(kps); },

@@ -422,14 +428,12 @@ void CommonBindings::bind(pybind11::module& m, void* pCallstack) {
         .def_readwrite("anchors", &DetectionParserOptions::anchors)
         .def_readwrite("anchorMasks", &DetectionParserOptions::anchorMasks)
         .def_readwrite("iouThreshold", &DetectionParserOptions::iouThreshold)
-        .def_readwrite("inputWidth", &DetectionParserOptions::inputWidth)
-        .def_readwrite("inputHeight", &DetectionParserOptions::inputHeight)
         .def_readwrite("decodingFamily", &DetectionParserOptions::decodingFamily)
         .def_readwrite("keypointEdges", &DetectionParserOptions::keypointEdges)
         .def_readwrite("anchorsV2", &DetectionParserOptions::anchorsV2)
         .def_readwrite("decodeKeypoints", &DetectionParserOptions::decodeKeypoints)
         .def_readwrite("numKeypoints", &DetectionParserOptions::nKeypoints)
-        .def_readwrite("outputNames", &DetectionParserOptions::outputNames);
+        .def_readwrite("outputNames", &DetectionParserOptions::outputNamesToUse);

     cameraExposureOffset.value("START", CameraExposureOffset::START).value("MIDDLE", CameraExposureOffset::MIDDLE).value("END", CameraExposureOffset::END);
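
The rebound constructors mirror the C++ dai::Keypoint class one-to-one, so the same signatures are usable from C++. A minimal sketch, assuming the usual depthai umbrella header:

#include <depthai/depthai.hpp>  // assumed umbrella header

int main() {
    // 3D-coordinate constructor, matching py::init<Point3f, float, uint32_t>
    dai::Keypoint kp(dai::Point3f(0.4f, 0.2f, 1.5f), /*confidence=*/0.9f, /*label=*/3);
    // Fields stay read-write; this commit only attaches DOC() strings to them.
    kp.confidence = 0.95f;
    return 0;
}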

bindings/python/src/pipeline/datatype/ImgDetectionsBindings.cpp

Lines changed: 4 additions & 5 deletions
@@ -125,13 +125,12 @@ void bind_imgdetections(pybind11::module& m, void* pCallstack) {
         .def("setTransformation", [](ImgDetections& msg, const std::optional<ImgTransformation>& transformation) { msg.transformation = transformation; })
         .def("getSegmentationMaskWidth", &ImgDetections::getSegmentationMaskWidth, DOC(dai, ImgDetectionsT, getSegmentationMaskWidth))
         .def("getSegmentationMaskHeight", &ImgDetections::getSegmentationMaskHeight, DOC(dai, ImgDetectionsT, getSegmentationMaskHeight))
-        .def("setMask", &ImgDetections::setMask, py::arg("mask"), py::arg("width"), py::arg("height"), DOC(dai, ImgDetectionsT, setMask))
+        .def(
+            "setMask", &ImgDetections::setSegmentationMask, py::arg("mask"), py::arg("width"), py::arg("height"), DOC(dai, ImgDetectionsT, setSegmentationMask))
         .def("getMaskData", &ImgDetections::getMaskData, DOC(dai, ImgDetectionsT, getMaskData))
-        .def("getSegmentationMaskAsImgFrame", &ImgDetections::getSegmentationMaskAsImgFrame, DOC(dai, ImgDetectionsT, getSegmentationMaskAsImgFrame))
+        .def("getSegmentationMaskAsImgFrame", &ImgDetections::getSegmentationMask, DOC(dai, ImgDetectionsT, getSegmentationMask))
 #ifdef DEPTHAI_HAVE_OPENCV_SUPPORT
-        .def(
-            "getSegmentationMask", [](ImgDetections& self) { return self.getSegmentationMask(false); }, DOC(dai, ImgDetectionsT, getSegmentationMask))
-        .def("setSegmentationMask", &ImgDetections::setSegmentationMask, py::arg("mask"), DOC(dai, ImgDetectionsT, setSegmentationMask))
+        .def("setSegmentationMask", &ImgDetections::setCvSegmentationMask, py::arg("mask"), DOC(dai, ImgDetectionsT, setCvSegmentationMask))
         .def(
             "getCvSegmentationMask",
             [](ImgDetections& self) { return self.getCvSegmentationMask(&g_numpyAllocator); },

bindings/python/src/pipeline/node/DetectionParserBindings.cpp

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ void bind_detectionparser(pybind11::module& m, void* pCallstack) {
         .def("setSubtype", &DetectionParser::setSubtype, py::arg("subtype"), DOC(dai, node, DetectionParser, setSubtype))
         .def("setDecodeKeypoints", &DetectionParser::setDecodeKeypoints, py::arg("decode"), DOC(dai, node, DetectionParser, setDecodeKeypoints))
         .def("setDecodeSegmentation", &DetectionParser::setDecodeSegmentation, py::arg("decode"), DOC(dai, node, DetectionParser, setDecodeSegmentation))
-        .def("setNKeypoints", &DetectionParser::setNKeypoints, py::arg("nKeypoints"), DOC(dai, node, DetectionParser, setNKeypoints))
+        .def("setNumKeypoints", &DetectionParser::setNumKeypoints, py::arg("numKeypoints"), DOC(dai, node, DetectionParser, setNumKeypoints))
         .def("setClasses", &DetectionParser::setClasses, py::arg("classes"), DOC(dai, node, DetectionParser, setClasses))
         .def("setStrides", &DetectionParser::setStrides, py::arg("strides"), DOC(dai, node, DetectionParser, setStrides))
         .def("setKeypointEdges", &DetectionParser::setKeypointEdges, py::arg("edges"), DOC(dai, node, DetectionParser, setKeypointEdges))

examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp

Lines changed: 1 addition & 2 deletions
@@ -24,7 +24,6 @@ int main() {
     dai::NNModelDescription modelDescription;
     modelDescription.model = "luxonis/yolov8-large-pose-estimation:coco-640x352:1868e39";
     detectionNetwork->build(cameraNode, modelDescription);
-    auto labelMap = detectionNetwork->getClasses();

     // Create output queues
     auto qRgb = detectionNetwork->passthrough.createOutputQueue();

@@ -62,7 +61,7 @@ int main() {
         auto bbox = frameNorm(frame, dai::Point2f(detection.xmin, detection.ymin), dai::Point2f(detection.xmax, detection.ymax));

         // Draw label
-        cv::putText(frame, labelMap.value()[detection.label], cv::Point(bbox.x + 10, bbox.y + 20), cv::FONT_HERSHEY_TRIPLEX, 0.5, textColor);
+        cv::putText(frame, detection.labelName, cv::Point(bbox.x + 10, bbox.y + 20), cv::FONT_HERSHEY_TRIPLEX, 0.5, textColor);

         // Draw confidence
         cv::putText(frame,
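
The examples now read the class name straight off each detection instead of indexing a label map fetched via getClasses(). The pattern in isolation, as a sketch with a hypothetical helper:

#include <iostream>

#include <depthai/depthai.hpp>  // assumed umbrella header

// Hypothetical helper: print each detection using its own labelName.
void printDetections(const dai::ImgDetections& dets) {
    for(const auto& d : dets.detections) {
        // labelName travels with the message, so there is no
        // std::optional label map to fetch and unwrap.
        std::cout << d.labelName << " (" << d.confidence << ")\n";
    }
}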

examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp

Lines changed: 18 additions & 16 deletions
@@ -4,6 +4,7 @@
 #include <cstddef>
 #include <cstdio>
 #include <iostream>
+#include <map>
 #include <opencv2/core.hpp>
 #include <opencv2/opencv.hpp>

@@ -29,7 +30,6 @@ int main() {
     dai::NNModelDescription modelDescription;
     modelDescription.model = "luxonis/yolov8-instance-segmentation-large:coco-640x480";
     detectionNetwork->build(cameraNode, modelDescription);
-    auto labelMap = detectionNetwork->getClasses();

     // Create output queues
     auto qRgb = detectionNetwork->passthrough.createOutputQueue();

@@ -65,24 +65,30 @@ int main() {
         if(inDet != nullptr) {
             counter++;

-            // Get all labels as sorted list
             auto labels = std::set<int>();
+            std::map<int, std::string> labelNameByIndex;
             for(const auto& detection : inDet->detections) {
                 labels.insert(detection.label);
+                labelNameByIndex.emplace(detection.label, detection.labelName);
+            }
+
+            std::vector<std::string> labelNames;
+            labelNames.reserve(labelNameByIndex.size());
+            for(const auto& label : labels) {
+                const auto it = labelNameByIndex.find(label);
+                if(it != labelNameByIndex.end()) {
+                    labelNames.push_back(it->second);
+                }
             }
             std::list<int> labelsList(labels.begin(), labels.end());
             labelsList.sort();
             std::vector<int> labelsVector(labelsList.begin(), labelsList.end());

-            std::vector<std::string> labelMaps;
-            for(const auto& label : labelsList) {
-                labelMaps.push_back(labelMap->at(label));
-            };
             cv::putText(sidePanel, "Press index to filter by class:", cv::Point(10, 20), cv::FONT_HERSHEY_TRIPLEX, 0.7, cv::Scalar(0, 0, 0), 1);

-            for(size_t i = 0; i < labelMaps.size(); i++) {
+            for(size_t i = 0; i < labelNames.size(); i++) {
                 cv::putText(sidePanel,
-                            std::to_string(i) + " - " + labelMaps[i],
+                            std::to_string(i + 1) + " - " + labelNames[i],
                             cv::Point(10, 40 + static_cast<int>(i) * 20),
                             cv::FONT_HERSHEY_TRIPLEX,
                             0.7,

@@ -98,7 +104,7 @@ int main() {
         } else if(key >= '1' && key <= '9') {
             int index = key - '1';
             if(index < static_cast<int>(labelsList.size())) {
-                std::printf("Filtering by label: %s\n", labelMaps[index].c_str());
+                std::printf("Filtering by label: %s\n", labelNames[index].c_str());
                 filteredLabel = labelsVector[index];
             }
         }

@@ -111,7 +117,7 @@ int main() {
         std::optional<cv::Mat> segmentationMask;

         if(filteredLabel == -1) {
-            segmentationMask = inDet->getSegmentationMask();
+            segmentationMask = inDet->getCvSegmentationMask();
         } else {
             segmentationMask = inDet->getCvSegmentationMaskByClass(filteredLabel);
             detections.erase(

@@ -137,7 +143,7 @@ int main() {
         auto bbox = frameNorm(frame, dai::Point2f(detection.xmin, detection.ymin), dai::Point2f(detection.xmax, detection.ymax));

         // Draw label
-        cv::putText(frame, labelMap.value()[detection.label], cv::Point(bbox.x + 10, bbox.y + 20), cv::FONT_HERSHEY_TRIPLEX, 0.7, textColor);
+        cv::putText(frame, detection.labelName, cv::Point(bbox.x + 10, bbox.y + 20), cv::FONT_HERSHEY_TRIPLEX, 0.7, textColor);

         // Draw confidence
         cv::putText(frame,

@@ -155,13 +161,9 @@ int main() {
             // cv::imshow("side panel", sidePanel);
             // Show the frame
             cv::imshow("rgb", frame);
-
-            auto currentTime = std::chrono::steady_clock::now();
-            float fps = counter / std::chrono::duration<float>(currentTime - startTime).count();
-            std::cout << "FPS: " << fps << std::endl;
            }
        }
    }

    return 0;
-}
+}
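
Side note on the new label collection: since std::map already keeps keys sorted and de-duplicated, the set/list/sort combination above could be collapsed further. A self-contained sketch of the idea (the Detection struct is a stand-in, not the dai type):

#include <map>
#include <string>
#include <utility>
#include <vector>

// Stand-in for dai::ImgDetection; only the two fields used here.
struct Detection {
    int label;
    std::string labelName;
};

// One pass over detections yields (label index, label name) pairs in
// ascending label order -- the same effect as the set + map + sort above.
std::vector<std::pair<int, std::string>> collectLabels(const std::vector<Detection>& detections) {
    std::map<int, std::string> byIndex;
    for(const auto& d : detections) {
        byIndex.emplace(d.label, d.labelName);  // first name seen wins
    }
    return {byIndex.begin(), byIndex.end()};
}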

examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ def displayFrame(name, frame):
     )
     cv2.putText(
         frame,
-        labelMap[detection.label],
+        detection.labelName,
         (bbox[0] + 10, bbox[1] + 20),
         cv2.FONT_HERSHEY_TRIPLEX,
         0.5,

examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,7 @@
 cameraNode = pipeline.create(dai.node.Camera).build()
 detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-instance-segmentation-large:coco-640x480"))
 labelMap = detectionNetwork.getClasses()
-
+assert labelMap is not None
 qRgb = detectionNetwork.passthrough.createOutputQueue()
 qDet = detectionNetwork.out.createOutputQueue()

@@ -37,7 +37,7 @@ def displayFrame(frame):
     )
     cv2.putText(
         frame,
-        labelMap[detection.label],
+        detection.labelName,
         (bbox[0] + 10, bbox[1] + 20),
         cv2.FONT_HERSHEY_TRIPLEX,
         0.7,

include/depthai/common/DetectionParserOptions.hpp

Lines changed: 2 additions & 6 deletions
@@ -20,8 +20,6 @@ struct DetectionParserOptions {
     DetectionNetworkType nnFamily;
     std::string subtype;
     float confidenceThreshold;
-    int inputWidth;
-    int inputHeight;

     /// YOLO specific network properties
     YoloDecodingFamily decodingFamily = YoloDecodingFamily::TLBR;  // top left bottom right anchor free

@@ -35,7 +33,7 @@ struct DetectionParserOptions {
     std::vector<int> strides = {8, 16, 32};
     std::vector<float> anchors;
     std::map<std::string, std::vector<int>> anchorMasks;
-    std::vector<std::string> outputNames;
+    std::vector<std::string> outputNamesToUse;
     /// see YoloDetectionNetwork::setAnchors() for format
     std::vector<std::vector<std::vector<float>>> anchorsV2;
     float iouThreshold;

@@ -46,8 +44,6 @@ DEPTHAI_SERIALIZE_EXT(DetectionParserOptions,
                       nnFamily,
                       subtype,
                       confidenceThreshold,
-                      inputWidth,
-                      inputHeight,
                       decodingFamily,
                       decodeKeypoints,
                       decodeSegmentation,

@@ -58,7 +54,7 @@ DEPTHAI_SERIALIZE_EXT(DetectionParserOptions,
                       strides,
                       anchors,
                       anchorMasks,
-                      outputNames,
+                      outputNamesToUse,
                       anchorsV2,
                       iouThreshold,
                       keypointEdges);
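
Note the asymmetry this leaves: C++ code uses the renamed member directly, while the property in CommonBindings.cpp above is still exposed to Python as "outputNames". A minimal sketch (the output-name strings are made up for illustration):

#include <depthai/common/DetectionParserOptions.hpp>

int main() {
    dai::DetectionParserOptions opts{};
    // Renamed member; Python still sees it as the "outputNames" property.
    opts.outputNamesToUse = {"output1_yolov8", "output2_yolov8"};  // hypothetical names
    return 0;
}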

include/depthai/pipeline/datatype/ImgDetectionsT.hpp

Lines changed: 3 additions & 79 deletions
@@ -8,15 +8,6 @@
 #include "depthai/pipeline/datatype/Buffer.hpp"
 #include "depthai/pipeline/datatype/ImgFrame.hpp"

-#ifdef DEPTHAI_XTENSOR_SUPPORT
-    #include <xtensor/containers/xadapt.hpp>
-    #include <xtensor/containers/xbuffer_adaptor.hpp>
-    #include <xtensor/core/xlayout.hpp>
-    #include <xtensor/core/xmath.hpp>
-    #include <xtensor/core/xtensor_forward.hpp>
-
-#endif
-
 #ifdef DEPTHAI_HAVE_OPENCV_SUPPORT
 #include <opencv2/core/mat.hpp>
 #include <opencv2/opencv.hpp>

@@ -39,42 +30,6 @@ class ImgDetectionsT : public Buffer {
     ImgDetectionsT() = default;
     virtual ~ImgDetectionsT() = default;

-    // Iterator support
-    using value_type = DetectionT;
-    using iterator = typename std::vector<DetectionT>::iterator;
-    using const_iterator = typename std::vector<DetectionT>::const_iterator;
-
-    iterator begin() noexcept {
-        return detections.begin();
-    }
-    iterator end() noexcept {
-        return detections.end();
-    }
-    const_iterator begin() const noexcept {
-        return detections.begin();
-    }
-    const_iterator end() const noexcept {
-        return detections.end();
-    }
-    const_iterator cbegin() const noexcept {
-        return detections.cbegin();
-    }
-    const_iterator cend() const noexcept {
-        return detections.cend();
-    }
-    bool empty() const noexcept {
-        return detections.empty();
-    }
-    size_t size() const noexcept {
-        return detections.size();
-    }
-    value_type& operator[](size_t i) {
-        return detections[i];
-    }
-    const value_type& operator[](size_t i) const {
-        return detections[i];
-    }
-
     /*
      * Common API
      */

@@ -93,38 +48,14 @@ class ImgDetectionsT : public Buffer {
      * Sets the segmentation mask from a vector of bytes, along with width and height.
      * The size of the vector must be equal to width * height.
      */
-    void setMask(const std::vector<std::uint8_t>& mask, size_t width, size_t height);
+    void setSegmentationMask(const std::vector<std::uint8_t>& mask, size_t width, size_t height);

     /*
      * Returns a copy of the segmentation mask data as a vector of bytes. If mask data is not set, returns std::nullopt.
      */
     std::optional<std::vector<std::uint8_t>> getMaskData() const;

-    std::optional<dai::ImgFrame> getSegmentationMaskAsImgFrame() const;
-
-    // Optional - xtensor support
-#ifdef DEPTHAI_XTENSOR_SUPPORT
-    /**
-     * @note This API only available if xtensor support is enabled
-     */
-    using XArray2D = xt::xtensor<std::uint8_t, 2, xt::layout_type::row_major>;
-
-    /**
-     * Returns a copy of the segmentation mask data as a 2D array. If mask data is not set, returns std::nullopt.
-     */
-    std::optional<XArray2D> getTensorSegmentationMask() const;
-
-    /**
-     * Sets the segmentation mask from a 2D xtensor array.
-     */
-    ImgDetectionsT& setTensorSegmentationMask(XArray2D mask);
-
-    /*
-     * Returns a binary mask where pixels belonging to the instance index are set to 1, others to 0. If mask data is not set, returns std::nullopt.
-     */
-    std::optional<XArray2D> getTensorSegmentationMaskByIndex(uint8_t index) const;
-
-#endif
+    std::optional<dai::ImgFrame> getSegmentationMask() const;

     // Optional - OpenCV support
 #ifdef DEPTHAI_HAVE_OPENCV_SUPPORT

@@ -137,14 +68,7 @@ class ImgDetectionsT : public Buffer {
      *
      * @param frame Input cv::Mat frame from which to copy the data
      */
-    ImgDetectionsT& setSegmentationMask(cv::Mat mask);
-
-    /**
-     * Retrieves data as cv::Mat with specified width, height and type. If mask data is not set, returns std::nullopt.
-     *
-     * @param copy If false only a reference to data is made, otherwise a copy
-     */
-    std::optional<cv::Mat> getSegmentationMask(bool copy = false);
+    void setCvSegmentationMask(cv::Mat mask);

     /**
      * Retrieves data as cv::Mat with specified width and height. If mask data is not set, returns std::nullopt.
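
Taken together, the mask API after this commit is setSegmentationMask(bytes, width, height), getMaskData(), and getSegmentationMask() returning an ImgFrame, plus the cv-prefixed pair under OpenCV support; the xtensor variants and the iterator/indexing sugar are gone. A hedged usage sketch, assuming ImgDetections is the concrete alias of ImgDetectionsT:

#include <cstdint>
#include <optional>
#include <vector>

#include <depthai/depthai.hpp>  // assumed umbrella header

int main() {
    dai::ImgDetections dets;

    // Byte-buffer path (always available): one byte per pixel, row-major.
    const int width = 4, height = 2;
    std::vector<std::uint8_t> mask(static_cast<size_t>(width * height), 0);
    dets.setSegmentationMask(mask, width, height);  // was setMask()

    // Was getSegmentationMaskAsImgFrame(); std::nullopt when no mask is set.
    std::optional<dai::ImgFrame> frame = dets.getSegmentationMask();

#ifdef DEPTHAI_HAVE_OPENCV_SUPPORT
    // OpenCV path: was the setSegmentationMask(cv::Mat) overload.
    cv::Mat cvMask(height, width, CV_8UC1, cv::Scalar(0));
    dets.setCvSegmentationMask(cvMask);
#endif
    return 0;
}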

include/depthai/pipeline/node/DetectionParser.hpp

Lines changed: 1 addition & 1 deletion
@@ -194,7 +194,7 @@ class DetectionParser : public DeviceNodeCRTP<DeviceNode, DetectionParser, Detec
     /**
      * Set number of keypoints to decode. Automatically enables keypoints decoding.
      */
-    void setNKeypoints(int nKeypoints);
+    void setNumKeypoints(int numKeypoints);

     /**
      * Set strides for yolo models