diff --git a/CMakeLists.txt b/CMakeLists.txt
index c040fa97c..4987f9f7a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -311,6 +311,7 @@ set(TARGET_CORE_SOURCES
     src/pipeline/node/ImageAlign.cpp
     src/pipeline/node/ToF.cpp
     src/pipeline/node/DetectionParser.cpp
+    src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
     src/pipeline/node/test/MyProducer.cpp
     src/pipeline/node/test/MyConsumer.cpp
     src/pipeline/node/UVC.cpp
diff --git a/bindings/python/src/pipeline/node/DetectionParserBindings.cpp b/bindings/python/src/pipeline/node/DetectionParserBindings.cpp
index 83389ff28..b12c3d1a8 100644
--- a/bindings/python/src/pipeline/node/DetectionParserBindings.cpp
+++ b/bindings/python/src/pipeline/node/DetectionParserBindings.cpp
@@ -65,6 +65,7 @@ void bind_detectionparser(pybind11::module& m, void* pCallstack) {
              DOC(dai, node, DetectionParser, setAnchors, 2))
         .def("setAnchorMasks", &DetectionParser::setAnchorMasks, py::arg("anchorMasks"), DOC(dai, node, DetectionParser, setAnchorMasks))
         .def("setIouThreshold", &DetectionParser::setIouThreshold, py::arg("thresh"), DOC(dai, node, DetectionParser, setIouThreshold))
+        .def("setRunOnHost", &DetectionParser::setRunOnHost, py::arg("runOnHost"), DOC(dai, node, DetectionParser, setRunOnHost))
         .def("setSubtype", &DetectionParser::setSubtype, py::arg("subtype"), DOC(dai, node, DetectionParser, setSubtype))
         .def("setDecodeKeypoints", &DetectionParser::setDecodeKeypoints, py::arg("decode"), DOC(dai, node, DetectionParser, setDecodeKeypoints))
         .def("setDecodeSegmentation", &DetectionParser::setDecodeSegmentation, py::arg("decode"), DOC(dai, node, DetectionParser, setDecodeSegmentation))
@@ -78,6 +79,7 @@ void bind_detectionparser(pybind11::module& m, void* pCallstack) {
         .def("getAnchors", &DetectionParser::getAnchors, DOC(dai, node, DetectionParser, getAnchors))
         .def("getAnchorMasks", &DetectionParser::getAnchorMasks, DOC(dai, node, DetectionParser, getAnchorMasks))
         .def("getIouThreshold", &DetectionParser::getIouThreshold, DOC(dai, node, DetectionParser, getIouThreshold))
+        .def("runOnHost", &DetectionParser::runOnHost, DOC(dai, node, DetectionParser, runOnHost))
         .def("getSubtype", &DetectionParser::getSubtype, DOC(dai, node, DetectionParser, getSubtype))
         .def("getNkeypoints", &DetectionParser::getNKeypoints, DOC(dai, node, DetectionParser, getNKeypoints))
         .def("getDecodeKeypoints", &DetectionParser::getDecodeKeypoints, DOC(dai, node, DetectionParser, getDecodeKeypoints))
diff --git a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
index 41cf4cb40..f6ae0d22b 100644
--- a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
+++ b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
@@ -3,4 +3,4 @@
 set(DEPTHAI_DEVICE_RVC4_MATURITY "snapshot")

 # "version if applicable"
-set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+777a261c32c4daf6a0b31093701d6f930f6b8caf")
+set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+62ce59c3c4a4a53a9b0773fe83dabbecdc4553e9")
diff --git a/cmake/Depthai/DepthaiDeviceSideConfig.cmake b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
index 6cad4c819..7c6bb3df4 100644
--- a/cmake/Depthai/DepthaiDeviceSideConfig.cmake
+++ b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
@@ -2,7 +2,7 @@
 set(DEPTHAI_DEVICE_SIDE_MATURITY "snapshot")

 # "full commit hash of device side binary"
-set(DEPTHAI_DEVICE_SIDE_COMMIT "b18d476e0adba7541664251cce8c1ba71db16daa")
+set(DEPTHAI_DEVICE_SIDE_COMMIT "8741ce89206d2a5299acc3382c7496e1ee205fcb")

 # "version if applicable"
 set(DEPTHAI_DEVICE_SIDE_VERSION "")
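Reviewer note: the new `setRunOnHost`/`runOnHost` bindings mirror the C++ setters added further down. A minimal sketch of how a caller would opt into host-side decoding; the pipeline wiring and `modelDescription` are placeholder assumptions borrowed from the examples below, not part of this change:

    // Sketch only: assumes `pipeline`, `cameraNode` and `modelDescription` exist as in the examples.
    auto detectionNetwork = pipeline.create<dai::node::DetectionNetwork>();
    detectionNetwork->build(cameraNode, modelDescription);
    detectionNetwork->detectionParser->setRunOnHost(true);          // decode on host instead of on device
    bool onHost = detectionNetwork->detectionParser->runOnHost();   // query the setting back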
diff --git a/examples/cpp/DetectionNetwork/CMakeLists.txt b/examples/cpp/DetectionNetwork/CMakeLists.txt
index 9a3adc4e6..8c3ba6ecf 100644
--- a/examples/cpp/DetectionNetwork/CMakeLists.txt
+++ b/examples/cpp/DetectionNetwork/CMakeLists.txt
@@ -23,8 +23,8 @@ dai_set_example_test_labels(detection_network ondevice rvc2_all rvc4 rvc4rgb ci)
 dai_add_example(detection_network_remap detection_network_remap.cpp ON OFF)
 dai_set_example_test_labels(detection_network_remap ondevice rvc2_all rvc4 ci)

-dai_add_example(detection_and_segmentation RVC4/detection_and_segmentation.cpp ON OFF)
+dai_add_example(detection_and_segmentation detection_and_segmentation.cpp ON OFF)
 dai_set_example_test_labels(detection_and_segmentation rvc4)

-dai_add_example(detection_and_keypoints RVC4/detection_and_keypoints.cpp ON OFF)
+dai_add_example(detection_and_keypoints detection_and_keypoints.cpp ON OFF)
 dai_set_example_test_labels(detection_and_keypoints rvc4)
\ No newline at end of file
diff --git a/examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
similarity index 96%
rename from examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp
rename to examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
index 46dced90f..f36545af6 100644
--- a/examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp
+++ b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
@@ -22,7 +22,7 @@ int main() {
     auto detectionNetwork = pipeline.create<dai::node::DetectionNetwork>();

     dai::NNModelDescription modelDescription;
-    modelDescription.model = "luxonis/yolov8-large-pose-estimation:coco-640x352:1868e39";
+    modelDescription.model = "luxonis/yolov8-nano-pose-estimation:coco-512x288";
     detectionNetwork->build(cameraNode, modelDescription);

     auto labelMap = detectionNetwork->getClasses();
@@ -94,7 +94,6 @@ int main() {

             auto currentTime = std::chrono::steady_clock::now();
             float fps = counter / std::chrono::duration<float>(currentTime - startTime).count();
-            std::cout << "FPS: " << fps << std::endl;
         }

         if(cv::waitKey(1) == 'q') {
diff --git a/examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
similarity index 92%
rename from examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp
rename to examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
index 690fa18a5..0f0048dce 100644
--- a/examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp
+++ b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -16,8 +17,16 @@ cv::Rect frameNorm(const cv::Mat& frame, const dai::Point2f& topLeft, const dai:
 }

 int main() {
+    std::string modelName = "luxonis/yolov8-instance-segmentation-large:coco-640x352";
+    bool setRunOnHost = false;
+    auto device = std::make_shared<dai::Device>();
+
+    if(device->getPlatformAsString() == "RVC2") {
+        modelName = "luxonis/yolov8-instance-segmentation-nano:coco-512x288";
+        setRunOnHost = true;
+    }
     // Create pipeline
-    dai::Pipeline pipeline;
+    dai::Pipeline pipeline{device};

     // Create and configure camera node
     auto cameraNode = pipeline.create<dai::node::Camera>();
@@ -27,8 +36,10 @@ int main() {
     auto detectionNetwork = pipeline.create<dai::node::DetectionNetwork>();

     dai::NNModelDescription modelDescription;
-    modelDescription.model = "luxonis/yolov8-instance-segmentation-large:coco-640x480";
+
+    modelDescription.model = modelName;
     detectionNetwork->build(cameraNode, modelDescription);
+    detectionNetwork->detectionParser->setRunOnHost(setRunOnHost);

     auto labelMap = detectionNetwork->getClasses();

     // Create output queues
@@ -120,10 +131,10 @@ int main() {
                     detections.begin(), detections.end(), [filteredLabel](const dai::ImgDetection& det) { return det.label != filteredLabel; }),
                 detections.end());
         }
-
         if(segmentationMask) {
             cv::Mat lut(1, 256, CV_8U);
-            for(int i = 0; i < 256; ++i) lut.at<uint8_t>(i) = (i == 255) ? 255 : cv::saturate_cast<uint8_t>(i * 25);
+            for(int i = 0; i < 256; ++i) lut.at<uint8_t>(i) = (i >= 255) ? 255 : cv::saturate_cast<uint8_t>(i * 25);
+
             cv::Mat scaledMask;
             cv::LUT(*segmentationMask, lut, scaledMask);

@@ -158,8 +169,6 @@ int main() {
             cv::imshow("rgb", frame);

             auto currentTime = std::chrono::steady_clock::now();
-            float fps = counter / std::chrono::duration<float>(currentTime - startTime).count();
-            std::cout << "FPS: " << fps << std::endl;
         }
     }
 }
diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py b/examples/python/DetectionNetwork/detection_and_keypoints.py
similarity index 95%
rename from examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py
rename to examples/python/DetectionNetwork/detection_and_keypoints.py
index e8334e3b0..4459be138 100644
--- a/examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py
+++ b/examples/python/DetectionNetwork/detection_and_keypoints.py
@@ -7,8 +7,8 @@

 # Create pipeline
 with dai.Pipeline() as pipeline:
-    cameraNode = pipeline.create(dai.node.Camera).build()
-    detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-large-pose-estimation:coco-640x352:1868e39"))
+    cameraNode = pipeline.create(dai.node.Camera).build(sensorFps=12)
+    detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-nano-pose-estimation:coco-512x288"))
     labelMap = detectionNetwork.getClasses()

     qRgb = detectionNetwork.passthrough.createOutputQueue()
diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py b/examples/python/DetectionNetwork/detection_and_segmentation.py
similarity index 92%
rename from examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py
rename to examples/python/DetectionNetwork/detection_and_segmentation.py
index 5f995798d..6d24eff15 100644
--- a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py
+++ b/examples/python/DetectionNetwork/detection_and_segmentation.py
@@ -5,10 +5,19 @@ import numpy as np
 import time

+model_name = "luxonis/yolov8-instance-segmentation-large:coco-640x480"
+setRunOnHost = False
+device = dai.Device()
+if device.getPlatformAsString() == "RVC2":
+    model_name = "luxonis/yolov8-instance-segmentation-nano:coco-512x288"
+    setRunOnHost = True
+
 # Create pipeline
-with dai.Pipeline() as pipeline:
+with dai.Pipeline(device) as pipeline:
     cameraNode = pipeline.create(dai.node.Camera).build()
-    detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-instance-segmentation-large:coco-640x480"))
+
+    detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription(model_name))
+    detectionNetwork.detectionParser.setRunOnHost(setRunOnHost)
     labelMap = detectionNetwork.getClasses()

     qRgb = detectionNetwork.passthrough.createOutputQueue()
#include "depthai/common/YoloDecodingFamily.hpp" +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" namespace dai { namespace node { @@ -20,7 +22,7 @@ namespace node { * @brief DetectionParser node. Parses detection results from different neural networks and is being used internally by MobileNetDetectionNetwork and * YoloDetectionNetwork. */ -class DetectionParser : public DeviceNodeCRTP { +class DetectionParser : public DeviceNodeCRTP, public HostRunnable { public: constexpr static const char* NAME = "DetectionParser"; using DeviceNodeCRTP::DeviceNodeCRTP; @@ -268,7 +270,23 @@ class DetectionParser : public DeviceNodeCRTP decodeMobilenet(std::shared_ptr nnData, float confidenceThr); + private: + bool runOnHostVar = false; void setNNArchiveBlob(const NNArchive& nnArchive); void setNNArchiveSuperblob(const NNArchive& nnArchive, int numShaves); void setNNArchiveOther(const NNArchive& nnArchive); @@ -276,6 +294,15 @@ class DetectionParser : public DeviceNodeCRTP& outputs); + // host runnable requirements + void buildStage1() override; + void decodeYolo(std::shared_ptr nnData, std::shared_ptr outDetections); + std::vector inTensorInfo; + uint32_t imgWidth; + uint32_t imgHeight; + uint32_t imgSizesSet = false; + // + std::optional mArchive; std::optional archiveConfig; diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp index 22368d453..c5b448eae 100644 --- a/src/pipeline/node/DetectionParser.cpp +++ b/src/pipeline/node/DetectionParser.cpp @@ -13,6 +13,8 @@ #include "nn_archive/NNArchive.hpp" #include "nn_archive/v1/Head.hpp" #include "pipeline/ThreadedNodeImpl.hpp" +#include "pipeline/datatype/NNData.hpp" +#include "pipeline/utilities/DetectionParser/DetectionParserUtils.hpp" // internal headers #include "utility/ErrorMacros.hpp" @@ -370,5 +372,187 @@ std::vector DetectionParser::getStrides() const { return properties.parser.strides; } +void DetectionParser::setRunOnHost(bool runOnHost) { + if(runOnHost) { + pimpl->logger->warn("Detection parser set to run on host."); + } + runOnHostVar = runOnHost; +} + +/** + * Check if the node is set to run on host + */ +bool DetectionParser::runOnHost() const { + return runOnHostVar; +} + +void DetectionParser::run() { + auto& logger = pimpl->logger; + logger->info("Detection parser running on host."); + + using namespace std::chrono; + while(isRunning()) { + auto tAbsoluteBeginning = steady_clock::now(); + std::shared_ptr inputData; + inputData = input.get(); + if(!inputData) { + logger->error("Error while receiving NN frame."); + continue; + } + auto tAfterMessageBeginning = steady_clock::now(); + + if(!imgSizesSet) { + const bool containsTransformation = inputData->transformation.has_value(); + if(containsTransformation) { + std::tie(imgWidth, imgHeight) = inputData->transformation->getSize(); + } else { + logger->warn("No image size provided for detection parser. Skipping processing and sending empty detections."); + continue; + } + + imgSizesSet = true; + } + + auto outDetections = std::make_shared(); + + switch(properties.parser.nnFamily) { + case DetectionNetworkType::YOLO: { + decodeYolo(inputData, outDetections); + break; + } + case DetectionNetworkType::MOBILENET: { + auto dets = decodeMobilenet(inputData, properties.parser.confidenceThreshold); // TODO (aljaz) update to shared pointer + outDetections->detections = dets; + break; + } + default: { + logger->error("Unknown NN family. 
diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp
index 22368d453..c5b448eae 100644
--- a/src/pipeline/node/DetectionParser.cpp
+++ b/src/pipeline/node/DetectionParser.cpp
@@ -13,6 +13,8 @@
 #include "nn_archive/NNArchive.hpp"
 #include "nn_archive/v1/Head.hpp"
 #include "pipeline/ThreadedNodeImpl.hpp"
+#include "pipeline/datatype/NNData.hpp"
+#include "pipeline/utilities/DetectionParser/DetectionParserUtils.hpp"

 // internal headers
 #include "utility/ErrorMacros.hpp"
@@ -370,5 +372,187 @@ std::vector<int> DetectionParser::getStrides() const {
     return properties.parser.strides;
 }

+void DetectionParser::setRunOnHost(bool runOnHost) {
+    if(runOnHost) {
+        pimpl->logger->warn("Detection parser set to run on host.");
+    }
+    runOnHostVar = runOnHost;
+}
+
+/**
+ * Check if the node is set to run on host
+ */
+bool DetectionParser::runOnHost() const {
+    return runOnHostVar;
+}
+
+void DetectionParser::run() {
+    auto& logger = pimpl->logger;
+    logger->info("Detection parser running on host.");
+
+    using namespace std::chrono;
+    while(isRunning()) {
+        auto tAbsoluteBeginning = steady_clock::now();
+        std::shared_ptr<NNData> inputData;
+        inputData = input.get<NNData>();
+        if(!inputData) {
+            logger->error("Error while receiving NN frame.");
+            continue;
+        }
+        auto tAfterMessageBeginning = steady_clock::now();
+
+        if(!imgSizesSet) {
+            const bool containsTransformation = inputData->transformation.has_value();
+            if(containsTransformation) {
+                std::tie(imgWidth, imgHeight) = inputData->transformation->getSize();
+            } else {
+                logger->warn("No image size provided for detection parser. Skipping processing of this message.");
+                continue;
+            }
+
+            imgSizesSet = true;
+        }
+
+        auto outDetections = std::make_shared<ImgDetections>();
+
+        switch(properties.parser.nnFamily) {
+            case DetectionNetworkType::YOLO: {
+                decodeYolo(inputData, outDetections);
+                break;
+            }
+            case DetectionNetworkType::MOBILENET: {
+                auto dets = decodeMobilenet(inputData, properties.parser.confidenceThreshold);  // TODO (aljaz) update to shared pointer
+                outDetections->detections = dets;
+                break;
+            }
+            default: {
+                logger->error("Unknown NN family. 'YOLO' and 'MOBILENET' are supported.");
+                break;
+            }
+        }
+
+        auto tBeforeSend = steady_clock::now();
+
+        // Copy over seq and ts
+        outDetections->setSequenceNum(inputData->getSequenceNum());
+        outDetections->setTimestamp(inputData->getTimestamp());
+        outDetections->setTimestampDevice(inputData->getTimestampDevice());
+        outDetections->transformation = inputData->transformation;
+        // Send detections
+        out.send(outDetections);
+
+        auto tAbsoluteEnd = steady_clock::now();
+        logger->debug("Detection parser total took {}ms, processing {}ms, getting_frames {}ms, sending_frames {}ms",
+                      duration_cast<microseconds>(tAbsoluteEnd - tAbsoluteBeginning).count() / 1000,
+                      duration_cast<microseconds>(tBeforeSend - tAfterMessageBeginning).count() / 1000,
+                      duration_cast<microseconds>(tAfterMessageBeginning - tAbsoluteBeginning).count() / 1000,
+                      duration_cast<microseconds>(tAbsoluteEnd - tBeforeSend).count() / 1000);
+    }
+}
+
+void DetectionParser::buildStage1() {
+    auto& logger = pimpl->logger;
+
+    // Grab dimensions from input tensor info
+    if(properties.networkInputs.size() > 0) {
+        if(properties.networkInputs.size() > 1) {
+            logger->warn("Detection parser supports only single input networks, assuming first input");
+        }
+        for(const auto& kv : properties.networkInputs) {
+            const dai::TensorInfo& tensorInfo = kv.second;
+            inTensorInfo.push_back(tensorInfo);
+        }
+    }
+    if(inTensorInfo.size() > 0) {
+        int numDimensions = inTensorInfo[0].numDimensions;
+        if(numDimensions < 2) {
+            logger->error("Number of input dimensions is less than 2");
+        } else {
+            imgSizesSet = true;
+            imgWidth = inTensorInfo[0].dims[numDimensions - 1];
+            imgHeight = inTensorInfo[0].dims[numDimensions - 2];
+        }
+    } else {
+        logger->info("Unable to read input tensor height and width from static inputs. The node will try to get input sizes at runtime.");
+    }
+}
+
+std::vector<ImgDetection> DetectionParser::decodeMobilenet(std::shared_ptr<NNData> nnData, float confidenceThr) {
+    auto& logger = pimpl->logger;
+
+    if(!nnData) {
+        return {};
+    }
+    int maxDetections = 100;
+    std::vector<ImgDetection> detections;
+    std::string tensorName;
+    for(const auto& tensor : nnData->getAllLayers()) {
+        if(tensor.offset == 0) {
+            tensorName = tensor.name;
+        }
+    }
+
+    auto tensorData = nnData->getTensor<float>(tensorName);
+    maxDetections = tensorData.size() / 7;
+    if(static_cast<int>(tensorData.size()) < maxDetections * 7) {
+        logger->error("Error while parsing Mobilenet. Vector not long enough, expected size: {}, real size {}", maxDetections * 7, tensorData.size());
+        return {};
+    }
+
+    struct raw_Detection {  // need to update it to include more
+        float header;
+        float label;
+        float confidence;
+        float xmin;
+        float ymin;
+        float xmax;
+        float ymax;
+    };
+
+    float* rawPtr = tensorData.data();
+    for(int i = 0; i < maxDetections; i++) {
+        raw_Detection temp;
+        // TODO This is likely unnecessary optimisation
+        memcpy(&temp, &rawPtr[i * 7], sizeof(raw_Detection));
+
+        // if header == -1, stop sooner
+        if(temp.header == -1.0f) break;
+
+        float currentConfidence = temp.confidence;
+        if(currentConfidence >= confidenceThr) {
+            dai::ImgDetection d;
+            d.label = temp.label;
+
+            d.confidence = currentConfidence;
+
+            d.xmin = temp.xmin;
+            d.ymin = temp.ymin;
+            d.xmax = temp.xmax;
+            d.ymax = temp.ymax;
+
+            detections.push_back(d);
+        }
+    }
+    return detections;
+}
+
+void DetectionParser::decodeYolo(std::shared_ptr<NNData> nnData, std::shared_ptr<ImgDetections> outDetections) {
+    auto& logger = pimpl->logger;
+    switch(properties.parser.decodingFamily) {
+        case YoloDecodingFamily::R1AF:  // anchor free: yolo v6r1
+            utilities::DetectionParserUtils::decodeR1AF(nnData, outDetections, properties, logger);
+            break;
+        case YoloDecodingFamily::v3AB:  // anchor based yolo v3 v3-Tiny
+            utilities::DetectionParserUtils::decodeV3AB(nnData, outDetections, properties, logger);
+            break;
+        case YoloDecodingFamily::v5AB:  // anchor based yolo v5, v7, P
+            utilities::DetectionParserUtils::decodeV5AB(nnData, outDetections, properties, logger);
+            break;
+        case YoloDecodingFamily::TLBR:  // top left bottom right anchor free: yolo v6r2, v8 v10 v11
+            utilities::DetectionParserUtils::decodeTLBR(nnData, outDetections, properties, logger);
+            break;
+    }
+}
+
 }  // namespace node
 }  // namespace dai
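Reviewer note: decodeMobilenet() above walks a flat array of 7-float records terminated by a -1 image id. A standalone sketch of that layout on a hand-made buffer, useful for checking the stride and sentinel logic; the values are illustrative, not from a real model:

    #include <cstdio>
    #include <vector>

    int main() {
        // [image_id, label, confidence, xmin, ymin, xmax, ymax], terminated by image_id == -1
        std::vector<float> tensor = {0, 15, 0.92f, 0.10f, 0.20f, 0.45f, 0.80f,
                                     -1, 0, 0, 0, 0, 0, 0};
        for(size_t i = 0; i + 7 <= tensor.size(); i += 7) {
            if(tensor[i] == -1.0f) break;  // sentinel: same early-out as the parser
            std::printf("label=%d conf=%.2f box=(%.2f,%.2f)-(%.2f,%.2f)\n",
                        (int)tensor[i + 1], tensor[i + 2], tensor[i + 3], tensor[i + 4], tensor[i + 5], tensor[i + 6]);
        }
    }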
diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
new file mode 100644
index 000000000..a9455e551
--- /dev/null
+++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
@@ -0,0 +1,897 @@
+#include "DetectionParserUtils.hpp"
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "depthai/common/KeypointsListT.hpp"
+#include "depthai/common/RotatedRect.hpp"
+#include "depthai/common/TensorInfo.hpp"
+#include "depthai/pipeline/datatype/ImgDetections.hpp"
+#include "depthai/pipeline/datatype/NNData.hpp"
+#include "depthai/properties/DetectionParserProperties.hpp"
+#include "pipeline/utilities/NNDataViewer.hpp"
+
+namespace dai {
+namespace utilities {
+namespace DetectionParserUtils {
+
+// yolo v6 r1 - anchor free
+void decodeR1AF(std::shared_ptr<NNData> nnData,
+                std::shared_ptr<ImgDetections> outDetections,
+                DetectionParserProperties properties,
+                std::shared_ptr<spdlog::logger> logger) {
+    auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames);
+
+    const std::vector<int> strides = properties.parser.strides;
+    if(strides.size() != layerNames.size()) {
+        std::string errorMsg = fmt::format(
+            "Number of strides does not match number of output layers. Strides size: {}, output layers size: {}", strides.size(), layerNames.size());
+        throw std::runtime_error(errorMsg);
+    }
+    const float confidenceThr = properties.parser.confidenceThreshold;
+    const float iouThr = properties.parser.iouThreshold;
+    const int numClasses = properties.parser.classes;
+    int inputWidth;
+    int inputHeight;
+    std::tie(inputWidth, inputHeight) = nnData->transformation->getSize();
+
+    if(inputWidth <= 0 || inputHeight <= 0) {
+        throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation.");
+    }
+    std::vector<DetectionCandidate> detectionCandidates;
+    detectionCandidates.reserve(250);
+
+    for(int strideIdx = 0; strideIdx < static_cast<int>(layerNames.size()); ++strideIdx) {
+        std::string layerName = layerNames[strideIdx];
+        auto tensorInfo = nnData->getTensorInfo(layerName);
+        if(!tensorInfo) {
+            std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName);
+            throw std::runtime_error(errorMsg);
+        }
+
+        if(!isTensorOrderValid(*tensorInfo, properties, logger)) {
+            logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName);
+            continue;
+        }
+
+        int layerHeight = tensorInfo->getHeight();
+        int layerWidth = tensorInfo->getWidth();
+        NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger);
+        if(!outputData.build()) {
+            std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName);
+            throw std::runtime_error(errorMsg);
+        }
+
+        for(int row = 0; row < layerHeight; ++row) {
+            for(int col = 0; col < layerWidth; ++col) {
+                const float score = outputData.get(4, row, col);
+                if(score < confidenceThr) {
+                    continue;
+                }
+
+                int bestC = 0;
+                float bestConf = 0.0f;
+                for(int c = 0; c < numClasses; ++c) {
+                    float candidateProb = outputData.get(c + 5, row, col);
+                    if(candidateProb > bestConf) {
+                        bestConf = candidateProb;
+                        bestC = c;
+                    }
+                }
+                if(bestConf * score < confidenceThr) {
+                    continue;
+                }
+
+                float cx = outputData.get(0, row, col);
+                float cy = outputData.get(1, row, col);
+                float w = outputData.get(2, row, col);
+                float h = outputData.get(3, row, col);
+
+                float xmin = cx - w * 0.5f;
+                float ymin = cy - h * 0.5f;
+                float xmax = cx + w * 0.5f;
+                float ymax = cy + h * 0.5f;
+
+                xmin = std::max(0.0f, std::min(xmin, float(inputWidth)));
+                ymin = std::max(0.0f, std::min(ymin, float(inputHeight)));
+                xmax = std::max(0.0f, std::min(xmax, float(inputWidth)));
+                ymax = std::max(0.0f, std::min(ymax, float(inputHeight)));
+
+                if(xmax <= xmin || ymax <= ymin) {
+                    logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping");
+                    continue;
+                }
+                DetectionCandidate candidate = DetectionCandidate{
+                    xmin,
+                    ymin,
+                    xmax,
+                    ymax,
+                    bestConf * score,
+                    bestC,
+                    strideIdx,
+                    row,
+                    col,
+                    std::nullopt,
+                };
+
+                if(!properties.parser.classNames->empty()) {
+                    candidate.labelName = (*properties.parser.classNames)[bestC];
+                }
+                detectionCandidates.emplace_back(std::move(candidate));
+            }
+        }
+    }
+
+    std::vector<DetectionCandidate> keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr);
+    if(keepCandidates.size() == 0) {
+        logger->trace("No detections after NMS, skipping overlay.");
+        return;
+    }
+
+    createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight);
+
+    if(properties.parser.decodeSegmentation) {
+        logger->trace("Segmentation decoding.");
+        segmentationDecode(nnData, keepCandidates, outDetections, properties, logger);
+    }
+
+    if(properties.parser.decodeKeypoints) {
+        logger->trace("Keypoints decoding.");
+        keypointDecode(nnData, keepCandidates, outDetections, properties, logger);
+    }
+}
+
+/*
+Decode anchor based yolo v3 and v3-Tiny
+*/
+void decodeV3AB(std::shared_ptr<NNData> nnData,
+                std::shared_ptr<ImgDetections> outDetections,
+                DetectionParserProperties properties,
+                std::shared_ptr<spdlog::logger> logger) {
+    auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames);
+
+    const std::vector<int> strides = properties.parser.strides;
+    if(strides.size() != layerNames.size()) {
+        std::string errorMsg = fmt::format(
+            "Number of strides does not match number of output layers. Strides size: {}, output layers size: {}", strides.size(), layerNames.size());
+        throw std::runtime_error(errorMsg);
+    }
+
+    const float confidenceThr = properties.parser.confidenceThreshold;
+    const float iouThr = properties.parser.iouThreshold;
+    const int numClasses = properties.parser.classes;
+    int inputWidth;
+    int inputHeight;
+    std::tie(inputWidth, inputHeight) = nnData->transformation->getSize();
+    if(inputWidth <= 0 || inputHeight <= 0) {
+        throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation.");
+    }
+
+    if(properties.parser.anchorsV2.size() != layerNames.size()) {
+        logger->error("Number of anchor sets does not match number of output layers. Anchor sets size: {}, output layers size: {}",
+                      properties.parser.anchorsV2.size(),
+                      layerNames.size());
+        return;
+    }
+
+    std::vector<DetectionCandidate> detectionCandidates;
+    detectionCandidates.reserve(250);
+
+    for(int strideIdx = 0; strideIdx < static_cast<int>(layerNames.size()); ++strideIdx) {
+        std::string layerName = layerNames[strideIdx];
+        int stride = strides[strideIdx];
+        auto tensorInfo = nnData->getTensorInfo(layerName);
+        if(!tensorInfo) {
+            std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName);
+            throw std::runtime_error(errorMsg);
+        }
+
+        if(!isTensorOrderValid(*tensorInfo, properties, logger)) {
+            logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName);
+            continue;
+        }
+
+        int layerHeight = tensorInfo->getHeight();
+        int layerWidth = tensorInfo->getWidth();
+        int layerChannels = tensorInfo->getChannels();
+
+        NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger);
+        if(!outputData.build()) {
+            std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName);
+            throw std::runtime_error(errorMsg);
+        }
+        std::vector<std::vector<float>>& anchors = properties.parser.anchorsV2[strideIdx];
+        int numAnchors = anchors.size();
+        int block = 5 + numClasses;
+        int expectedC = numAnchors * block;
+
+        if(layerChannels != expectedC) {
+            std::string errorMsg = fmt::format("Layer {} channels mismatch. Expected {}, got {}", layerName, expectedC, layerChannels);
+            throw std::runtime_error(errorMsg);
+        }
+
+        auto sigmoid = [](float x) -> float { return 1.f / (1.f + std::exp(-x)); };
+
+        for(int row = 0; row < layerHeight; ++row) {
+            for(int col = 0; col < layerWidth; ++col) {
+                for(int a = 0; a < numAnchors; ++a) {
+                    const int ch0 = a * block;
+                    const float tx = sigmoid(outputData.get(ch0 + 0, row, col));
+                    const float ty = sigmoid(outputData.get(ch0 + 1, row, col));
+                    const float tw = outputData.get(ch0 + 2, row, col);
+                    const float th = outputData.get(ch0 + 3, row, col);
+                    const float obj = sigmoid(outputData.get(ch0 + 4, row, col));
+                    if(obj < confidenceThr) continue;
+
+                    int bestC = 0;
+                    float clsProb = 0.0f;
+                    for(int c = 0; c < numClasses; ++c) {
+                        const float prob = outputData.get(ch0 + 5 + c, row, col);
+                        if(prob > clsProb) {
+                            clsProb = prob;
+                            bestC = c;
+                        }
+                    }
+                    const float conf = obj * sigmoid(clsProb);
+                    if(conf < confidenceThr) continue;
+
+                    // YOLOv3 decode
+                    const float cx = (static_cast<float>(col) + tx) * static_cast<float>(stride);
+                    const float cy = (static_cast<float>(row) + ty) * static_cast<float>(stride);
+                    const float w_exp = std::exp(tw);
+                    const float h_exp = std::exp(th);
+                    const float w = w_exp * anchors[a][0];
+                    const float h = h_exp * anchors[a][1];
+
+                    float xmin = cx - 0.5f * w;
+                    float ymin = cy - 0.5f * h;
+                    float xmax = cx + 0.5f * w;
+                    float ymax = cy + 0.5f * h;
+
+                    xmin = std::max(0.0f, std::min(xmin, float(inputWidth)));
+                    ymin = std::max(0.0f, std::min(ymin, float(inputHeight)));
+                    xmax = std::max(0.0f, std::min(xmax, float(inputWidth)));
+                    ymax = std::max(0.0f, std::min(ymax, float(inputHeight)));
+
+                    if(xmax <= xmin || ymax <= ymin) {
+                        logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping");
+                        continue;
+                    }
+
+                    DetectionCandidate candidate = DetectionCandidate{
+                        xmin,
+                        ymin,
+                        xmax,
+                        ymax,
+                        conf,
+                        bestC,
+                        strideIdx,
+                        row,
+                        col,
+                        std::nullopt,
+                    };
+
+                    if(!properties.parser.classNames->empty()) {
+                        candidate.labelName = (*properties.parser.classNames)[bestC];
+                    }
+                    detectionCandidates.emplace_back(std::move(candidate));
+                }
+            }
+        }
+    }
+
+    std::vector<DetectionCandidate> keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr);
+    if(keepCandidates.size() == 0) {
+        logger->trace("No detections after NMS, skipping overlay.");
+        return;
+    }
+
+    createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight);
+
+    if(properties.parser.decodeSegmentation) {
+        logger->trace("Segmentation decoding.");
+        segmentationDecode(nnData, keepCandidates, outDetections, properties, logger);
+    }
+
+    if(properties.parser.decodeKeypoints) {
+        logger->trace("Keypoints decoding.");
+        keypointDecode(nnData, keepCandidates, outDetections, properties, logger);
+    }
+}
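Reviewer note: a quick numeric check of the v3 box equations used above, bx = (col + sigmoid(tx)) * stride and bw = anchor_w * exp(tw). All input values below are made up for illustration:

    #include <cmath>
    #include <cstdio>

    int main() {
        auto sigmoid = [](float x) { return 1.f / (1.f + std::exp(-x)); };
        const float tx = 0.2f, ty = -0.1f, tw = 0.3f, th = 0.1f;  // raw network outputs (illustrative)
        const int col = 7, row = 4, stride = 16;                  // grid cell and stride
        const float anchorW = 30.f, anchorH = 61.f;               // one anchor pair
        const float cx = (col + sigmoid(tx)) * stride;            // box center x in input pixels
        const float cy = (row + sigmoid(ty)) * stride;
        const float w = anchorW * std::exp(tw);                   // anchors scale exponentiated offsets
        const float h = anchorH * std::exp(th);
        std::printf("center=(%.1f, %.1f) size=(%.1f x %.1f)\n", cx, cy, w, h);
    }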
+
+/*
+Decode anchor based networks, e.g., yolo v5, v7, P
+*/
+void decodeV5AB(std::shared_ptr<NNData> nnData,
+                std::shared_ptr<ImgDetections> outDetections,
+                DetectionParserProperties properties,
+                std::shared_ptr<spdlog::logger> logger) {
+    auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames);
+
+    const std::vector<int> strides = properties.parser.strides;
+    if(strides.size() != layerNames.size()) {
+        std::string errorMsg = fmt::format(
+            "Number of strides does not match number of output layers. Strides size: {}, output layers size: {}", strides.size(), layerNames.size());
+        throw std::runtime_error(errorMsg);
+    }
+
+    const float confidenceThr = properties.parser.confidenceThreshold;
+    const float iouThr = properties.parser.iouThreshold;
+    const int numClasses = properties.parser.classes;
+    int inputWidth;
+    int inputHeight;
+    std::tie(inputWidth, inputHeight) = nnData->transformation->getSize();
+
+    if(inputWidth <= 0 || inputHeight <= 0) {
+        throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation.");
+    }
+
+    if(properties.parser.anchorsV2.size() != layerNames.size()) {
+        logger->error("Number of anchor sets does not match number of output layers. Anchor sets size: {}, output layers size: {}",
+                      properties.parser.anchorsV2.size(),
+                      layerNames.size());
+        return;
+    }
+
+    std::vector<DetectionCandidate> detectionCandidates;
+    detectionCandidates.reserve(250);
+
+    for(int strideIdx = 0; strideIdx < static_cast<int>(layerNames.size()); ++strideIdx) {
+        std::string layerName = layerNames[strideIdx];
+        int stride = strides[strideIdx];
+        auto tensorInfo = nnData->getTensorInfo(layerName);
+        if(!tensorInfo) {
+            std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName);
+            throw std::runtime_error(errorMsg);
+        }
+
+        if(!isTensorOrderValid(*tensorInfo, properties, logger)) {
+            logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName);
+            continue;
+        }
+
+        int layerHeight = tensorInfo->getHeight();
+        int layerWidth = tensorInfo->getWidth();
+        int layerChannels = tensorInfo->getChannels();
+
+        NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger);
+        if(!outputData.build()) {
+            std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName);
+            throw std::runtime_error(errorMsg);
+        }
+        std::vector<std::vector<float>>& anchors = properties.parser.anchorsV2[strideIdx];
+        int numAnchors = anchors.size();
+        int block = 5 + numClasses;
+        int expectedC = numAnchors * block;
+
+        if(layerChannels != expectedC) {
+            logger->error("Layer {} channels mismatch. Expected {}, got {}", layerName, expectedC, layerChannels);
+            return;
+        }
+
+        for(int row = 0; row < layerHeight; ++row) {
+            for(int col = 0; col < layerWidth; ++col) {
+                for(int a = 0; a < numAnchors; ++a) {
+                    const int ch0 = a * block;
+
+                    const float tx = outputData.get(ch0 + 0, row, col);
+                    const float ty = outputData.get(ch0 + 1, row, col);
+                    const float tw = outputData.get(ch0 + 2, row, col);
+                    const float th = outputData.get(ch0 + 3, row, col);
+                    const float obj = outputData.get(ch0 + 4, row, col);
+                    if(obj < confidenceThr) continue;
+
+                    int bestC = 0;
+                    float bestConf = 0.0f;
+                    for(int c = 0; c < numClasses; ++c) {
+                        const float prob = outputData.get(ch0 + 5 + c, row, col);
+                        if(prob > bestConf) {
+                            bestConf = prob;
+                            bestC = c;
+                        }
+                    }
+                    const float conf = obj * bestConf;
+                    if(conf < confidenceThr) continue;
+
+                    // YOLOv5 decode
+                    const float cx = ((tx * 2.0f - 0.5f) + static_cast<float>(col)) * static_cast<float>(stride);
+                    const float cy = ((ty * 2.0f - 0.5f) + static_cast<float>(row)) * static_cast<float>(stride);
+
+                    const float w = tw * tw * 4.0f * anchors[a][0];
+                    const float h = th * th * 4.0f * anchors[a][1];
+
+                    float xmin = cx - 0.5f * w;
+                    float ymin = cy - 0.5f * h;
+                    float xmax = cx + 0.5f * w;
+                    float ymax = cy + 0.5f * h;
+
+                    xmin = std::max(0.0f, std::min(xmin, float(inputWidth)));
+                    ymin = std::max(0.0f, std::min(ymin, float(inputHeight)));
+                    xmax = std::max(0.0f, std::min(xmax, float(inputWidth)));
+                    ymax = std::max(0.0f, std::min(ymax, float(inputHeight)));
+
+                    if(xmax <= xmin || ymax <= ymin) continue;
+                    DetectionCandidate candidate = DetectionCandidate{
+                        xmin,
+                        ymin,
+                        xmax,
+                        ymax,
+                        conf,
+                        bestC,
+                        strideIdx,
+                        row,
+                        col,
+                        std::nullopt,
+                    };
+
+                    if(!properties.parser.classNames->empty()) {
+                        candidate.labelName = (*properties.parser.classNames)[bestC];
+                    }
+                    detectionCandidates.emplace_back(std::move(candidate));
+                }
+            }
+        }
+    }
+
+    std::vector<DetectionCandidate> keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr);
+    if(keepCandidates.size() == 0) {
+        logger->trace("No detections after NMS, skipping overlay.");
+        return;
+    }
+
+    createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight);
+
+    if(properties.parser.decodeSegmentation) {
+        logger->trace("Segmentation decoding.");
+        segmentationDecode(nnData, keepCandidates, outDetections, properties, logger);
+    }
+
+    if(properties.parser.decodeKeypoints) {
+        logger->trace("Keypoints decoding.");
+        keypointDecode(nnData, keepCandidates, outDetections, properties, logger);
+    }
+}
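Reviewer note: the v5 decoder above reads tx..obj directly, i.e. it assumes the sigmoid is already folded into the model outputs. A numeric sketch of the box recovery it performs, cx = ((2tx - 0.5) + col) * stride and w = (2tw)^2 * anchor_w (written as tw*tw*4); the inputs are illustrative:

    #include <cstdio>

    int main() {
        const float tx = 0.6f, ty = 0.4f, tw = 0.5f, th = 0.45f;  // activations already in [0, 1]
        const int col = 10, row = 3, stride = 8;
        const float anchorW = 10.f, anchorH = 13.f;
        const float cx = ((tx * 2.0f - 0.5f) + col) * stride;     // center can land up to half a cell outside
        const float cy = ((ty * 2.0f - 0.5f) + row) * stride;
        const float w = tw * tw * 4.0f * anchorW;
        const float h = th * th * 4.0f * anchorH;
        std::printf("center=(%.1f, %.1f) size=(%.1f x %.1f)\n", cx, cy, w, h);
    }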
+
+/*
+Decode TLBR (top left bottom right) style networks, e.g., yolo v6r2, v8, v10, v11
+*/
+void decodeTLBR(std::shared_ptr<NNData> nnData,
+                std::shared_ptr<ImgDetections> outDetections,
+                DetectionParserProperties properties,
+                std::shared_ptr<spdlog::logger> logger) {
+    auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames);
+
+    const std::vector<int> strides = properties.parser.strides;
+    if(strides.size() != layerNames.size()) {
+        std::string errorMsg = fmt::format(
+            "Number of strides does not match number of output layers. Strides size: {}, output layers size: {}", strides.size(), layerNames.size());
+        throw std::runtime_error(errorMsg);
+    }
+    const float confidenceThr = properties.parser.confidenceThreshold;
+    const float iouThr = properties.parser.iouThreshold;
+    const int numClasses = properties.parser.classes;
+    int inputWidth;
+    int inputHeight;
+    std::tie(inputWidth, inputHeight) = nnData->transformation->getSize();
+
+    if(inputWidth <= 0 || inputHeight <= 0) {
+        throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation.");
+    }
+
+    std::vector<DetectionCandidate> detectionCandidates;
+    detectionCandidates.reserve(250);
+
+    for(int strideIdx = 0; strideIdx < static_cast<int>(layerNames.size()); ++strideIdx) {
+        std::string layerName = layerNames[strideIdx];
+        int stride = strides[strideIdx];
+        auto tensorInfo = nnData->getTensorInfo(layerName);
+        if(!tensorInfo) {
+            std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName);
+            throw std::runtime_error(errorMsg);
+        }
+
+        if(!isTensorOrderValid(*tensorInfo, properties, logger)) {
+            logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName);
+            continue;
+        }
+
+        int layerHeight = tensorInfo->getHeight();
+        int layerWidth = tensorInfo->getWidth();
+        NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger);
+        if(!outputData.build()) {
+            std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName);
+            throw std::runtime_error(errorMsg);
+        }
+
+        for(int row = 0; row < layerHeight; ++row) {
+            for(int col = 0; col < layerWidth; ++col) {
+                const float score = outputData.get(4, row, col);
+                if(score < confidenceThr) {
+                    continue;
+                }
+
+                int bestC = 0;
+                float bestConf = 0.0f;
+                for(int c = 0; c < numClasses; ++c) {
+                    float candidateProb = outputData.get(c + 5, row, col);
+                    if(candidateProb > bestConf) {
+                        bestConf = candidateProb;
+                        bestC = c;
+                    }
+                }
+                float xmin = (col - outputData.get(0, row, col) + 0.5f) * stride;
+                float ymin = (row - outputData.get(1, row, col) + 0.5f) * stride;
+                float xmax = (col + outputData.get(2, row, col) + 0.5f) * stride;
+                float ymax = (row + outputData.get(3, row, col) + 0.5f) * stride;
+
+                if(bestConf < confidenceThr) {
+                    continue;
+                }
+
+                xmin = std::max(0.0f, std::min(xmin, float(inputWidth)));
+                ymin = std::max(0.0f, std::min(ymin, float(inputHeight)));
+                xmax = std::max(0.0f, std::min(xmax, float(inputWidth)));
+                ymax = std::max(0.0f, std::min(ymax, float(inputHeight)));
+
+                if(xmax <= xmin || ymax <= ymin) {
+                    logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping");
+                    continue;
+                }
+
+                DetectionCandidate candidate = DetectionCandidate{
+                    xmin,
+                    ymin,
+                    xmax,
+                    ymax,
+                    bestConf,
+                    bestC,
+                    strideIdx,
+                    row,
+                    col,
+                    std::nullopt,
+                };
+
+                if(!properties.parser.classNames->empty()) {
+                    candidate.labelName = (*properties.parser.classNames)[bestC];
+                }
+                detectionCandidates.emplace_back(std::move(candidate));
+            }
+        }
+    }
+
+    std::vector<DetectionCandidate> keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr);
+    if(keepCandidates.size() == 0) {
+        logger->trace("No detections after NMS, skipping overlay.");
+        return;
+    }
+
+    createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight);
+
+    if(properties.parser.decodeSegmentation) {
+        logger->trace("Segmentation decoding.");
+        segmentationDecode(nnData, keepCandidates, outDetections, properties, logger);
+    }
+
+    if(properties.parser.decodeKeypoints) {
+        logger->trace("Keypoints decoding.");
+        keypointDecode(nnData, keepCandidates, outDetections, properties, logger);
+    }
+}
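Reviewer note: TLBR heads emit distances from the cell center to the four box edges, in grid cells. A numeric sketch of the recovery performed above; all values are illustrative:

    #include <cstdio>

    int main() {
        const float l = 1.2f, t = 0.8f, r = 2.1f, b = 1.5f;  // distances to left/top/right/bottom edges
        const int col = 5, row = 6, stride = 32;
        const float xmin = (col - l + 0.5f) * stride;        // +0.5 moves to the cell center first
        const float ymin = (row - t + 0.5f) * stride;
        const float xmax = (col + r + 0.5f) * stride;
        const float ymax = (row + b + 0.5f) * stride;
        std::printf("box=(%.0f,%.0f)-(%.0f,%.0f)\n", xmin, ymin, xmax, ymax);
    }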
Skipping."); + return false; + } + + return true; +} + +std::vector getSortedDetectionLayerNames(std::shared_ptr nnData, std::string searchTerm, std::vector outputNames) { + if(outputNames.empty()) { + outputNames = nnData->getAllLayerNames(); + } + + std::vector layerNames; + for(const auto& name : outputNames) { + // if yolo in the name, push it to layerNames + if(name.find(searchTerm) != std::string::npos) { + layerNames.push_back(name); + } + } + + std::sort(layerNames.begin(), layerNames.end()); + return layerNames; +} + +float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2) { + float width_of_overlap_area = fmin(box1.xmax, box2.xmax) - fmax(box1.xmin, box2.xmin); + float height_of_overlap_area = fmin(box1.ymax, box2.ymax) - fmax(box1.ymin, box2.ymin); + float area_of_overlap; + if(width_of_overlap_area < 0 || height_of_overlap_area < 0) + area_of_overlap = 0; + else + area_of_overlap = width_of_overlap_area * height_of_overlap_area; + float box_1_area = (box1.ymax - box1.ymin) * (box1.xmax - box1.xmin); + float box_2_area = (box2.ymax - box2.ymin) * (box2.xmax - box2.xmin); + float area_of_union = box_1_area + box_2_area - area_of_overlap; + return area_of_overlap / area_of_union; +} + +std::vector nonMaximumSuppression(std::vector& detectionCandidates, float iouThr) { + std::sort( + detectionCandidates.begin(), detectionCandidates.end(), [](const DetectionCandidate& a, const DetectionCandidate& b) { return a.score > b.score; }); + + std::vector keep(detectionCandidates.size(), 1); + std::vector keepIndices; + keepIndices.reserve(detectionCandidates.size()); + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { + if(!keep[i]) continue; + keepIndices.push_back(i); + + for(size_t j = i + 1; j < detectionCandidates.size(); ++j) { + if(!keep[j]) continue; + if(YoloIntersectionOverUnion(detectionCandidates[i], detectionCandidates[j]) >= iouThr) { + keep[j] = 0; + } + } + } + + std::vector keepCandidates; + keepCandidates.reserve(keepIndices.size()); + for(size_t idx : keepIndices) keepCandidates.push_back(detectionCandidates[idx]); + + return keepCandidates; +} + +void createImgDetections(const std::vector& detectionCandidates, + std::shared_ptr outDetections, + unsigned int width, + unsigned int height) { + for(const auto& det : detectionCandidates) { + dai::ImgDetection detection; + dai::RotatedRect rotatedRect(dai::Rect(dai::Point2f(det.xmin, det.ymin), dai::Point2f(det.xmax, det.ymax)), 0.0f); + detection.setBoundingBox(rotatedRect.normalize(width, height)); + detection.confidence = det.score; + detection.label = det.label; + if(det.labelName) { + detection.labelName = *det.labelName; + } + outDetections->detections.push_back(std::move(detection)); + } +} + +void segmentationDecode(std::shared_ptr nnData, + std::vector& detectionCandidates, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto maskFromCoeffs = [](NNDataViewer& protos, const float* coeffs, int width, int height) -> cv::Mat { + cv::Mat maskLow(height, width, CV_32F); + for(int y = 0; y < maskLow.rows; ++y) { + float* row = maskLow.ptr(y); + for(int x = 0; x < maskLow.cols; ++x) { + float sum = 0.f; + for(int c = 0; c < 32; ++c) sum += protos.get(c, y, x) * coeffs[c]; + row[x] = 1.f / (1.f + std::exp(-sum)); // sigmoid + } + } + return maskLow; + }; + + std::pair inputSize = nnData->transformation->getSize(); + int inputWidth = inputSize.first; + int inputHeight = inputSize.second; + + cv::Mat indexMask(inputHeight, 
+
+void createImgDetections(const std::vector<DetectionCandidate>& detectionCandidates,
+                         std::shared_ptr<ImgDetections> outDetections,
+                         unsigned int width,
+                         unsigned int height) {
+    for(const auto& det : detectionCandidates) {
+        dai::ImgDetection detection;
+        dai::RotatedRect rotatedRect(dai::Rect(dai::Point2f(det.xmin, det.ymin), dai::Point2f(det.xmax, det.ymax)), 0.0f);
+        detection.setBoundingBox(rotatedRect.normalize(width, height));
+        detection.confidence = det.score;
+        detection.label = det.label;
+        if(det.labelName) {
+            detection.labelName = *det.labelName;
+        }
+        outDetections->detections.push_back(std::move(detection));
+    }
+}
+
+void segmentationDecode(std::shared_ptr<NNData> nnData,
+                        std::vector<DetectionCandidate>& detectionCandidates,
+                        std::shared_ptr<ImgDetections> outDetections,
+                        DetectionParserProperties properties,
+                        std::shared_ptr<spdlog::logger> logger) {
+    auto maskFromCoeffs = [](NNDataViewer& protos, const float* coeffs, int width, int height) -> cv::Mat {
+        cv::Mat maskLow(height, width, CV_32F);
+        for(int y = 0; y < maskLow.rows; ++y) {
+            float* row = maskLow.ptr<float>(y);
+            for(int x = 0; x < maskLow.cols; ++x) {
+                float sum = 0.f;
+                for(int c = 0; c < 32; ++c) sum += protos.get(c, y, x) * coeffs[c];
+                row[x] = 1.f / (1.f + std::exp(-sum));  // sigmoid
+            }
+        }
+        return maskLow;
+    };
+
+    std::pair<int, int> inputSize = nnData->transformation->getSize();
+    int inputWidth = inputSize.first;
+    int inputHeight = inputSize.second;
+
+    cv::Mat indexMask(inputHeight, inputWidth, CV_8U, cv::Scalar(255));
+
+    cv::Mat maskLow, maskUp;
+
+    auto maskLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "masks", std::vector<std::string>{});
+    if(properties.parser.strides.size() != maskLayerNames.size()) {
+        logger->error(
+            "Number of strides does not match number of mask output layers. Strides size: {}, mask output layers size: {}. Skipping segmentation decoding.",
+            properties.parser.strides.size(),
+            maskLayerNames.size());
+        return;
+    }
+    auto protoLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "proto", std::vector<std::string>{});
+    if(protoLayerNames.size() == 0) {
+        logger->error("Expecting proto output layer, found no layer with proto label. Skipping segmentation decoding.");
+        return;
+    }
+
+    NNDataViewer protoValues = NNDataViewer(*nnData->getTensorInfo(protoLayerNames[0]), nnData->data, logger);
+    if(!protoValues.build()) {
+        logger->error("Failed to build NNDataViewer for proto layer {}. Skipping segmentation decoding.", protoLayerNames[0]);
+        return;
+    }
+
+    std::map<int, NNDataViewer> maskValues;
+    for(int strideIdx = 0; strideIdx < static_cast<int>(maskLayerNames.size()); ++strideIdx) {
+        maskValues.try_emplace(strideIdx, *nnData->getTensorInfo(maskLayerNames[strideIdx]), nnData->data, logger);
+        if(!maskValues.at(strideIdx).build()) {
+            logger->error("Failed to build NNDataViewer for mask layer {}. Skipping segmentation decoding.", maskLayerNames[strideIdx]);
+            return;
+        }
+    }
+
+    for(size_t i = 0; i < detectionCandidates.size(); ++i) {  // loop over all detections
+        const auto& c = detectionCandidates[i];
+        const int detIdx = static_cast<int>(i);  // index in outDetections list
+
+        NNDataViewer mask = maskValues.at(c.headIndex);
+        std::array<float, 32> coeff;
+        for(int k = 0; k < 32; ++k) {
+            coeff[k] = mask.get(k, c.rowIndex, c.columnIndex);
+        }
+
+        TensorInfo protoInfo = *nnData->getTensorInfo(protoLayerNames[0]);
+        int protoWidth = protoInfo.getWidth();
+        int protoHeight = protoInfo.getHeight();
+        maskLow = maskFromCoeffs(protoValues, coeff.data(), protoWidth, protoHeight);
+
+        cv::resize(maskLow, maskUp, cv::Size(inputWidth, inputHeight), 0, 0, cv::INTER_LINEAR);
+        // ROI clamp
+        int x0 = std::clamp(static_cast<int>(std::floor(c.xmin)), 0, inputWidth - 1);
+        int y0 = std::clamp(static_cast<int>(std::floor(c.ymin)), 0, inputHeight - 1);
+        int x1 = std::clamp(static_cast<int>(std::ceil(c.xmax)), 0, inputWidth);
+        int y1 = std::clamp(static_cast<int>(std::ceil(c.ymax)), 0, inputHeight);
+
+        if(x1 <= x0 || y1 <= y0) continue;
+        const cv::Rect roi(x0, y0, x1 - x0, y1 - y0);
+
+        // Threshold & paint only unassigned pixels
+        cv::Mat roiProb = maskUp(roi);
+        cv::Mat roiBin;
+        cv::compare(roiProb, static_cast<double>(0.5f), roiBin, cv::CMP_GT);
+        cv::Mat roiOut = indexMask(roi);
+        cv::Mat unassigned;
+        cv::compare(roiOut, 255, unassigned, cv::CMP_EQ);
+        cv::Mat paintMask;
+        cv::bitwise_and(roiBin, unassigned, paintMask);
+
+        const uint8_t value = static_cast<uint8_t>(std::min(detIdx, 254));
+        roiOut.setTo(value, paintMask);
+    }
+
+    outDetections->setSegmentationMask(indexMask);
+}
+
+void keypointDecode(std::shared_ptr<NNData> nnData,
+                    std::vector<DetectionCandidate>& detectionCandidates,
+                    std::shared_ptr<ImgDetections> outDetections,
+                    DetectionParserProperties properties,
+                    std::shared_ptr<spdlog::logger> logger) {
+    int inputWidth;
+    int inputHeight;
+    std::tie(inputWidth, inputHeight) = nnData->transformation->getSize();
+
+    auto yoloLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames);
+    std::vector<int> featureMapWidths;
+    for(int i = 0; i < static_cast<int>(yoloLayerNames.size()); ++i) {
+        auto tensorInfo = nnData->getTensorInfo(yoloLayerNames[i]);
+        if(!tensorInfo) {
+            logger->error("Tensor info for layer {} is null. Skipping keypoints decoding.", yoloLayerNames[i]);
+            return;
+        }
+        featureMapWidths.push_back(tensorInfo->getWidth());
+    }
+
+    auto kptsLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "kpt_output", std::vector<std::string>{});
+    if(properties.parser.strides.size() != kptsLayerNames.size()) {
+        logger->error(
+            "Number of strides does not match number of keypoints output layers. Strides size: {}, keypoints output layers size: {}. Skipping keypoints "
+            "decoding.",
+            properties.parser.strides.size(),
+            kptsLayerNames.size());
+        return;
+    }
+
+    // TODO (aljaz) move to a function
+    std::map<int, NNDataViewer> keypointValues;
+    for(int strideIdx = 0; strideIdx < static_cast<int>(kptsLayerNames.size()); ++strideIdx) {
+        keypointValues.try_emplace(strideIdx, *nnData->getTensorInfo(kptsLayerNames[strideIdx]), nnData->data, logger);
+        if(!keypointValues.at(strideIdx).build()) {
+            logger->error("Failed to build NNDataViewer for keypoints layer {}. Skipping keypoints decoding.", kptsLayerNames[strideIdx]);
+            return;
+        }
+    }
+
+    if(outDetections->detections.size() != detectionCandidates.size()) {
+        logger->error(
+            "Number of detections in ImgDetections does not match number of detection candidates. ImgDetections size: {}, detection candidates size: {}. "
+            "Skipping keypoints decoding.",
+            outDetections->detections.size(),
+            detectionCandidates.size());
+        return;
+    }
+
+    for(size_t i = 0; i < detectionCandidates.size(); ++i) {  // loop over all detections
+        const auto& c = detectionCandidates[i];
+        int flattenedIndex = c.rowIndex * featureMapWidths[c.headIndex] + c.columnIndex;
+
+        std::vector<dai::Keypoint> keypoints;
+        keypoints.reserve(*properties.parser.nKeypoints);
+        NNDataViewer keypointMask = keypointValues.at(c.headIndex);
+
+        for(int k = 0; k < static_cast<int>(*properties.parser.nKeypoints); ++k) {
+            int base = 3 * k;
+
+            // keypointValues tensor storage order HWC
+            // H == 0
+            // W == 51 == 17 * 3 (x, y, conf for each keypoint)
+            // C == flattened spatial dimensions of row x col of the feature map
+            float x = std::clamp(keypointMask.get(flattenedIndex, 0, base + 0) / inputWidth, 0.0f, 1.0f);
+            float y = std::clamp(keypointMask.get(flattenedIndex, 0, base + 1) / inputHeight, 0.0f, 1.0f);
+            float conf = 1.f / (1.f + std::exp(-(keypointMask.get(flattenedIndex, 0, base + 2))));
+
+            keypoints.push_back(dai::Keypoint{dai::Point2f(x, y), conf});
+        }
+
+        outDetections->detections[i].keypoints = KeypointsList(keypoints, properties.parser.keypointEdges);
+    }
+}
+
+}  // namespace DetectionParserUtils
+}  // namespace utilities
+}  // namespace dai
\ No newline at end of file
diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp
new file mode 100644
index 000000000..85b5a234f
--- /dev/null
+++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp
@@ -0,0 +1,85 @@
+#pragma once
+
+#include
+
+#include
+
+#include "depthai/pipeline/datatype/ImgDetections.hpp"
+#include "depthai/pipeline/datatype/NNData.hpp"
+#include "depthai/properties/DetectionParserProperties.hpp"
+
+namespace dai {
+namespace utilities {
+namespace DetectionParserUtils {
+
+struct DetectionCandidate {
+    float xmin, ymin, xmax, ymax, score;
+    int label, headIndex, rowIndex, columnIndex;
+    std::optional<std::string> labelName;
+};
+
+/*
+Decode anchor free yolo v6r1 with sigmoid assisted center detection
+*/
+void decodeR1AF(std::shared_ptr<NNData> nnData,
+                std::shared_ptr<ImgDetections> outDetections,
+                DetectionParserProperties properties,
+                std::shared_ptr<spdlog::logger> logger);
+
+/*
+Decode anchor based yolo v3 and v3-Tiny
+*/
+void decodeV3AB(std::shared_ptr<NNData> nnData,
+                std::shared_ptr<ImgDetections> outDetections,
+                DetectionParserProperties properties,
+                std::shared_ptr<spdlog::logger> logger);
+
+/*
+Decode anchor based networks, e.g., yolo v5, v7, P
+*/
+void decodeV5AB(std::shared_ptr<NNData> nnData,
+                std::shared_ptr<ImgDetections> outDetections,
+                DetectionParserProperties properties,
+                std::shared_ptr<spdlog::logger> logger);
+
+/*
+Decode anchor free top-left-bottom-right (TLBR) style networks, e.g., yolo v6r2, v8, v10, v11
+*/
+void decodeTLBR(std::shared_ptr<NNData> nnData,
+                std::shared_ptr<ImgDetections> outDetections,
+                DetectionParserProperties properties,
+                std::shared_ptr<spdlog::logger> logger);
+
+std::vector<std::string> getSortedDetectionLayerNames(std::shared_ptr<NNData> nnData, std::string searchTerm, std::vector<std::string> outputNames);
+
+float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2);
+
+bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr<spdlog::logger> logger);
+
+std::vector<DetectionCandidate> nonMaximumSuppression(std::vector<DetectionCandidate>& detectionCandidates, float iouThr);
+
+void createImgDetections(const std::vector<DetectionCandidate>& detectionCandidates,
+                         std::shared_ptr<ImgDetections> outDetections,
+                         unsigned int width,
+                         unsigned int height);
+
+void segmentationDecode(std::shared_ptr<NNData> nnData,
+                        std::vector<DetectionCandidate>& detectionCandidates,
+                        std::shared_ptr<ImgDetections> outDetections,
+                        DetectionParserProperties properties,
+                        std::shared_ptr<spdlog::logger> logger);
+
+void keypointDecode(std::shared_ptr<NNData> nnData,
+                    std::vector<DetectionCandidate>& detectionCandidates,
+                    std::shared_ptr<ImgDetections> outDetections,
+                    DetectionParserProperties properties,
+                    std::shared_ptr<spdlog::logger> logger);
+
+}  // namespace DetectionParserUtils
+}  // namespace utilities
+}  // namespace dai
\ No newline at end of file
It is {}", tensor.dims[0]); + return false; + } + if(tensor.strides.size() != 4) { + logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4); + } + factorsBefore.c = tensor.strides[1]; + factorsBefore.h = tensor.strides[2]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::NHWC: + if(tensor.dims[0] != 1) { + logger->error("NHWC is only supported in Detection Parser if N is 1. It is {}", tensor.dims[0]); + return false; + } + if(tensor.strides.size() != 4) { + logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4); + } + factorsBefore.h = tensor.strides[1]; + factorsBefore.w = tensor.strides[2]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::HCW: + factorsBefore.h = tensor.strides[0]; + factorsBefore.c = tensor.strides[1]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::HWC: + factorsBefore.h = tensor.strides[0]; + factorsBefore.w = tensor.strides[1]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::CHW: + factorsBefore.c = tensor.strides[0]; + factorsBefore.h = tensor.strides[1]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::CWH: + factorsBefore.c = tensor.strides[0]; + factorsBefore.w = tensor.strides[1]; + factorsBefore.h = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::WCH: + factorsBefore.w = tensor.strides[0]; + factorsBefore.c = tensor.strides[1]; + factorsBefore.h = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::WHC: + factorsBefore.w = tensor.strides[0]; + factorsBefore.h = tensor.strides[1]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::NHCW: + case TensorInfo::StorageOrder::NC: + case TensorInfo::StorageOrder::CN: + case TensorInfo::StorageOrder::H: + case TensorInfo::StorageOrder::W: + case TensorInfo::StorageOrder::C: + default: + logger->error("Storage order not supported in NNDataViewer"); + return false; + } + return sanity_check(); + } + + bool sanity_check() { + if(data->getSize() < (tensor.offset + (tensor.dims[0] * tensor.strides[0]))) { + logger->error( + "Underlying data does not hold enough data for the tensor to be contained.\ + Tensor size: {}, Tensor offset: {}, Data type size: {}, Data size: {} ", + tensor.dims[0] * tensor.strides[0], + tensor.offset, + tensor.getDataTypeSize(), + data->getSize()); + return false; + } + if(tensor.dims.size() < 2) { + logger->error("Number of dimensions for the input tensor is expected to be at least 2. It is {}", tensor.dims.size()); + return false; + } + return true; + }; + + inline float get(int c, int h, int w) { + // If this turns out to be slow, use a function pointer instead and point to the right getter at build time + int32_t index = tensor.offset + factorsBefore.h * h + factorsBefore.w * w + factorsBefore.c * c; +#ifdef DEPTHAI_SAFE_NN_DATA_ACCESS + logger->trace("Offset {}, fbH {}, fbW {}, fbC {}, h {}, w {}, c{}", tensor.offset, factorsBefore.h, factorsBefore.w, factorsBefore.c, h, w, c); + if(index > data->getSize()) { + logger->error("Out of bound access. 
Size is {}, index is {}", data->getSize(), index); + return 0.0; + } +#endif + + switch(tensor.dataType) { + case TensorInfo::DataType::U8F: { + uint8_t dataOut = data->getData()[index]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::I8: { + int8_t dataOut = static_cast(data->getData()[index]); + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::INT: { + int32_t dataOut = reinterpret_cast(data->getData().data())[index / sizeof(int32_t)]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP16: { + int16_t dataOut = reinterpret_cast(data->getData().data())[index / sizeof(int16_t)]; + return (fp16_ieee_to_fp32_value(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP32: { + float dataOut = reinterpret_cast(data->getData().data())[index / sizeof(float)]; + return (static_cast(dataOut) - tensor.qpZp) * tensor.qpScale; + } + case TensorInfo::DataType::FP64: + default: { + return 0.0f; + } + } + } +}; +} // namespace dai diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 702ea0def..51d1d7fa2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -389,7 +389,7 @@ dai_set_test_labels(nndata_test onhost ci) #ImgDetections tests dai_add_test(imgdetections_test src/onhost_tests/pipeline/datatype/imgdetections_test.cpp) -dai_set_test_labels(imgdetections_test onhost ci) +dai_set_test_labels(imgdetections_test ondevice rvc2 rvc4 onhost ci) # Model description tests dai_add_test(model_slug_test src/onhost_tests/model_slug_test.cpp) @@ -524,7 +524,7 @@ FIRE_VIDEO="${fire_video}" KITCHEN_IMAGE_PATH="${kitchen_image}" YOLO_V8_INSTANCE_SEGMENTATION_LARGE_COCO_640x352_KITCHEN_SEGMENTATION_GROUND_TRUTH="${yolo_v8_instance_segmentation_large_coco_640x352_kitchen_segmentation_gt}" ) -dai_set_test_labels(detection_parser_test ondevice rvc4 ci) +dai_set_test_labels(detection_parser_test ondevice rvc4 ci onhost) # Spatial detection network test dai_add_test(spatial_detection_network_test src/ondevice_tests/pipeline/node/spatial_detection_network_test.cpp)
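Reviewer note: a standalone sketch of the byte-offset arithmetic NNDataViewer::get() performs for an HWC fp16 tensor. Shapes and strides below are illustrative, not taken from a real model:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // HWC tensor, 2x3 spatial, 4 channels, fp16 (2 bytes per element), no offset
        const int32_t offset = 0;
        const int32_t strideH = 3 * 4 * 2;  // bytes per row: W * C * sizeof(fp16)
        const int32_t strideW = 4 * 2;      // bytes per column step: C * sizeof(fp16)
        const int32_t strideC = 2;          // bytes per channel step: sizeof(fp16)
        const int c = 1, h = 1, w = 2;
        const int32_t byteIndex = offset + strideH * h + strideW * w + strideC * c;
        // fp16 elements are then read as data[byteIndex / sizeof(int16_t)]
        std::printf("byte index %d -> element index %d\n", byteIndex, byteIndex / 2);
    }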