tue-robotics · IasonTheodorou · Jul 3, 2025 · Jul 4, 2025 · Jul 4, 2025 · Jul 4, 2025
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -5,7 +5,7 @@
             "name": "Debug SAM Model",
             "type": "cppdbg",
             "request": "launch",
-            "program": "${workspaceFolder}/build/devel/lib/sam_onnx_ros/test_sam_onnx_ros", // Path to the executable
+            "program": "/home/amigo/ros/noetic/system/devel/lib/sam_onnx_ros/test11_sam_onnx_ros", // Path to the executable
             "args": [], // Add any command-line arguments for your program here
             "stopAtEntry": false,
             "cwd": "${workspaceFolder}/build", // Set the working directory
@@ -19,7 +19,7 @@
                     "ignoreFailures": true
                 }
             ],
-            "preLaunchTask": "build-sam-project" // Ensure the project is built before debugging
+            //"preLaunchTask": "build-sam-project" // Ensure the project is built before debugging
         },
         {
             "name": "Debug GTest (pick binary)",

diff --git a/.vscode/tasks.json b/.vscode/tasks.json
@@ -52,7 +52,9 @@
             "args": [
                 "-DCMAKE_BUILD_TYPE=Debug",
                 "-S", "${workspaceFolder}/yolo_inference",
-                "-B", "${workspaceFolder}/yolo_inference/build"
+                "-B", "${workspaceFolder}/yolo_inference/build",
+                                "-DCMAKE_POLICY_VERSION_MINIMUM=3.5",
+
             ],
             "problemMatcher": ["$gcc"]
         },
@@ -74,6 +76,7 @@
             "args": [
                 "--build",
                 "${workspaceFolder}/pipeline/build",
+                "-DCMAKE_POLICY_VERSION_MINIMUM=3.5",
                 "--config", "Debug"
             ],
             "problemMatcher": ["$gcc"]

diff --git a/src/main.cpp b/src/main.cpp
@@ -13,7 +13,7 @@ int main()
     SEG::DL_RESULT res;
     std::tie(samSegmentors, params_encoder, params_decoder, res, resSam) = Initializer();
     std::filesystem::path current_path = std::filesystem::current_path();
-    std::filesystem::path imgs_path =  "/home/amigo/Documents/repos/hero_sam.bak/sam_inference/build/images"; // current_path / <- you could use
+    std::filesystem::path imgs_path =  "/home/amigo/Documents/repos/yolo_onnx_ros/build/images"; // current_path / <- you could use
     for (auto &i : std::filesystem::directory_iterator(imgs_path))
     {
         if (i.path().extension() == ".jpg" || i.path().extension() == ".png" || i.path().extension() == ".jpeg")

diff --git a/src/sam_inference.cpp b/src/sam_inference.cpp
@@ -5,7 +5,7 @@
 #include "sam_onnx_ros/utils.hpp"
 
 #define benchmark
-//#define ROI
+// #define ROI
 
 SAM::SAM()
 {
@@ -117,30 +117,29 @@ const char *SAM::CreateSession(SEG::DL_INIT_PARAM &iParams) {
   }
 }
 
-const char *SAM::RunSession(const cv::Mat &iImg,
-                            std::vector<SEG::DL_RESULT> &oResult,
-                            SEG::MODEL_TYPE _modelType, SEG::DL_RESULT &result)
-{
-#ifdef benchmark
-  clock_t starttime_1 = clock();
-#endif // benchmark
-  Utils utilities;
-  const char *Ret = RET_OK;
-  cv::Mat processedImg;
-  utilities.PreProcess(iImg, _imgSize, processedImg);
-  float *blob = new float[processedImg.total() * 3];
-  utilities.BlobFromImage(processedImg, blob);
-  std::vector<int64_t> inputNodeDims;
-  if (_modelType == SEG::SAM_SEGMENT_ENCODER) {
-    // NCHW: H = imgSize[1], W = imgSize[0]
-    inputNodeDims = {1, 3, _imgSize.at(1), _imgSize.at(0)};
-  } else if (_modelType == SEG::SAM_SEGMENT_DECODER) {
-    inputNodeDims = {1, 256, 64, 64};
-  }
-  TensorProcess_(starttime_1, iImg, blob, inputNodeDims, _modelType, oResult,
-                utilities, result);
-
-  return Ret;
+const char* SAM::RunSession(const cv::Mat& iImg, std::vector<SEG::DL_RESULT>& oResult, SEG::MODEL_TYPE modelType, SEG::DL_RESULT& result) {
-const char* SAM::RunSession(const cv::Mat& iImg, std::vector<SEG::DL_RESULT>& oResult, SEG::MODEL_TYPE modelType, SEG::DL_RESULT& result) {
+const char* SAM::RunSession(const cv::Mat& iImg, std::vector<SEG::DL_RESULT>& oResult, SEG::MODEL_TYPE modelType, SEG::DL_RESULT& result)
+{
-const char* SAM::RunSession(const cv::Mat& iImg, std::vector<SEG::DL_RESULT>& oResult, SEG::MODEL_TYPE modelType, SEG::DL_RESULT& result) {
+const char* SAM::RunSession(const cv::Mat& iImg, std::vector<SEG::DL_RESULT>& oResult, SEG::MODEL_TYPE modelType, SEG::DL_RESULT& result)
+{
+    #ifdef benchmark
+        clock_t starttime_1 = clock();
+    #endif
+    Utils utilities;
+    const char* Ret = RET_OK;
+    cv::Mat processedImg;
+    utilities.PreProcess(iImg, _imgSize, processedImg);
+
+    if (modelType < 4) {
+        float* blob = new float[processedImg.total() * 3];
+        utilities.BlobFromImage(processedImg, blob);
+        std::vector<int64_t> inputNodeDims;
+        if (modelType == SEG::SAM_SEGMENT_ENCODER) {
+            // NCHW with H=imgSize[1], W=imgSize[0]  // FIX
+            inputNodeDims = { 1, 3, _imgSize.at(1), _imgSize.at(0) }; // FIX
+        } else if (modelType == SEG::SAM_SEGMENT_DECODER) {
+            inputNodeDims = { 1, 256, 64, 64 };
+        }
+        TensorProcess_(starttime_1, iImg, blob, inputNodeDims, modelType, oResult, utilities, result);
+    }
+    // ...existing code...
+    return Ret;
 }
 
 template <typename N>
@@ -341,21 +340,19 @@ char *SAM::WarmUpSession_(SEG::MODEL_TYPE _modelType)
   cv::Mat iImg = cv::Mat(cv::Size(_imgSize.at(0), _imgSize.at(1)), CV_8UC3);
   cv::Mat processedImg;
   utilities.PreProcess(iImg, _imgSize, processedImg);
-
-  float *blob = new float[iImg.total() * 3];
-  utilities.BlobFromImage(processedImg, blob);
-
-  // NCHW: H = imgSize[1], W = imgSize[0]
-  std::vector<int64_t> SAM_input_node_dims = {1, 3, _imgSize.at(1), _imgSize.at(0)};
-  switch (_modelType) {
-  case SEG::SAM_SEGMENT_ENCODER: {
-    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
-        Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), blob,
-        3 * _imgSize.at(0) * _imgSize.at(1), SAM_input_node_dims.data(),
-        SAM_input_node_dims.size());
-    auto output_tensors =
-        _session->Run(_options, _inputNodeNames.data(), &input_tensor, 1,
-                     _outputNodeNames.data(), _outputNodeNames.size());
+  if (_modelType < 4) {
-  if (_modelType < 4) {
+  if (_modelType < 4)
+  {
-  if (_modelType < 4) {
+  if (_modelType < 4)
+  {
+      float* blob = new float[iImg.total() * 3];
+      utilities.BlobFromImage(processedImg, blob);
+      // NCHW: H=imgSize[1], W=imgSize[0]  // FIX
+      std::vector<int64_t> SAM_input_node_dims = { 1, 3, _imgSize.at(1), _imgSize.at(0) }; // FIX
+      switch (_modelType) {
+      case SEG::SAM_SEGMENT_ENCODER: {
+          Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
+              Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU),
+              blob, 3 * _imgSize.at(0) * _imgSize.at(1),
+              SAM_input_node_dims.data(), SAM_input_node_dims.size());
+          auto output_tensors = _session->Run(_options, _inputNodeNames.data(), &input_tensor, 1,
+                                                _outputNodeNames.data(), _outputNodeNames.size());
     delete[] blob;
     clock_t starttime_4 = clock();
     double post_process_time =
@@ -433,6 +430,6 @@ char *SAM::WarmUpSession_(SEG::MODEL_TYPE _modelType)
     break;
   }
   }
-
+  }
   return RET_OK;
 }
diff --git a/src/utils.cpp b/src/utils.cpp
@@ -1,6 +1,6 @@
 #include "sam_onnx_ros/utils.hpp"
 
-//#define LOGGING
+// #define LOGGING
 
 // Constructor
 Utils::Utils()
@@ -12,29 +12,26 @@ Utils::~Utils()
 {
 }
 
-char *Utils::PreProcess(const cv::Mat &iImg, std::vector<int> iImgSize, cv::Mat &oImg)
+char* Utils::PreProcess(const cv::Mat& iImg, std::vector<int> iImgSize, cv::Mat& oImg)
 {
-    if (iImg.channels() == 3)
-    {
+    if (iImg.channels() == 3) {
-    if (iImg.channels() == 3) {
+    if (iImg.channels() == 3)
+    {
-    if (iImg.channels() == 3) {
+    if (iImg.channels() == 3)
+    {
         oImg = iImg.clone();
         cv::cvtColor(oImg, oImg, cv::COLOR_BGR2RGB);
-    }
-    else
-    {
+    } else {
-    } else {
+    }
+    else
+    {
-    } else {
+    }
+    else
+    {
         cv::cvtColor(iImg, oImg, cv::COLOR_GRAY2RGB);
     }
 
-    if (iImg.cols >= iImg.rows)
-    {
-        _resizeScales = iImg.cols / (float)iImgSize.at(0);
-        cv::resize(oImg, oImg, cv::Size(iImgSize.at(0), int(iImg.rows / _resizeScales)));
+    if (iImg.cols >= iImg.rows) {
+        // Width-dominant: scale by target width (iImgSize[0])
+        _resizeScales = iImg.cols / static_cast<float>(iImgSize.at(0));
+        cv::resize(oImg, oImg, cv::Size(iImgSize.at(0), static_cast<int>(iImg.rows / _resizeScales))); // fixed
+    } else {
-    if (iImg.cols >= iImg.rows) {
-        // Width-dominant: scale by target width (iImgSize[0])
-        _resizeScales = iImg.cols / static_cast<float>(iImgSize.at(0));
-        cv::resize(oImg, oImg, cv::Size(iImgSize.at(0), static_cast<int>(iImg.rows / _resizeScales))); // fixed
-    } else {
+    if (iImg.cols >= iImg.rows)
+    {
+        // Width-dominant: scale by target width (iImgSize[0])
+        _resizeScales = iImg.cols / static_cast<float>(iImgSize.at(0));
+        cv::resize(oImg, oImg, cv::Size(iImgSize.at(0), static_cast<int>(iImg.rows / _resizeScales))); // fixed
+    }
+    else
+    {
-    if (iImg.cols >= iImg.rows) {
-        // Width-dominant: scale by target width (iImgSize[0])
-        _resizeScales = iImg.cols / static_cast<float>(iImgSize.at(0));
-        cv::resize(oImg, oImg, cv::Size(iImgSize.at(0), static_cast<int>(iImg.rows / _resizeScales))); // fixed
-    } else {
+    if (iImg.cols >= iImg.rows)
+    {
+        // Width-dominant: scale by target width (iImgSize[0])
+        _resizeScales = iImg.cols / static_cast<float>(iImgSize.at(0));
+        cv::resize(oImg, oImg, cv::Size(iImgSize.at(0), static_cast<int>(iImg.rows / _resizeScales))); // fixed
+    }
+    else
+    {
+        // Height-dominant: scale by target height (iImgSize[1])
+        _resizeScales = iImg.rows / static_cast<float>(iImgSize.at(1));
+        cv::resize(oImg, oImg, cv::Size(static_cast<int>(iImg.cols / _resizeScales), iImgSize.at(1))); // fixed
     }
-    else
-    {
-        _resizeScales = iImg.rows / (float)iImgSize.at(1);
-        cv::resize(oImg, oImg, cv::Size(int(iImg.cols / _resizeScales), iImgSize.at(1)));
-    }
-    //cv::Mat tempImg = cv::Mat::zeros(iImgSize.at(0), iImgSize.at(1), CV_8UC3);
+
+    // Letterbox top-left into a canvas of size (H=iImgSize[1], W=iImgSize[0])
     cv::Mat tempImg = cv::Mat::zeros(iImgSize.at(1), iImgSize.at(0), CV_8UC3);
     oImg.copyTo(tempImg(cv::Rect(0, 0, oImg.cols, oImg.rows)));
     oImg = tempImg;
@@ -122,48 +119,59 @@ std::vector<Ort::Value> Utils::PrepareInputTensor(Ort::Value &decoderInputTensor
 }
 void Utils::PostProcess(std::vector<Ort::Value> &output_tensors, const cv::Mat &iImg, std::vector<int> imgSize, SEG::DL_RESULT &result)
 {
-    if (output_tensors.size() < 2)
-    {
-        std::cerr << "[SAM]: Decoder returned insufficient outputs." << std::endl;
+    if (output_tensors.empty()) {
-    if (output_tensors.empty()) {
+    if (output_tensors.empty())
+    {
-    if (output_tensors.empty()) {
+    if (output_tensors.empty())
+    {
+        std::cerr << "[SAM]: Decoder returned no outputs." << std::endl;
+        return;
+    }
+
+    // Detect masks (4D) and scores (1D/2D) by shape
+    int masksIdx = -1, scoresIdx = -1;
+    for (int i = 0; i < static_cast<int>(output_tensors.size()); ++i) {
-    for (int i = 0; i < static_cast<int>(output_tensors.size()); ++i) {
+    for (int i = 0; i < static_cast<int>(output_tensors.size()); ++i)
+    {
-    for (int i = 0; i < static_cast<int>(output_tensors.size()); ++i) {
+    for (int i = 0; i < static_cast<int>(output_tensors.size()); ++i)
+    {
+        const auto &val = output_tensors[i];
+        auto shape = val.GetTensorTypeAndShapeInfo().GetShape();
+        if (shape.size() == 4) masksIdx = i;
+        else if (shape.size() <= 2) scoresIdx = i;
+    }
+    if (masksIdx < 0) {
-    if (masksIdx < 0) {
+    if (masksIdx < 0)
+    {
-    if (masksIdx < 0) {
+    if (masksIdx < 0)
+    {
+        std::cerr << "[SAM]: No 4D mask tensor found in decoder outputs." << std::endl;
         return;
     }
 
-    // Assume [scores, masks]; consider shape-based detection later
-    auto scoresTensor = std::move(output_tensors[0]);
-    auto masksTensor  = std::move(output_tensors[1]);
+    auto masksTensor = std::move(output_tensors[masksIdx]);
+    const float* scoresData = nullptr;
+    if (scoresIdx >= 0) {
-    if (scoresIdx >= 0) {
+    if (scoresIdx >= 0)
+    {
-    if (scoresIdx >= 0) {
+    if (scoresIdx >= 0)
+    {
+        scoresData = output_tensors[scoresIdx].GetTensorMutableData<float>();
+    }
 
     auto masksInfo  = masksTensor.GetTensorTypeAndShapeInfo();
     auto masksShape = masksInfo.GetShape();
 
     if (masksShape.size() == 4)
     {
         auto masksData  = masksTensor.GetTensorMutableData<float>();
-        auto scoresData = scoresTensor.GetTensorMutableData<float>();
 
         const size_t numMasks = static_cast<size_t>(masksShape[1]);
         const size_t height   = static_cast<size_t>(masksShape[2]);
         const size_t width    = static_cast<size_t>(masksShape[3]);
 
-        // Pick best mask by score
+        // Pick best mask by score if available
         float bestScore = -1.0f;
         size_t bestMaskIndex = 0;
-        for (size_t i = 0; i < numMasks; ++i)
-        {
-            const float s = scoresData ? scoresData[i] : 0.0f;
-            if (s > bestScore) { bestScore = s; bestMaskIndex = i; }
+        if (scoresData) {
+            for (size_t i = 0; i < numMasks; ++i) {
+                const float s = scoresData[i];
+                if (s > bestScore) { bestScore = s; bestMaskIndex = i; }
+            }
-        if (scoresData) {
-            for (size_t i = 0; i < numMasks; ++i) {
-                const float s = scoresData[i];
-                if (s > bestScore) { bestScore = s; bestMaskIndex = i; }
-            }
+        if (scoresData)
+        {
+            for (size_t i = 0; i < numMasks; ++i)
+            {
+                const float s = scoresData[i];
+                if (s > bestScore)
+                {
+                    bestScore = s;
+                    bestMaskIndex = i;
+                }
+            }
-        if (scoresData) {
-            for (size_t i = 0; i < numMasks; ++i) {
-                const float s = scoresData[i];
-                if (s > bestScore) { bestScore = s; bestMaskIndex = i; }
-            }
+        if (scoresData)
+        {
+            for (size_t i = 0; i < numMasks; ++i)
+            {
+                const float s = scoresData[i];
+                if (s > bestScore)
+                {
+                    bestScore = s;
+                    bestMaskIndex = i;
+                }
+            }
         }
 
-        // Compute preprocessed region (top-left anchored)
+        // Compute preprocessed region (top-left anchored) to undo letterbox
         float scale;
         int processedWidth, processedHeight;
-        if (iImg.cols >= iImg.rows)
-        {
+        if (iImg.cols >= iImg.rows) {
-        if (iImg.cols >= iImg.rows) {
+        if (iImg.cols >= iImg.rows)
+        {
-        if (iImg.cols >= iImg.rows) {
+        if (iImg.cols >= iImg.rows)
+        {
             scale = static_cast<float>(imgSize[0]) / static_cast<float>(iImg.cols);
             processedWidth  = imgSize[0];
             processedHeight = static_cast<int>(iImg.rows * scale);
-        }
-        else
-        {
+        } else {
-        } else {
+        }
+        else
+        {
-        } else {
+        }
+        else
+        {
             scale = static_cast<float>(imgSize[1]) / static_cast<float>(iImg.rows);
             processedWidth  = static_cast<int>(iImg.cols * scale);
             processedHeight = imgSize[1];
@@ -176,20 +184,19 @@ void Utils::PostProcess(std::vector<Ort::Value> &output_tensors, const cv::Mat &
         cv::Mat prob32f(static_cast<int>(height), static_cast<int>(width), CV_32F,
                         const_cast<float*>(masksData + planeOffset));
 
-        // Crop in mask space using proportional dimensions (no hardcoded 256)
+        // Crop padding region in mask space
         const int cropW = clampDim(static_cast<int>(std::round(static_cast<float>(width)  * processedWidth  / static_cast<float>(imgSize[0]))), 1, static_cast<int>(width));
         const int cropH = clampDim(static_cast<int>(std::round(static_cast<float>(height) * processedHeight / static_cast<float>(imgSize[1]))), 1, static_cast<int>(height));
         cv::Mat probCropped = prob32f(cv::Rect(0, 0, cropW, cropH));
 
-        // Resize probabilities to original image (linear)
+        // Resize to original image size and threshold
         cv::Mat probResized;
         cv::resize(probCropped, probResized, cv::Size(iImg.cols, iImg.rows), 0, 0, cv::INTER_LINEAR);
 
-        // Threshold once to binary mask
         cv::Mat finalMask;
         cv::compare(probResized, 0.5f, finalMask, cv::CMP_GT); // CV_8U 0/255
 
-        // Morphological cleanup (light, then ensure binary)
+        // Optional cleanup
         int kernelSize = std::max(5, std::min(iImg.cols, iImg.rows) / 100);
         cv::Mat kernel = cv::getStructuringElement(cv::MORPH_ELLIPSE, cv::Size(kernelSize, kernelSize));
         cv::morphologyEx(finalMask, finalMask, cv::MORPH_CLOSE, kernel);