dataplayer12 · Shattered217 · Jan 10, 2026 · Jan 10, 2026 · Jan 14, 2026 · Copilot
diff --git a/cpp/include/sam3.cuh b/cpp/include/sam3.cuh
@@ -3,11 +3,20 @@
 #include "sam3.hpp"
 #include <filesystem>
 #include <fstream>
+#include <memory>
 #include "cuda_runtime.h"
 #include "NvInfer.h"
 #include "NvInferRuntime.h"
 #include "prepost.cuh"
 
+// Deleter functor for smart pointers whose buffer must be released with
+// cudaFreeHost. A null pointer is ignored.
+struct PinnedMemoryDeleter {
+    void operator()(void* ptr) const {
+        if (ptr == nullptr) {
+            return;
+        }
+        cudaFreeHost(ptr);
+    }
+};
+
 
 #define MAX_DIMS 8
 
@@ -43,6 +52,8 @@ public:
     bool infer_on_image(const cv::Mat& input, cv::Mat& result, SAM3_VISUALIZATION vis_type);
     bool run_blind_inference();
     void pin_opencv_matrices(cv::Mat& input_mat, cv::Mat& result_mat);
+    std::pair<cv::Mat, std::shared_ptr<void>> allocate_pinned_mat(int rows, int cols, int type);
+    void setup_pinned_matrices(cv::Mat& input_mat, cv::Mat& result_mat);
     std::vector<void*> output_cpu;
 
 private:

diff --git a/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp b/cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp
@@ -2,8 +2,52 @@
 #include "sam3.cuh"
 #include <chrono>
 #include <thread>
+#include <memory>
 #include <opencv2/imgproc.hpp>
 
+// Writes into `output` a version of `input` whose width and height are both even.
+// An odd dimension is increased by one pixel and the whole image is resampled to
+// the new size with cv::INTER_LINEAR; when both dimensions are already even,
+// `input` is simply assigned to `output`.
+void ensure_even_dimensions(const cv::Mat& input, cv::Mat& output)
+{
+    // Bump each odd dimension up by one; even dimensions are kept as they are.
+    const int even_cols = (input.cols % 2 != 0) ? input.cols + 1 : input.cols;
+    const int even_rows = (input.rows % 2 != 0) ? input.rows + 1 : input.rows;
+
+    const bool already_even = (even_cols == input.cols) && (even_rows == input.rows);
+    if (already_even)
+    {
+        output = input;
+        return;
+    }
+
+    cv::resize(input, output, cv::Size(even_cols, even_rows), 0, 0, cv::INTER_LINEAR);
+}
+
+// Reads the image at `imgpath` with cv::IMREAD_COLOR and returns it after
+// passing it through ensure_even_dimensions.
+// Throws std::runtime_error when cv::imread yields an empty matrix.
+cv::Mat read_and_ensure_even(const std::string imgpath)
+{
+    const cv::Mat loaded = cv::imread(imgpath, cv::IMREAD_COLOR);
+    if (loaded.empty())
+    {
+        throw std::runtime_error("Failed to read image: " + imgpath);
+    }
+
+    cv::Mat even_sized;
+    ensure_even_dimensions(loaded, even_sized);
+    return even_sized;
+}
+
 void read_image_into_buffer(const std::string imgpath, char* raw_buffer, cv::Mat& buffer)
 {
     size_t file_size = std::filesystem::file_size(imgpath);
@@ -90,36 +134,47 @@ int main(int argc, char* argv[])
 
     const float vis_alpha = 0.3;
     const float probability_threshold = 0.5;
-    const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_INSTANCE_SEGMENTATION;
+    const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_SEMANTIC_SEGMENTATION;
-    const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_SEMANTIC_SEGMENTATION;
+    const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_INSTANCE_SEGMENTATION;
-    const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_SEMANTIC_SEGMENTATION;
+    const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_INSTANCE_SEGMENTATION;
 
     SAM3_PCS pcs(epath, vis_alpha, probability_threshold);
 
-    cv::Mat img, result;
-    char* raw_bytes;
-
     std::filesystem::create_directories("results");
     int num_images_read=0;
+
+    cv::Mat pinned_img, pinned_result;
+    std::shared_ptr<void> img_mem_holder, result_mem_holder;
+    int last_rows = 0, last_cols = 0;
 
     for (const auto& fname : std::filesystem::directory_iterator(in_dir))
     {
         if (std::filesystem::is_regular_file(fname.path())) 
         {
             std::filesystem::path outfile = std::filesystem::path("results") / fname.path().filename();
 
-            if (num_images_read==0)
+            try
             {
-                cv::Mat tmp = cv::imread(fname.path(), cv::IMREAD_COLOR);
-                raw_bytes = (char *)malloc(tmp.total()*tmp.elemSize());
-                read_image_into_buffer(fname.path(), raw_bytes, img);
-                result = cv::imread(fname.path(), cv::IMREAD_COLOR);
-                pcs.pin_opencv_matrices(img, result);
-            }
-            else
-            {
-                read_image_into_buffer(fname.path(), raw_bytes, img);
+                cv::Mat img_loaded = read_and_ensure_even(fname.path());
+
+                if (img_loaded.rows != last_rows || img_loaded.cols != last_cols || pinned_img.empty())
+                {
+                    auto [img_mat, img_holder] = pcs.allocate_pinned_mat(img_loaded.rows, img_loaded.cols, img_loaded.type());
+                    auto [result_mat, result_holder] = pcs.allocate_pinned_mat(img_loaded.rows, img_loaded.cols, img_loaded.type());
+
+                    pinned_img = img_mat;
+                    pinned_result = result_mat;
+                    img_mem_holder = img_holder;
+                    result_mem_holder = result_holder;
+
+                    last_rows = img_loaded.rows;
+                    last_cols = img_loaded.cols;
+
+                    pcs.setup_pinned_matrices(pinned_img, pinned_result);
             }
+
+                img_loaded.copyTo(pinned_img);
+
             start = std::chrono::system_clock::now();
-            infer_one_image(pcs, img, result, visualize, outfile, benchmark);
+                infer_one_image(pcs, pinned_img, pinned_result, visualize, outfile, benchmark);
             num_images_read++;
             end = std::chrono::system_clock::now();
             diff = end - start;
@@ -129,6 +184,12 @@ int main(int argc, char* argv[])
             {
                 float msec_per_image = millis_elapsed/num_images_read;
                 printf("Processed %d images at %f msec/image\n", num_images_read, msec_per_image);
+                }
+            }
+            catch (const std::exception& e)
+            {
+                std::cout << "Error processing " << fname.path() << ": " << e.what() << std::endl;
+                continue;
             }
         }
     }

diff --git a/cpp/src/sam3/sam3_trt/sam3.cu b/cpp/src/sam3/sam3_trt/sam3.cu
@@ -4,6 +4,11 @@ SAM3_PCS::SAM3_PCS(const std::string engine_path, const float vis_alpha, const f
     : _engine_path(engine_path)
     , _overlay_alpha(vis_alpha)
     , _probability_threshold(prob_threshold)
+    , opencv_input(nullptr)
+    , gpu_result(nullptr)
+    , zc_input(nullptr)
+    , gpu_colpal(nullptr)
+    , opencv_inbytes(0)
 {
 
     cuda_check(cudaStreamCreate(&sam3_stream), "creating CUDA stream for SAM3");
@@ -21,6 +26,47 @@ SAM3_PCS::SAM3_PCS(const std::string engine_path, const float vis_alpha, const f
     bsize.y=16;
 }
 
+// Allocates a rows x cols matrix of the given OpenCV type whose pixel storage
+// is obtained from cudaMallocHost.
+//
+// Returns the cv::Mat header together with a shared_ptr that owns the storage.
+// The Mat is constructed over caller-provided memory, so the holder must be kept
+// alive for as long as any Mat header refers to the buffer; the buffer is
+// released with cudaFreeHost when the last copy of the holder goes away.
+std::pair<cv::Mat, std::shared_ptr<void>> SAM3_PCS::allocate_pinned_mat(int rows, int cols, int type)
+{
+    // Widen before multiplying: rows * cols * elem_size evaluated in int can
+    // overflow for large images before it is converted to size_t.
+    const size_t bytes = static_cast<size_t>(rows)
+                       * static_cast<size_t>(cols)
+                       * static_cast<size_t>(CV_ELEM_SIZE(type));
+    void* ptr = nullptr;
+
+    cuda_check(cudaMallocHost(&ptr, bytes), " allocating pinned memory for Mat");
+
+    // Hand ownership to the holder immediately so the buffer is released even if
+    // constructing the Mat header below throws. PinnedMemoryDeleter is the
+    // deleter declared in sam3.cuh for exactly this purpose.
+    std::shared_ptr<void> mem_holder(ptr, PinnedMemoryDeleter{});
+
+    cv::Mat mat(rows, cols, type, ptr);
+
+    return std::make_pair(mat, mem_holder);
+}
+
+// Prepares the object to run inference on the given pair of host matrices.
+//
+// When is_zerocopy is set: stores in zc_input and gpu_result the device pointers
+// that cudaHostGetDevicePointer reports for input_mat.data and result_mat.data.
+// No device memory is allocated on this path.
+//
+// Otherwise: allocates fresh device buffers for opencv_input and gpu_result, each
+// sized to hold input_mat, releases the previous ones and zero-fills the new
+// ones. result_mat is not read on this path.
+//
+// opencv_inbytes is only updated together with the pointers it describes, so a
+// failed allocation does not leave the recorded size out of step with the
+// buffers.
+// NOTE(review): this assumes cuda_check stops normal control flow (throws or
+// aborts) on a failing status - confirm against its definition.
+void SAM3_PCS::setup_pinned_matrices(cv::Mat& input_mat, cv::Mat& result_mat)
+{
+    const size_t new_inbytes = input_mat.total() * input_mat.elemSize();
+
+    if (is_zerocopy)
+    {
+        cuda_check(cudaHostGetDevicePointer(&zc_input, input_mat.data, 0),
+            " getting GPU pointer for pinned input Mat");
+
+        cuda_check(cudaHostGetDevicePointer(&gpu_result, result_mat.data, 0),
+            " getting GPU pointer for pinned result Mat");
+
+        opencv_inbytes = new_inbytes;
+        return;
+    }
+
+    // Allocate into temporaries first so the members keep pointing at the old
+    // buffers until both new allocations are known to have succeeded.
+    void* new_opencv_input = nullptr;
+    void* new_gpu_result = nullptr;
+
+    cuda_check(cudaMalloc(&new_opencv_input, new_inbytes), " allocating opencv input memory on a dGPU system");
+
+    const cudaError_t result_status = cudaMalloc(&new_gpu_result, new_inbytes);
+    if (result_status != cudaSuccess)
+    {
+        // Do not leak the first new buffer when the second allocation fails.
+        cudaFree(new_opencv_input);
+        new_opencv_input = nullptr;
+    }
+    cuda_check(result_status, " allocating result memory on a dGPU system");
+
+    // Publish the new buffers and their size in one step, remembering the old
+    // pointers so they can be released afterwards.
+    void* old_opencv_input = opencv_input;
+    void* old_gpu_result = (void*)gpu_result;
+
+    opencv_input = static_cast<decltype(opencv_input)>(new_opencv_input);
+    gpu_result = static_cast<decltype(gpu_result)>(new_gpu_result);
+    opencv_inbytes = new_inbytes;
+
+    // Release the previous buffers, if there were any.
+    if (old_opencv_input != nullptr)
+    {
+        cuda_check(cudaFree(old_opencv_input), " freeing opencv input memory on a dGPU system");
+    }
+    if (old_gpu_result != nullptr)
+    {
+        cuda_check(cudaFree(old_gpu_result), " freeing result memory on a dGPU system");
+    }
+
+    // Start from zeroed device buffers.
+    cuda_check(cudaMemset(opencv_input, 0, opencv_inbytes), " zeroing opencv input memory on a dGPU system");
+    cuda_check(cudaMemset((void *)gpu_result, 0, opencv_inbytes), " zeroing result memory on a dGPU system");
+}
+
 void SAM3_PCS::pin_opencv_matrices(cv::Mat& input_mat, cv::Mat& result_mat)
 {
     opencv_inbytes = input_mat.total() * input_mat.elemSize();
@@ -101,9 +147,10 @@ void SAM3_PCS::visualize_on_dGPU(const cv::Mat& input, cv::Mat& result, SAM3_VIS
         igsize.y = (input.rows + THREAD_COARSENING_FACTOR*ibsize.y - 1) / (THREAD_COARSENING_FACTOR*ibsize.y);
         // 2D grid
 
+        size_t input_bytes = input.total() * input.elemSize();
         cuda_check(cudaMemcpyAsync((void *)gpu_result, 
             (void *)input_ptr, 
-            opencv_inbytes, 
+            input_bytes, 
             cudaMemcpyDeviceToDevice, 
             sam3_stream), " async memcpy for result during instance seg visualization");
 
@@ -127,31 +174,47 @@ void SAM3_PCS::visualize_on_dGPU(const cv::Mat& input, cv::Mat& result, SAM3_VIS
 
     if (!is_zerocopy && vis_type == SAM3_VISUALIZATION::VIS_NONE)
     {
-        cudaMemcpyAsync(output_cpu[0], output_gpu[0],output_sizes[0], cudaMemcpyDeviceToHost, sam3_stream);
-        cudaMemcpyAsync(output_cpu[1], output_gpu[1],output_sizes[1], cudaMemcpyDeviceToHost, sam3_stream);
+        cudaMemcpyAsync(output_cpu[0], output_gpu[0], output_sizes[0], cudaMemcpyDeviceToHost, sam3_stream);
+        cudaMemcpyAsync(output_cpu[1], output_gpu[1], output_sizes[1], cudaMemcpyDeviceToHost, sam3_stream);
     }
     else if (!is_zerocopy)
     {
-        cudaMemcpyAsync(
-            (void*)result.data, 
-            (void*)gpu_result, 
-            opencv_inbytes, 
-            cudaMemcpyDeviceToHost, 
-            sam3_stream);
+        size_t result_bytes = result.total() * result.elemSize();
+        cudaMemcpyAsync((void*)result.data, (void*)gpu_result, result_bytes, cudaMemcpyDeviceToHost, sam3_stream);
     }
-
-    // if is_zerocopy, there is no need to do any synchronization/copy
-    // to make the result visible to the CPU
 }
 
 bool SAM3_PCS::infer_on_dGPU(const cv::Mat& input, cv::Mat& result, SAM3_VISUALIZATION vis_type)
 {
+    if (input.cols % 2 != 0 || input.rows % 2 != 0)
+    {
+        std::stringstream err;
+        err << "Error: Input image dimensions must be even. Current size: " 
+            << input.cols << "x" << input.rows 
+            << ". Please resize the image to even dimensions before inference.";
+        throw std::runtime_error(err.str());
+    }
+
+    size_t current_inbytes = input.total() * input.elemSize();
+
+    if (current_inbytes > opencv_inbytes)
+    {
+        if (opencv_input != nullptr)
+        {
+            cudaFree(opencv_input);
+            cudaFree((void*)gpu_result);
-            cudaFree(opencv_input);
-            cudaFree((void*)gpu_result);
+            cuda_check(cudaFree(opencv_input), " freeing opencv input memory");
+            cuda_check(cudaFree((void*)gpu_result), " freeing result memory");
-            cudaFree(opencv_input);
-            cudaFree((void*)gpu_result);
+            cuda_check(cudaFree(opencv_input), " freeing opencv input memory");
+            cuda_check(cudaFree((void*)gpu_result), " freeing result memory");
+        }
+        opencv_inbytes = current_inbytes;
+        cuda_check(cudaMalloc(&opencv_input, opencv_inbytes), " reallocating opencv input memory");
+        cuda_check(cudaMalloc((void**)&gpu_result, opencv_inbytes), " reallocating result memory");
-        if (opencv_input != nullptr)
-        {
-            cudaFree(opencv_input);
-            cudaFree((void*)gpu_result);
-        }
-        opencv_inbytes = current_inbytes;
-        cuda_check(cudaMalloc(&opencv_input, opencv_inbytes), " reallocating opencv input memory");
-        cuda_check(cudaMalloc((void**)&gpu_result, opencv_inbytes), " reallocating result memory");
+        // Allocate new buffers first, using temporaries to keep state consistent
+        void* new_opencv_input = nullptr;
+        void* new_gpu_result   = nullptr;
+
+        cuda_check(cudaMalloc(&new_opencv_input, current_inbytes), " reallocating opencv input memory");
+        cuda_check(cudaMalloc(&new_gpu_result,   current_inbytes), " reallocating result memory");
+
+        // Free old buffers after successful allocations
+        if (opencv_input != nullptr)
+        {
+            cudaFree(opencv_input);
+            opencv_input = nullptr;
+        }
+        if (gpu_result != nullptr)
+        {
+            cudaFree((void*)gpu_result);
+            gpu_result = nullptr;
+        }
+
+        // Commit new state atomically
+        opencv_input  = new_opencv_input;
+        gpu_result    = static_cast<decltype(gpu_result)>(new_gpu_result);
+        opencv_inbytes = current_inbytes;
-        if (opencv_input != nullptr)
-        {
-            cudaFree(opencv_input);
-            cudaFree((void*)gpu_result);
-        }
-        opencv_inbytes = current_inbytes;
-        cuda_check(cudaMalloc(&opencv_input, opencv_inbytes), " reallocating opencv input memory");
-        cuda_check(cudaMalloc((void**)&gpu_result, opencv_inbytes), " reallocating result memory");
+        // Allocate new buffers first, using temporaries to keep state consistent
+        void* new_opencv_input = nullptr;
+        void* new_gpu_result   = nullptr;
+
+        cuda_check(cudaMalloc(&new_opencv_input, current_inbytes), " reallocating opencv input memory");
+        cuda_check(cudaMalloc(&new_gpu_result,   current_inbytes), " reallocating result memory");
+
+        // Free old buffers after successful allocations
+        if (opencv_input != nullptr)
+        {
+            cudaFree(opencv_input);
+            opencv_input = nullptr;
+        }
+        if (gpu_result != nullptr)
+        {
+            cudaFree((void*)gpu_result);
+            gpu_result = nullptr;
+        }
+
+        // Commit new state atomically
+        opencv_input  = new_opencv_input;
+        gpu_result    = static_cast<decltype(gpu_result)>(new_gpu_result);
+        opencv_inbytes = current_inbytes;
+    }
+
     gsize.x = (in_width + bsize.x - 1) / (THREAD_COARSENING_FACTOR*bsize.x);
     gsize.y = (in_height + bsize.y - 1) / (THREAD_COARSENING_FACTOR*bsize.y);
 
     cuda_check(
         cudaMemcpyAsync(
-            opencv_input, input.data, opencv_inbytes, cudaMemcpyHostToDevice, sam3_stream)
+            opencv_input, input.data, current_inbytes, cudaMemcpyHostToDevice, sam3_stream)
         , " async memcpy of opencv image");
 
     pre_process_sam3<<<gsize, bsize, 0, sam3_stream>>>(
@@ -172,6 +235,15 @@ bool SAM3_PCS::infer_on_dGPU(const cv::Mat& input, cv::Mat& result, SAM3_VISUALI
 
 bool SAM3_PCS::infer_on_iGPU(const cv::Mat& input, cv::Mat& result, SAM3_VISUALIZATION vis_type)
 {
+    if (input.cols % 2 != 0 || input.rows % 2 != 0)
+    {
+        std::stringstream err;
+        err << "Error: Input image dimensions must be even. Current size: " 
+            << input.cols << "x" << input.rows 
+            << ". Please resize the image to even dimensions before inference.";
+        throw std::runtime_error(err.str());
+    }
+
     gsize.x = (in_width + bsize.x - 1) / (THREAD_COARSENING_FACTOR*bsize.x);
     gsize.y = (in_height + bsize.y - 1) / (THREAD_COARSENING_FACTOR*bsize.y);