Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions cpp/include/sam3.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,20 @@
#include "sam3.hpp"
#include <filesystem>
#include <fstream>
#include <memory>
#include "cuda_runtime.h"
#include "NvInfer.h"
#include "NvInferRuntime.h"
#include "prepost.cuh"

struct PinnedMemoryDeleter {
void operator()(void* ptr) const {
if (ptr) {
cudaFreeHost(ptr);
}
}
};
Comment on lines +12 to +18
Copy link

Copilot AI Jan 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PinnedMemoryDeleter struct is defined but never used in the codebase. The allocate_pinned_mat function uses an inline lambda deleter instead. Consider either using this struct in allocate_pinned_mat or removing it to reduce code clutter.

Copilot uses AI. Check for mistakes.


#define MAX_DIMS 8

Expand Down Expand Up @@ -43,6 +52,8 @@ public:
bool infer_on_image(const cv::Mat& input, cv::Mat& result, SAM3_VISUALIZATION vis_type);
bool run_blind_inference();
void pin_opencv_matrices(cv::Mat& input_mat, cv::Mat& result_mat);
std::pair<cv::Mat, std::shared_ptr<void>> allocate_pinned_mat(int rows, int cols, int type);
void setup_pinned_matrices(cv::Mat& input_mat, cv::Mat& result_mat);
std::vector<void*> output_cpu;

private:
Expand Down
91 changes: 76 additions & 15 deletions cpp/src/sam3/sam3_apps/sam3_pcs_app.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,52 @@
#include "sam3.cuh"
#include <chrono>
#include <thread>
#include <memory>
#include <opencv2/imgproc.hpp>

void ensure_even_dimensions(const cv::Mat& input, cv::Mat& output)
{
int new_width = input.cols;
int new_height = input.rows;
bool needs_resize = false;

if (input.cols % 2 != 0)
{
new_width = input.cols + 1;
needs_resize = true;
}

if (input.rows % 2 != 0)
{
new_height = input.rows + 1;
needs_resize = true;
}

if (needs_resize)
{
cv::resize(input, output, cv::Size(new_width, new_height), 0, 0, cv::INTER_LINEAR);
}
else
{
output = input;
}
}

cv::Mat read_and_ensure_even(const std::string imgpath)
{
cv::Mat img_original = cv::imread(imgpath, cv::IMREAD_COLOR);
if (img_original.empty())
{
std::stringstream err;
err << "Failed to read image: " << imgpath;
throw std::runtime_error(err.str());
}

cv::Mat img;
ensure_even_dimensions(img_original, img);
return img;
}

void read_image_into_buffer(const std::string imgpath, char* raw_buffer, cv::Mat& buffer)
{
size_t file_size = std::filesystem::file_size(imgpath);
Expand Down Expand Up @@ -90,36 +134,47 @@ int main(int argc, char* argv[])

const float vis_alpha = 0.3;
const float probability_threshold = 0.5;
const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_INSTANCE_SEGMENTATION;
const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_SEMANTIC_SEGMENTATION;
Copy link

Copilot AI Jan 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The visualization type was changed from VIS_INSTANCE_SEGMENTATION to VIS_SEMANTIC_SEGMENTATION. If this change is intentional for the purposes of this PR, consider documenting it in the PR description. Otherwise, this might be a leftover from testing that should be reverted.

Suggested change
const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_SEMANTIC_SEGMENTATION;
const SAM3_VISUALIZATION visualize = SAM3_VISUALIZATION::VIS_INSTANCE_SEGMENTATION;

Copilot uses AI. Check for mistakes.

SAM3_PCS pcs(epath, vis_alpha, probability_threshold);

cv::Mat img, result;
char* raw_bytes;

std::filesystem::create_directories("results");
int num_images_read=0;

cv::Mat pinned_img, pinned_result;
std::shared_ptr<void> img_mem_holder, result_mem_holder;
int last_rows = 0, last_cols = 0;

for (const auto& fname : std::filesystem::directory_iterator(in_dir))
{
if (std::filesystem::is_regular_file(fname.path()))
{
std::filesystem::path outfile = std::filesystem::path("results") / fname.path().filename();

if (num_images_read==0)
try
{
cv::Mat tmp = cv::imread(fname.path(), cv::IMREAD_COLOR);
raw_bytes = (char *)malloc(tmp.total()*tmp.elemSize());
read_image_into_buffer(fname.path(), raw_bytes, img);
result = cv::imread(fname.path(), cv::IMREAD_COLOR);
pcs.pin_opencv_matrices(img, result);
}
else
{
read_image_into_buffer(fname.path(), raw_bytes, img);
cv::Mat img_loaded = read_and_ensure_even(fname.path());

if (img_loaded.rows != last_rows || img_loaded.cols != last_cols || pinned_img.empty())
{
auto [img_mat, img_holder] = pcs.allocate_pinned_mat(img_loaded.rows, img_loaded.cols, img_loaded.type());
auto [result_mat, result_holder] = pcs.allocate_pinned_mat(img_loaded.rows, img_loaded.cols, img_loaded.type());

pinned_img = img_mat;
pinned_result = result_mat;
img_mem_holder = img_holder;
result_mem_holder = result_holder;

last_rows = img_loaded.rows;
last_cols = img_loaded.cols;

pcs.setup_pinned_matrices(pinned_img, pinned_result);
}

img_loaded.copyTo(pinned_img);

start = std::chrono::system_clock::now();
infer_one_image(pcs, img, result, visualize, outfile, benchmark);
infer_one_image(pcs, pinned_img, pinned_result, visualize, outfile, benchmark);
num_images_read++;
end = std::chrono::system_clock::now();
diff = end - start;
Expand All @@ -129,6 +184,12 @@ int main(int argc, char* argv[])
{
float msec_per_image = millis_elapsed/num_images_read;
printf("Processed %d images at %f msec/image\n", num_images_read, msec_per_image);
}
}
catch (const std::exception& e)
{
std::cout << "Error processing " << fname.path() << ": " << e.what() << std::endl;
continue;
}
}
}
Expand Down
98 changes: 85 additions & 13 deletions cpp/src/sam3/sam3_trt/sam3.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ SAM3_PCS::SAM3_PCS(const std::string engine_path, const float vis_alpha, const f
: _engine_path(engine_path)
, _overlay_alpha(vis_alpha)
, _probability_threshold(prob_threshold)
, opencv_input(nullptr)
, gpu_result(nullptr)
, zc_input(nullptr)
, gpu_colpal(nullptr)
, opencv_inbytes(0)
{

cuda_check(cudaStreamCreate(&sam3_stream), "creating CUDA stream for SAM3");
Expand All @@ -21,6 +26,47 @@ SAM3_PCS::SAM3_PCS(const std::string engine_path, const float vis_alpha, const f
bsize.y=16;
}

std::pair<cv::Mat, std::shared_ptr<void>> SAM3_PCS::allocate_pinned_mat(int rows, int cols, int type)
{
size_t bytes = rows * cols * CV_ELEM_SIZE(type);
void* ptr = nullptr;

cuda_check(cudaMallocHost(&ptr, bytes), " allocating pinned memory for Mat");

auto deleter = [](void* p) { if (p) cudaFreeHost(p); };
std::shared_ptr<void> mem_holder(ptr, deleter);

cv::Mat mat(rows, cols, type, ptr);

return std::make_pair(mat, mem_holder);
}

void SAM3_PCS::setup_pinned_matrices(cv::Mat& input_mat, cv::Mat& result_mat)
{
opencv_inbytes = input_mat.total() * input_mat.elemSize();

if (is_zerocopy)
{
cuda_check(cudaHostGetDevicePointer(&zc_input, input_mat.data, 0),
" getting GPU pointer for pinned input Mat");

cuda_check(cudaHostGetDevicePointer(&gpu_result, result_mat.data, 0),
" getting GPU pointer for pinned result Mat");
}
else
{
if (opencv_input != nullptr)
{
cudaFree(opencv_input);
cudaFree((void*)gpu_result);
Comment on lines +60 to +61
Copy link

Copilot AI Jan 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cudaFree calls should be wrapped with error checking. While cudaFree typically doesn't fail for valid pointers, it's a best practice to check for errors consistently, especially in cleanup code where errors could indicate corrupted state.

Suggested change
cudaFree(opencv_input);
cudaFree((void*)gpu_result);
cuda_check(cudaFree(opencv_input), " freeing opencv input memory on a dGPU system");
cuda_check(cudaFree((void*)gpu_result), " freeing result memory on a dGPU system");

Copilot uses AI. Check for mistakes.
}
cuda_check(cudaMalloc(&opencv_input, opencv_inbytes), " allocating opencv input memory on a dGPU system");
cuda_check(cudaMalloc((void**)&gpu_result, opencv_inbytes), " allocating result memory on a dGPU system");
Comment on lines +58 to +64
Copy link

Copilot AI Jan 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a potential memory leak and state inconsistency issue in the reallocation logic. If cudaMalloc for opencv_input succeeds (line 66) but cudaMalloc for gpu_result fails (line 67), the pointers will be in an inconsistent state. The old memory is already freed, opencv_inbytes is updated, but gpu_result is nullptr while opencv_input points to newly allocated memory. Consider setting pointers to nullptr after freeing, or using a temporary variable pattern to ensure atomicity of the reallocation.

Suggested change
if (opencv_input != nullptr)
{
cudaFree(opencv_input);
cudaFree((void*)gpu_result);
}
cuda_check(cudaMalloc(&opencv_input, opencv_inbytes), " allocating opencv input memory on a dGPU system");
cuda_check(cudaMalloc((void**)&gpu_result, opencv_inbytes), " allocating result memory on a dGPU system");
// Use temporary pointers so reallocation is atomic and leaves the object in a consistent state
void* new_opencv_input = nullptr;
void* new_gpu_result = nullptr;
// Allocate new buffers first; cuda_check will handle any allocation errors
cuda_check(cudaMalloc(&new_opencv_input, opencv_inbytes), " allocating opencv input memory on a dGPU system");
cuda_check(cudaMalloc(&new_gpu_result, opencv_inbytes), " allocating result memory on a dGPU system");
// Now that both allocations have succeeded, free any old buffers
if (opencv_input != nullptr)
{
cudaFree(opencv_input);
cudaFree((void*)gpu_result);
}
// Update member pointers to point to the newly allocated buffers
opencv_input = new_opencv_input;
gpu_result = new_gpu_result;
// Initialize the buffers

Copilot uses AI. Check for mistakes.
cudaMemset(opencv_input, 0, opencv_inbytes);
cudaMemset((void *)gpu_result, 0, opencv_inbytes);
Comment on lines +65 to +66
Copy link

Copilot AI Jan 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cudaMemset calls should be wrapped with cuda_check for consistent error handling. If these operations fail, it could lead to undefined behavior when the uninitialized memory is used.

Suggested change
cudaMemset(opencv_input, 0, opencv_inbytes);
cudaMemset((void *)gpu_result, 0, opencv_inbytes);
cuda_check(cudaMemset(opencv_input, 0, opencv_inbytes), " zeroing opencv input memory on a dGPU system");
cuda_check(cudaMemset((void *)gpu_result, 0, opencv_inbytes), " zeroing result memory on a dGPU system");

Copilot uses AI. Check for mistakes.
}
}

void SAM3_PCS::pin_opencv_matrices(cv::Mat& input_mat, cv::Mat& result_mat)
{
opencv_inbytes = input_mat.total() * input_mat.elemSize();
Expand Down Expand Up @@ -101,9 +147,10 @@ void SAM3_PCS::visualize_on_dGPU(const cv::Mat& input, cv::Mat& result, SAM3_VIS
igsize.y = (input.rows + THREAD_COARSENING_FACTOR*ibsize.y - 1) / (THREAD_COARSENING_FACTOR*ibsize.y);
// 2D grid

size_t input_bytes = input.total() * input.elemSize();
cuda_check(cudaMemcpyAsync((void *)gpu_result,
(void *)input_ptr,
opencv_inbytes,
input_bytes,
cudaMemcpyDeviceToDevice,
sam3_stream), " async memcpy for result during instance seg visualization");

Expand All @@ -127,31 +174,47 @@ void SAM3_PCS::visualize_on_dGPU(const cv::Mat& input, cv::Mat& result, SAM3_VIS

if (!is_zerocopy && vis_type == SAM3_VISUALIZATION::VIS_NONE)
{
cudaMemcpyAsync(output_cpu[0], output_gpu[0],output_sizes[0], cudaMemcpyDeviceToHost, sam3_stream);
cudaMemcpyAsync(output_cpu[1], output_gpu[1],output_sizes[1], cudaMemcpyDeviceToHost, sam3_stream);
cudaMemcpyAsync(output_cpu[0], output_gpu[0], output_sizes[0], cudaMemcpyDeviceToHost, sam3_stream);
cudaMemcpyAsync(output_cpu[1], output_gpu[1], output_sizes[1], cudaMemcpyDeviceToHost, sam3_stream);
}
else if (!is_zerocopy)
{
cudaMemcpyAsync(
(void*)result.data,
(void*)gpu_result,
opencv_inbytes,
cudaMemcpyDeviceToHost,
sam3_stream);
size_t result_bytes = result.total() * result.elemSize();
cudaMemcpyAsync((void*)result.data, (void*)gpu_result, result_bytes, cudaMemcpyDeviceToHost, sam3_stream);
}

// if is_zerocopy, there is no need to do any synchronization/copy
// to make the result visible to the CPU
}

bool SAM3_PCS::infer_on_dGPU(const cv::Mat& input, cv::Mat& result, SAM3_VISUALIZATION vis_type)
{
if (input.cols % 2 != 0 || input.rows % 2 != 0)
{
std::stringstream err;
err << "Error: Input image dimensions must be even. Current size: "
<< input.cols << "x" << input.rows
<< ". Please resize the image to even dimensions before inference.";
throw std::runtime_error(err.str());
}

size_t current_inbytes = input.total() * input.elemSize();

if (current_inbytes > opencv_inbytes)
{
if (opencv_input != nullptr)
{
cudaFree(opencv_input);
cudaFree((void*)gpu_result);
Comment on lines +204 to +205
Copy link

Copilot AI Jan 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cudaFree calls should be wrapped with cuda_check for consistent error handling. While cudaFree typically doesn't fail for valid pointers, it's a best practice to check for errors consistently.

Suggested change
cudaFree(opencv_input);
cudaFree((void*)gpu_result);
cuda_check(cudaFree(opencv_input), " freeing opencv input memory");
cuda_check(cudaFree((void*)gpu_result), " freeing result memory");

Copilot uses AI. Check for mistakes.
}
opencv_inbytes = current_inbytes;
cuda_check(cudaMalloc(&opencv_input, opencv_inbytes), " reallocating opencv input memory");
cuda_check(cudaMalloc((void**)&gpu_result, opencv_inbytes), " reallocating result memory");
Comment on lines +202 to +209
Copy link

Copilot AI Jan 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a potential memory leak and state inconsistency issue in the reallocation logic. If cudaMalloc for opencv_input succeeds (line 219) but cudaMalloc for gpu_result fails (line 220), the pointers will be in an inconsistent state. The old memory is already freed, opencv_inbytes is updated, but gpu_result is nullptr while opencv_input points to newly allocated memory. Consider setting pointers to nullptr after freeing, or using a temporary variable pattern to ensure atomicity of the reallocation.

Suggested change
if (opencv_input != nullptr)
{
cudaFree(opencv_input);
cudaFree((void*)gpu_result);
}
opencv_inbytes = current_inbytes;
cuda_check(cudaMalloc(&opencv_input, opencv_inbytes), " reallocating opencv input memory");
cuda_check(cudaMalloc((void**)&gpu_result, opencv_inbytes), " reallocating result memory");
// Allocate new buffers first, using temporaries to keep state consistent
void* new_opencv_input = nullptr;
void* new_gpu_result = nullptr;
cuda_check(cudaMalloc(&new_opencv_input, current_inbytes), " reallocating opencv input memory");
cuda_check(cudaMalloc(&new_gpu_result, current_inbytes), " reallocating result memory");
// Free old buffers after successful allocations
if (opencv_input != nullptr)
{
cudaFree(opencv_input);
opencv_input = nullptr;
}
if (gpu_result != nullptr)
{
cudaFree((void*)gpu_result);
gpu_result = nullptr;
}
// Commit new state atomically
opencv_input = new_opencv_input;
gpu_result = static_cast<decltype(gpu_result)>(new_gpu_result);
opencv_inbytes = current_inbytes;

Copilot uses AI. Check for mistakes.
}

gsize.x = (in_width + bsize.x - 1) / (THREAD_COARSENING_FACTOR*bsize.x);
gsize.y = (in_height + bsize.y - 1) / (THREAD_COARSENING_FACTOR*bsize.y);

cuda_check(
cudaMemcpyAsync(
opencv_input, input.data, opencv_inbytes, cudaMemcpyHostToDevice, sam3_stream)
opencv_input, input.data, current_inbytes, cudaMemcpyHostToDevice, sam3_stream)
, " async memcpy of opencv image");

pre_process_sam3<<<gsize, bsize, 0, sam3_stream>>>(
Expand All @@ -172,6 +235,15 @@ bool SAM3_PCS::infer_on_dGPU(const cv::Mat& input, cv::Mat& result, SAM3_VISUALI

bool SAM3_PCS::infer_on_iGPU(const cv::Mat& input, cv::Mat& result, SAM3_VISUALIZATION vis_type)
{
if (input.cols % 2 != 0 || input.rows % 2 != 0)
{
std::stringstream err;
err << "Error: Input image dimensions must be even. Current size: "
<< input.cols << "x" << input.rows
<< ". Please resize the image to even dimensions before inference.";
throw std::runtime_error(err.str());
}

gsize.x = (in_width + bsize.x - 1) / (THREAD_COARSENING_FACTOR*bsize.x);
gsize.y = (in_height + bsize.y - 1) / (THREAD_COARSENING_FACTOR*bsize.y);

Expand Down