From 6e73aa50ce7f9bba9993724b87d3acb7ed0a4b9a Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 2 Mar 2024 07:47:23 -0500 Subject: [PATCH 1/8] Preliminary attempt at C++ demo. --- .../object_tracking_vittrack/CMakeLists.txt | 32 +++ models/object_tracking_vittrack/demo.cpp | 192 ++++++++++++++++++ 2 files changed, 224 insertions(+) create mode 100644 models/object_tracking_vittrack/CMakeLists.txt create mode 100644 models/object_tracking_vittrack/demo.cpp diff --git a/models/object_tracking_vittrack/CMakeLists.txt b/models/object_tracking_vittrack/CMakeLists.txt new file mode 100644 index 00000000..cf20b70e --- /dev/null +++ b/models/object_tracking_vittrack/CMakeLists.txt @@ -0,0 +1,32 @@ +cmake_minimum_required(VERSION 3.24) +set(project_name "opencv_zoo_object_tracking_vittrack") + +PROJECT (${project_name}) + +set(OPENCV_VERSION "4.9.0") +set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation") +find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH}) +# Find OpenCV, you may need to set OpenCV_DIR variable +# to the absolute path to the directory containing OpenCVConfig.cmake file +# via the command line or GUI + +file(GLOB SourceFile + "demo.cpp") +# If the package has been found, several variables will +# be set, you can find the full list with descriptions +# in the OpenCVConfig.cmake file. +# Print some message showing some of them +message(STATUS "OpenCV library status:") +message(STATUS " config: ${OpenCV_DIR}") +message(STATUS " version: ${OpenCV_VERSION}") +message(STATUS " libraries: ${OpenCV_LIBS}") +message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}") + +# Declare the executable target built from your sources +add_executable(${project_name} ${SourceFile}) + +# Set C++ compilation standard to C++11 +set(CMAKE_CXX_STANDARD 11) + +# Link your application with OpenCV libraries +target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS}) diff --git a/models/object_tracking_vittrack/demo.cpp b/models/object_tracking_vittrack/demo.cpp new file mode 100644 index 00000000..32941f9f --- /dev/null +++ b/models/object_tracking_vittrack/demo.cpp @@ -0,0 +1,192 @@ +#include +#include + +class VitTrack { +public: + + VitTrack(const std::string& model_path, int backend_id = 0, int target_id = 0) + : model_path(model_path), backend_id(backend_id), target_id(target_id) { + params.net = model_path; + params.backend = backend_id; + params.target = target_id; + + model = cv::TrackerVit::create(params); + } + + ~VitTrack() = default; + + const std::string& getName() const { + static std::string name = "VitTrack"; + return name; + } + + void setBackendAndTarget(int backend_id, int target_id) { + this->backend_id = backend_id; + this->target_id = target_id; + + params.backend = backend_id; + params.target = target_id; + + model = cv::TrackerVit::create(params); + if (!model) { + std::cerr << "Error: Failed to create the VIT tracker" << std::endl; + } + } + + void init(const cv::Mat& image, const cv::Rect& roi) { + if (model) { + model->init(image, roi); + } else { + std::cerr << "Error: VIT tracker not initialized" << std::endl; + } + } + + std::tuple infer(const cv::Mat& image) { + bool is_located = false; + cv::Rect bbox; + float score = 0.0; + + if (model) { + is_located = model->update(image, bbox); + score = model->getTrackingScore(); + } else { + std::cerr << "Error: VIT tracker not initialized" << std::endl; + } + + return std::make_tuple(is_located, bbox, score); + } + +private: + std::string model_path; + int backend_id; + int target_id; + cv::TrackerVit::Params params; + cv::Ptr model; +}; + +#include +#include + +cv::Mat visualize(const cv::Mat& image, const cv::Rect& bbox, float score, bool isLocated, double fps = -1.0, + const cv::Scalar& box_color = cv::Scalar(0, 255, 0), const cv::Scalar& text_color = cv::Scalar(0, 255, 0), + double fontScale = 1.0, int fontSize = 1) { + cv::Mat output = image.clone(); + int h = output.rows; + int w = output.cols; + + if (fps >= 0) { + cv::putText(output, "FPS: " + std::to_string(fps), cv::Point(0, 30), cv::FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize); + } + + if (isLocated && score >= 0.3) { + cv::rectangle(output, bbox, box_color, 2); + cv::putText(output, cv::format("%.2f", score), cv::Point(bbox.x, bbox.y + 25), + cv::FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize); + } else { + cv::Size text_size = cv::getTextSize("Target lost!", cv::FONT_HERSHEY_DUPLEX, fontScale, fontSize, nullptr); + int text_x = (w - text_size.width) / 2; + int text_y = (h - text_size.height) / 2; + cv::putText(output, "Target lost!", cv::Point(text_x, text_y), cv::FONT_HERSHEY_DUPLEX, fontScale, cv::Scalar(0, 0, 255), fontSize); + } + + return output; +} + +int main(int argc, char** argv) { + cv::CommandLineParser parser(argc, argv, + "{input i| |Set path to the input video. Omit for using default camera.}" + "{model_path |object_tracking_vittrack_2023sep.onnx|Set model path}" + "{backend_target bt|0|Choose backend-target pair: 0 - OpenCV implementation + CPU, 1 - CUDA + GPU (CUDA), 2 - CUDA + GPU (CUDA FP16), 3 - TIM-VX + NPU, 4 - CANN + NPU}" + "{save s|false|Specify to save a file with results. Invalid in case of camera input.}" + "{vis v|false|Specify to open a new window to show results. Invalid in case of camera input.}"); + + std::string input_path = parser.get("input"); + std::string model_path = parser.get("model_path"); + int backend_target = parser.get("backend_target"); + bool save_results = parser.get("save"); + bool visualize_results = parser.get("vis"); + + // Check OpenCV version + if (CV_VERSION_MAJOR < 4 || (CV_VERSION_MAJOR == 4 && CV_VERSION_MINOR < 9)) { + std::cerr << "Please install the latest opencv version (>=4.9.0)" << std::endl; + return -1; + } + + // Valid combinations of backends and targets + std::vector> backend_target_pairs = { + {cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_TARGET_CPU}, + {cv::dnn::DNN_BACKEND_CUDA, cv::dnn::DNN_TARGET_CUDA}, + {cv::dnn::DNN_BACKEND_CUDA, cv::dnn::DNN_TARGET_CUDA_FP16}, + {cv::dnn::DNN_BACKEND_TIMVX, cv::dnn::DNN_TARGET_NPU}, + {cv::dnn::DNN_BACKEND_CANN, cv::dnn::DNN_TARGET_NPU} + }; + + int backend_id = backend_target_pairs[backend_target][0]; + int target_id = backend_target_pairs[backend_target][1]; + + // Create VitTrack model + VitTrack model(model_path, backend_id, target_id); + + // Open video capture + cv::VideoCapture video; + if (input_path.empty()) { + video.open(0); // Default camera + } else { + video.open(input_path); + } + + if (!video.isOpened()) { + std::cerr << "Error: Could not open video source" << std::endl; + return -1; + } + + // Select an object + cv::Mat first_frame; + video >> first_frame; + + if (first_frame.empty()) { + std::cerr << "No frames grabbed!" << std::endl; + return -1; + } + + cv::Mat first_frame_copy = first_frame.clone(); + cv::putText(first_frame_copy, "1. Drag a bounding box to track.", cv::Point(0, 15), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0)); + cv::putText(first_frame_copy, "2. Press ENTER to confirm", cv::Point(0, 35), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0)); + cv::Rect roi = cv::selectROI("vitTrack Demo", first_frame_copy); + + if (roi.area() == 0) { + std::cerr << "No ROI is selected! Exiting..." << std::endl; + return -1; + } else { + std::cout << "Selected ROI: " << roi << std::endl; + } + + // Initialize tracker with ROI + model.init(first_frame, roi); + + // Track frame by frame + cv::TickMeter tm; + while (cv::waitKey(1) < 0) { + video >> first_frame; + if (first_frame.empty()) { + std::cout << "End of video" << std::endl; + break; + } + + // Inference + tm.start(); + bool isLocated; + cv::Rect bbox; + float score; + std::tie(isLocated, bbox, score) = model.infer(first_frame); + tm.stop(); + + // Visualize + cv::Mat frame = first_frame.clone(); + frame = visualize(frame, bbox, score, isLocated, tm.getFPS()); + cv::imshow("VitTrack Demo", frame); + tm.reset(); + } + + return 0; +} From ad19119c01e5fc7966f123de230f3dcb5846266f Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 2 Mar 2024 07:52:26 -0500 Subject: [PATCH 2/8] Update README documentation --- models/object_tracking_vittrack/README.md | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/models/object_tracking_vittrack/README.md b/models/object_tracking_vittrack/README.md index 25c54dbf..ad3f0a3e 100644 --- a/models/object_tracking_vittrack/README.md +++ b/models/object_tracking_vittrack/README.md @@ -11,14 +11,34 @@ This model is contributed by [Pengyu Liu](https://github.com/lpylpy0514) in GSoC **NOTE: OpenCV > 4.8.0 is required. Build from source with instructions from https://opencv.org/get-started/.** # Demo - +## Python ```bash +# tracking on camera input +python demo.py + # tracking on video python demo.py --input /path/to/video # get help regarding various parameters python demo.py --help ``` +## C++ +Install latest OpenCV and CMake >= 3.24.0 to get started. + +```shell +# A typical and default installation path of OpenCV is /usr/local +cmake -B build -D OPENCV_INSTALLATION_PATH=/path/to/opencv/installation . +cmake --build build + +# tracking on camera input +./build/opencv_zoo_object_tracking_vittrack + +# tracking on video +./build/opencv_zoo_object_tracking_vittrack -i=/path/to/video + +# get help messages +./build/opencv_zoo_object_tracking_vittrack -h +``` # Example outputs From 8a107243fec8ec13aeddbcdc45ce027af04a42b5 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 2 Mar 2024 22:12:43 -0500 Subject: [PATCH 3/8] Fixed text spacing and removed unused arguments. Cleaned up to not use tuple. --- models/object_tracking_vittrack/demo.cpp | 96 ++++++------------------ 1 file changed, 24 insertions(+), 72 deletions(-) diff --git a/models/object_tracking_vittrack/demo.cpp b/models/object_tracking_vittrack/demo.cpp index 32941f9f..77cdc6d3 100644 --- a/models/object_tracking_vittrack/demo.cpp +++ b/models/object_tracking_vittrack/demo.cpp @@ -1,72 +1,38 @@ #include #include +struct TrackingResult { + bool isLocated; + cv::Rect bbox; + float score; +}; + class VitTrack { public: - VitTrack(const std::string& model_path, int backend_id = 0, int target_id = 0) - : model_path(model_path), backend_id(backend_id), target_id(target_id) { + VitTrack(const std::string& model_path, int backend_id = 0, int target_id = 0) { params.net = model_path; params.backend = backend_id; params.target = target_id; - - model = cv::TrackerVit::create(params); - } - - ~VitTrack() = default; - - const std::string& getName() const { - static std::string name = "VitTrack"; - return name; - } - - void setBackendAndTarget(int backend_id, int target_id) { - this->backend_id = backend_id; - this->target_id = target_id; - - params.backend = backend_id; - params.target = target_id; - model = cv::TrackerVit::create(params); - if (!model) { - std::cerr << "Error: Failed to create the VIT tracker" << std::endl; - } } void init(const cv::Mat& image, const cv::Rect& roi) { - if (model) { - model->init(image, roi); - } else { - std::cerr << "Error: VIT tracker not initialized" << std::endl; - } + model->init(image, roi); } - std::tuple infer(const cv::Mat& image) { - bool is_located = false; - cv::Rect bbox; - float score = 0.0; - - if (model) { - is_located = model->update(image, bbox); - score = model->getTrackingScore(); - } else { - std::cerr << "Error: VIT tracker not initialized" << std::endl; - } - - return std::make_tuple(is_located, bbox, score); + TrackingResult infer(const cv::Mat& image) { + TrackingResult result; + result.isLocated = model->update(image, result.bbox); + result.score = model->getTrackingScore(); + return result; } private: - std::string model_path; - int backend_id; - int target_id; cv::TrackerVit::Params params; cv::Ptr model; }; -#include -#include - cv::Mat visualize(const cv::Mat& image, const cv::Rect& bbox, float score, bool isLocated, double fps = -1.0, const cv::Scalar& box_color = cv::Scalar(0, 255, 0), const cv::Scalar& text_color = cv::Scalar(0, 255, 0), double fontScale = 1.0, int fontSize = 1) { @@ -94,25 +60,14 @@ cv::Mat visualize(const cv::Mat& image, const cv::Rect& bbox, float score, bool int main(int argc, char** argv) { cv::CommandLineParser parser(argc, argv, - "{input i| |Set path to the input video. Omit for using default camera.}" - "{model_path |object_tracking_vittrack_2023sep.onnx|Set model path}" - "{backend_target bt|0|Choose backend-target pair: 0 - OpenCV implementation + CPU, 1 - CUDA + GPU (CUDA), 2 - CUDA + GPU (CUDA FP16), 3 - TIM-VX + NPU, 4 - CANN + NPU}" - "{save s|false|Specify to save a file with results. Invalid in case of camera input.}" - "{vis v|false|Specify to open a new window to show results. Invalid in case of camera input.}"); + "{input i | |Set path to the input video. Omit for using default camera.}" + "{model_path |object_tracking_vittrack_2023sep.onnx |Set model path}" + "{backend_target bt |0 |Choose backend-target pair: 0 - OpenCV implementation + CPU, 1 - CUDA + GPU (CUDA), 2 - CUDA + GPU (CUDA FP16), 3 - TIM-VX + NPU, 4 - CANN + NPU}"); std::string input_path = parser.get("input"); std::string model_path = parser.get("model_path"); int backend_target = parser.get("backend_target"); - bool save_results = parser.get("save"); - bool visualize_results = parser.get("vis"); - - // Check OpenCV version - if (CV_VERSION_MAJOR < 4 || (CV_VERSION_MAJOR == 4 && CV_VERSION_MINOR < 9)) { - std::cerr << "Please install the latest opencv version (>=4.9.0)" << std::endl; - return -1; - } - // Valid combinations of backends and targets std::vector> backend_target_pairs = { {cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_TARGET_CPU}, {cv::dnn::DNN_BACKEND_CUDA, cv::dnn::DNN_TARGET_CUDA}, @@ -124,8 +79,8 @@ int main(int argc, char** argv) { int backend_id = backend_target_pairs[backend_target][0]; int target_id = backend_target_pairs[backend_target][1]; - // Create VitTrack model - VitTrack model(model_path, backend_id, target_id); + // Create VitTrack tracker + VitTrack tracker(model_path, backend_id, target_id); // Open video capture cv::VideoCapture video; @@ -150,9 +105,9 @@ int main(int argc, char** argv) { } cv::Mat first_frame_copy = first_frame.clone(); - cv::putText(first_frame_copy, "1. Drag a bounding box to track.", cv::Point(0, 15), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0)); - cv::putText(first_frame_copy, "2. Press ENTER to confirm", cv::Point(0, 35), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0)); - cv::Rect roi = cv::selectROI("vitTrack Demo", first_frame_copy); + cv::putText(first_frame_copy, "1. Drag a bounding box to track.", cv::Point(0, 25), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0)); + cv::putText(first_frame_copy, "2. Press ENTER to confirm", cv::Point(0, 50), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0)); + cv::Rect roi = cv::selectROI("VitTrack Demo", first_frame_copy); if (roi.area() == 0) { std::cerr << "No ROI is selected! Exiting..." << std::endl; @@ -162,7 +117,7 @@ int main(int argc, char** argv) { } // Initialize tracker with ROI - model.init(first_frame, roi); + tracker.init(first_frame, roi); // Track frame by frame cv::TickMeter tm; @@ -175,15 +130,12 @@ int main(int argc, char** argv) { // Inference tm.start(); - bool isLocated; - cv::Rect bbox; - float score; - std::tie(isLocated, bbox, score) = model.infer(first_frame); + TrackingResult result = tracker.infer(first_frame); tm.stop(); // Visualize cv::Mat frame = first_frame.clone(); - frame = visualize(frame, bbox, score, isLocated, tm.getFPS()); + frame = visualize(frame, result.bbox, result.score, result.isLocated, tm.getFPS()); cv::imshow("VitTrack Demo", frame); tm.reset(); } From 7b8c179ffa086c10a0ae0bb98d2187549b3a7749 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 2 Mar 2024 22:38:33 -0500 Subject: [PATCH 4/8] Update offsets to match C++ to prevent overlapping text --- models/object_tracking_vittrack/demo.cpp | 4 +++- models/object_tracking_vittrack/demo.py | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/models/object_tracking_vittrack/demo.cpp b/models/object_tracking_vittrack/demo.cpp index 77cdc6d3..641f9b07 100644 --- a/models/object_tracking_vittrack/demo.cpp +++ b/models/object_tracking_vittrack/demo.cpp @@ -62,7 +62,9 @@ int main(int argc, char** argv) { cv::CommandLineParser parser(argc, argv, "{input i | |Set path to the input video. Omit for using default camera.}" "{model_path |object_tracking_vittrack_2023sep.onnx |Set model path}" - "{backend_target bt |0 |Choose backend-target pair: 0 - OpenCV implementation + CPU, 1 - CUDA + GPU (CUDA), 2 - CUDA + GPU (CUDA FP16), 3 - TIM-VX + NPU, 4 - CANN + NPU}"); + "{backend_target bt |0 |Choose backend-target pair: 0 - OpenCV implementation + CPU, 1 - CUDA + GPU (CUDA), 2 - CUDA + GPU (CUDA FP16), 3 - TIM-VX + NPU, 4 - CANN + NPU}" + "{save s |false |Specify to save a file with results. Invalid in case of camera input.}" + "{vis v |false |Specify to open a new window to show results. Invalid in case of camera input.}"); std::string input_path = parser.get("input"); std::string model_path = parser.get("model_path"); diff --git a/models/object_tracking_vittrack/demo.py b/models/object_tracking_vittrack/demo.py index 6230d0db..ce7bc7b7 100644 --- a/models/object_tracking_vittrack/demo.py +++ b/models/object_tracking_vittrack/demo.py @@ -80,12 +80,12 @@ def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),tex print('No frames grabbed!') exit() first_frame_copy = first_frame.copy() - cv.putText(first_frame_copy, "1. Drag a bounding box to track.", (0, 15), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0)) - cv.putText(first_frame_copy, "2. Press ENTER to confirm", (0, 35), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0)) - roi = cv.selectROI('vitTrack Demo', first_frame_copy) + cv.putText(first_frame_copy, "1. Drag a bounding box to track.", (0, 25), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0)) + cv.putText(first_frame_copy, "2. Press ENTER to confirm", (0, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0)) + roi = cv.selectROI('VitTrack Demo', first_frame_copy) if np.all(np.array(roi) == 0): - print("No roi is selected! Exiting ...") + print("No ROI is selected! Exiting ...") exit() else: print("Selected ROI: {}".format(roi)) From cb6d08ed5db2df6db290cd47c5990f36e92e4416 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 2 Mar 2024 23:41:28 -0500 Subject: [PATCH 5/8] Add help functionality --- models/object_tracking_vittrack/demo.cpp | 6 ++++++ models/object_tracking_vittrack/demo.py | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/models/object_tracking_vittrack/demo.cpp b/models/object_tracking_vittrack/demo.cpp index 641f9b07..751720bd 100644 --- a/models/object_tracking_vittrack/demo.cpp +++ b/models/object_tracking_vittrack/demo.cpp @@ -60,11 +60,17 @@ cv::Mat visualize(const cv::Mat& image, const cv::Rect& bbox, float score, bool int main(int argc, char** argv) { cv::CommandLineParser parser(argc, argv, + "{help h | | Print help message. }" "{input i | |Set path to the input video. Omit for using default camera.}" "{model_path |object_tracking_vittrack_2023sep.onnx |Set model path}" "{backend_target bt |0 |Choose backend-target pair: 0 - OpenCV implementation + CPU, 1 - CUDA + GPU (CUDA), 2 - CUDA + GPU (CUDA FP16), 3 - TIM-VX + NPU, 4 - CANN + NPU}" "{save s |false |Specify to save a file with results. Invalid in case of camera input.}" "{vis v |false |Specify to open a new window to show results. Invalid in case of camera input.}"); + if (parser.has("help")) + { + parser.printMessage(); + return 0; + } std::string input_path = parser.get("input"); std::string model_path = parser.get("model_path"); diff --git a/models/object_tracking_vittrack/demo.py b/models/object_tracking_vittrack/demo.py index ce7bc7b7..5b43b0a9 100644 --- a/models/object_tracking_vittrack/demo.py +++ b/models/object_tracking_vittrack/demo.py @@ -40,7 +40,6 @@ parser.add_argument('--vis', '-v', action='store_true', help='Usage: Specify to open a new window to show results. Invalid in case of camera input.') args = parser.parse_args() - def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),text_color=(0, 255, 0), fontScale = 1, fontSize = 1): output = image.copy() h, w, _ = output.shape From e19f3881b50393c63866297d58130d9a77aa99d6 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Sat, 2 Mar 2024 23:43:23 -0500 Subject: [PATCH 6/8] Add using namespace for standalone C++ demo file for readability. --- models/object_tracking_vittrack/demo.cpp | 83 ++++++++++++------------ 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/models/object_tracking_vittrack/demo.cpp b/models/object_tracking_vittrack/demo.cpp index 751720bd..2b9316bd 100644 --- a/models/object_tracking_vittrack/demo.cpp +++ b/models/object_tracking_vittrack/demo.cpp @@ -1,27 +1,30 @@ #include #include +using namespace std; +using namespace cv; + struct TrackingResult { bool isLocated; - cv::Rect bbox; + Rect bbox; float score; }; class VitTrack { public: - VitTrack(const std::string& model_path, int backend_id = 0, int target_id = 0) { + VitTrack(const string& model_path, int backend_id = 0, int target_id = 0) { params.net = model_path; params.backend = backend_id; params.target = target_id; - model = cv::TrackerVit::create(params); + model = TrackerVit::create(params); } - void init(const cv::Mat& image, const cv::Rect& roi) { + void init(const Mat& image, const Rect& roi) { model->init(image, roi); } - TrackingResult infer(const cv::Mat& image) { + TrackingResult infer(const Mat& image) { TrackingResult result; result.isLocated = model->update(image, result.bbox); result.score = model->getTrackingScore(); @@ -29,37 +32,37 @@ class VitTrack { } private: - cv::TrackerVit::Params params; - cv::Ptr model; + TrackerVit::Params params; + Ptr model; }; -cv::Mat visualize(const cv::Mat& image, const cv::Rect& bbox, float score, bool isLocated, double fps = -1.0, - const cv::Scalar& box_color = cv::Scalar(0, 255, 0), const cv::Scalar& text_color = cv::Scalar(0, 255, 0), +Mat visualize(const Mat& image, const Rect& bbox, float score, bool isLocated, double fps = -1.0, + const Scalar& box_color = Scalar(0, 255, 0), const Scalar& text_color = Scalar(0, 255, 0), double fontScale = 1.0, int fontSize = 1) { - cv::Mat output = image.clone(); + Mat output = image.clone(); int h = output.rows; int w = output.cols; if (fps >= 0) { - cv::putText(output, "FPS: " + std::to_string(fps), cv::Point(0, 30), cv::FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize); + putText(output, "FPS: " + to_string(fps), Point(0, 30), FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize); } if (isLocated && score >= 0.3) { - cv::rectangle(output, bbox, box_color, 2); - cv::putText(output, cv::format("%.2f", score), cv::Point(bbox.x, bbox.y + 25), - cv::FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize); + rectangle(output, bbox, box_color, 2); + putText(output, format("%.2f", score), Point(bbox.x, bbox.y + 25), + FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize); } else { - cv::Size text_size = cv::getTextSize("Target lost!", cv::FONT_HERSHEY_DUPLEX, fontScale, fontSize, nullptr); + Size text_size = getTextSize("Target lost!", FONT_HERSHEY_DUPLEX, fontScale, fontSize, nullptr); int text_x = (w - text_size.width) / 2; int text_y = (h - text_size.height) / 2; - cv::putText(output, "Target lost!", cv::Point(text_x, text_y), cv::FONT_HERSHEY_DUPLEX, fontScale, cv::Scalar(0, 0, 255), fontSize); + putText(output, "Target lost!", Point(text_x, text_y), FONT_HERSHEY_DUPLEX, fontScale, Scalar(0, 0, 255), fontSize); } return output; } int main(int argc, char** argv) { - cv::CommandLineParser parser(argc, argv, + CommandLineParser parser(argc, argv, "{help h | | Print help message. }" "{input i | |Set path to the input video. Omit for using default camera.}" "{model_path |object_tracking_vittrack_2023sep.onnx |Set model path}" @@ -72,16 +75,16 @@ int main(int argc, char** argv) { return 0; } - std::string input_path = parser.get("input"); - std::string model_path = parser.get("model_path"); + string input_path = parser.get("input"); + string model_path = parser.get("model_path"); int backend_target = parser.get("backend_target"); - std::vector> backend_target_pairs = { - {cv::dnn::DNN_BACKEND_OPENCV, cv::dnn::DNN_TARGET_CPU}, - {cv::dnn::DNN_BACKEND_CUDA, cv::dnn::DNN_TARGET_CUDA}, - {cv::dnn::DNN_BACKEND_CUDA, cv::dnn::DNN_TARGET_CUDA_FP16}, - {cv::dnn::DNN_BACKEND_TIMVX, cv::dnn::DNN_TARGET_NPU}, - {cv::dnn::DNN_BACKEND_CANN, cv::dnn::DNN_TARGET_NPU} + vector> backend_target_pairs = { + {dnn::DNN_BACKEND_OPENCV, dnn::DNN_TARGET_CPU}, + {dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA}, + {dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA_FP16}, + {dnn::DNN_BACKEND_TIMVX, dnn::DNN_TARGET_NPU}, + {dnn::DNN_BACKEND_CANN, dnn::DNN_TARGET_NPU} }; int backend_id = backend_target_pairs[backend_target][0]; @@ -91,7 +94,7 @@ int main(int argc, char** argv) { VitTrack tracker(model_path, backend_id, target_id); // Open video capture - cv::VideoCapture video; + VideoCapture video; if (input_path.empty()) { video.open(0); // Default camera } else { @@ -99,40 +102,40 @@ int main(int argc, char** argv) { } if (!video.isOpened()) { - std::cerr << "Error: Could not open video source" << std::endl; + cerr << "Error: Could not open video source" << endl; return -1; } // Select an object - cv::Mat first_frame; + Mat first_frame; video >> first_frame; if (first_frame.empty()) { - std::cerr << "No frames grabbed!" << std::endl; + cerr << "No frames grabbed!" << endl; return -1; } - cv::Mat first_frame_copy = first_frame.clone(); - cv::putText(first_frame_copy, "1. Drag a bounding box to track.", cv::Point(0, 25), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0)); - cv::putText(first_frame_copy, "2. Press ENTER to confirm", cv::Point(0, 50), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0)); - cv::Rect roi = cv::selectROI("VitTrack Demo", first_frame_copy); + Mat first_frame_copy = first_frame.clone(); + putText(first_frame_copy, "1. Drag a bounding box to track.", Point(0, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0)); + putText(first_frame_copy, "2. Press ENTER to confirm", Point(0, 50), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0)); + Rect roi = selectROI("VitTrack Demo", first_frame_copy); if (roi.area() == 0) { - std::cerr << "No ROI is selected! Exiting..." << std::endl; + cerr << "No ROI is selected! Exiting..." << endl; return -1; } else { - std::cout << "Selected ROI: " << roi << std::endl; + cout << "Selected ROI: " << roi << endl; } // Initialize tracker with ROI tracker.init(first_frame, roi); // Track frame by frame - cv::TickMeter tm; - while (cv::waitKey(1) < 0) { + TickMeter tm; + while (waitKey(1) < 0) { video >> first_frame; if (first_frame.empty()) { - std::cout << "End of video" << std::endl; + cout << "End of video" << endl; break; } @@ -142,9 +145,9 @@ int main(int argc, char** argv) { tm.stop(); // Visualize - cv::Mat frame = first_frame.clone(); + Mat frame = first_frame.clone(); frame = visualize(frame, result.bbox, result.score, result.isLocated, tm.getFPS()); - cv::imshow("VitTrack Demo", frame); + imshow("VitTrack Demo", frame); tm.reset(); } From b3d0cd39c5fe585ac187df6e309a25e9361c7a46 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Tue, 5 Mar 2024 11:52:10 -0500 Subject: [PATCH 7/8] Update formatting and add save/visualization functionality --- models/object_tracking_vittrack/demo.cpp | 107 +++++++++++++++++------ models/object_tracking_vittrack/demo.py | 25 ++++-- 2 files changed, 100 insertions(+), 32 deletions(-) diff --git a/models/object_tracking_vittrack/demo.cpp b/models/object_tracking_vittrack/demo.cpp index 2b9316bd..84cd0325 100644 --- a/models/object_tracking_vittrack/demo.cpp +++ b/models/object_tracking_vittrack/demo.cpp @@ -3,8 +3,10 @@ using namespace std; using namespace cv; +using namespace dnn; -struct TrackingResult { +struct TrackingResult +{ bool isLocated; Rect bbox; float score; @@ -13,18 +15,21 @@ struct TrackingResult { class VitTrack { public: - VitTrack(const string& model_path, int backend_id = 0, int target_id = 0) { + VitTrack(const string& model_path, int backend_id = 0, int target_id = 0) + { params.net = model_path; params.backend = backend_id; params.target = target_id; model = TrackerVit::create(params); } - void init(const Mat& image, const Rect& roi) { + void init(const Mat& image, const Rect& roi) + { model->init(image, roi); } - TrackingResult infer(const Mat& image) { + TrackingResult infer(const Mat& image) + { TrackingResult result; result.isLocated = model->update(image, result.bbox); result.score = model->getTrackingScore(); @@ -38,20 +43,25 @@ class VitTrack { Mat visualize(const Mat& image, const Rect& bbox, float score, bool isLocated, double fps = -1.0, const Scalar& box_color = Scalar(0, 255, 0), const Scalar& text_color = Scalar(0, 255, 0), - double fontScale = 1.0, int fontSize = 1) { + double fontScale = 1.0, int fontSize = 1) +{ Mat output = image.clone(); int h = output.rows; int w = output.cols; - if (fps >= 0) { + if (fps >= 0) + { putText(output, "FPS: " + to_string(fps), Point(0, 30), FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize); } - if (isLocated && score >= 0.3) { + if (isLocated && score >= 0.3) + { rectangle(output, bbox, box_color, 2); putText(output, format("%.2f", score), Point(bbox.x, bbox.y + 25), FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize); - } else { + } + else + { Size text_size = getTextSize("Target lost!", FONT_HERSHEY_DUPLEX, fontScale, fontSize, nullptr); int text_x = (w - text_size.width) / 2; int text_y = (h - text_size.height) / 2; @@ -61,30 +71,33 @@ Mat visualize(const Mat& image, const Rect& bbox, float score, bool isLocated, d return output; } -int main(int argc, char** argv) { +int main(int argc, char** argv) +{ CommandLineParser parser(argc, argv, "{help h | | Print help message. }" "{input i | |Set path to the input video. Omit for using default camera.}" "{model_path |object_tracking_vittrack_2023sep.onnx |Set model path}" "{backend_target bt |0 |Choose backend-target pair: 0 - OpenCV implementation + CPU, 1 - CUDA + GPU (CUDA), 2 - CUDA + GPU (CUDA FP16), 3 - TIM-VX + NPU, 4 - CANN + NPU}" - "{save s |false |Specify to save a file with results. Invalid in case of camera input.}" - "{vis v |false |Specify to open a new window to show results. Invalid in case of camera input.}"); + "{save s |false |Specify to save a file with results.}" + "{vis v |true |Specify to open a new window to show results.}"); if (parser.has("help")) { parser.printMessage(); return 0; } - string input_path = parser.get("input"); + string input = parser.get("input"); string model_path = parser.get("model_path"); int backend_target = parser.get("backend_target"); + bool save = parser.get("save"); + bool vis = parser.get("vis"); vector> backend_target_pairs = { - {dnn::DNN_BACKEND_OPENCV, dnn::DNN_TARGET_CPU}, - {dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA}, - {dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA_FP16}, - {dnn::DNN_BACKEND_TIMVX, dnn::DNN_TARGET_NPU}, - {dnn::DNN_BACKEND_CANN, dnn::DNN_TARGET_NPU} + {DNN_BACKEND_OPENCV, DNN_TARGET_CPU}, + {DNN_BACKEND_CUDA, DNN_TARGET_CUDA}, + {DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16}, + {DNN_BACKEND_TIMVX, DNN_TARGET_NPU}, + {DNN_BACKEND_CANN, DNN_TARGET_NPU} }; int backend_id = backend_target_pairs[backend_target][0]; @@ -95,13 +108,17 @@ int main(int argc, char** argv) { // Open video capture VideoCapture video; - if (input_path.empty()) { + if (input.empty()) + { video.open(0); // Default camera - } else { - video.open(input_path); + } + else + { + video.open(input); } - if (!video.isOpened()) { + if (!video.isOpened()) + { cerr << "Error: Could not open video source" << endl; return -1; } @@ -110,7 +127,8 @@ int main(int argc, char** argv) { Mat first_frame; video >> first_frame; - if (first_frame.empty()) { + if (first_frame.empty()) + { cerr << "No frames grabbed!" << endl; return -1; } @@ -120,21 +138,39 @@ int main(int argc, char** argv) { putText(first_frame_copy, "2. Press ENTER to confirm", Point(0, 50), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0)); Rect roi = selectROI("VitTrack Demo", first_frame_copy); - if (roi.area() == 0) { + if (roi.area() == 0) + { cerr << "No ROI is selected! Exiting..." << endl; return -1; - } else { + } + else + { cout << "Selected ROI: " << roi << endl; } + // Create VideoWriter if save option is specified + VideoWriter output_video; + if (save) + { + Size frame_size = first_frame.size(); + output_video.open("output.mp4", VideoWriter::fourcc('m', 'p', '4', 'v'), video.get(CAP_PROP_FPS), frame_size); + if (!output_video.isOpened()) + { + cerr << "Error: Could not create output video stream" << endl; + return -1; + } + } + // Initialize tracker with ROI tracker.init(first_frame, roi); // Track frame by frame TickMeter tm; - while (waitKey(1) < 0) { + while (waitKey(1) < 0) + { video >> first_frame; - if (first_frame.empty()) { + if (first_frame.empty()) + { cout << "End of video" << endl; break; } @@ -147,9 +183,26 @@ int main(int argc, char** argv) { // Visualize Mat frame = first_frame.clone(); frame = visualize(frame, result.bbox, result.score, result.isLocated, tm.getFPS()); - imshow("VitTrack Demo", frame); + + if (save) + { + output_video.write(frame); + } + + if (vis) + { + imshow("VitTrack Demo", frame); + } tm.reset(); } + if (save) + { + output_video.release(); + } + + video.release(); + destroyAllWindows(); + return 0; } diff --git a/models/object_tracking_vittrack/demo.py b/models/object_tracking_vittrack/demo.py index 5b43b0a9..0111126f 100644 --- a/models/object_tracking_vittrack/demo.py +++ b/models/object_tracking_vittrack/demo.py @@ -35,10 +35,10 @@ {:d}: TIM-VX + NPU, {:d}: CANN + NPU '''.format(*[x for x in range(len(backend_target_pairs))])) -parser.add_argument('--save', '-s', action='store_true', - help='Usage: Specify to save a file with results. Invalid in case of camera input.') -parser.add_argument('--vis', '-v', action='store_true', - help='Usage: Specify to open a new window to show results. Invalid in case of camera input.') +parser.add_argument('--save', '-s', action='store_true', default=False, + help='Usage: Specify to save a file with results.') +parser.add_argument('--vis', '-v', action='store_true', default=True, + help='Usage: Specify to open a new window to show results.') args = parser.parse_args() def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),text_color=(0, 255, 0), fontScale = 1, fontSize = 1): output = image.copy() @@ -89,6 +89,11 @@ def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),tex else: print("Selected ROI: {}".format(roi)) + if args.save: + fps = video.get(cv.CAP_PROP_FPS) + frame_size = (first_frame.shape[1], first_frame.shape[0]) + output_video = cv.VideoWriter('output.mp4', cv.VideoWriter_fourcc(*'mp4v'), fps, frame_size) + # Init tracker with ROI model.init(first_frame, roi) @@ -105,5 +110,15 @@ def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),tex tm.stop() # Visualize frame = visualize(frame, bbox, score, isLocated, fps=tm.getFPS()) - cv.imshow('VitTrack Demo', frame) + if args.save: + output_video.write(frame) + + if args.vis: + cv.imshow('VitTrack Demo', frame) tm.reset() + + if args.save: + output_video.release() + + video.release() + cv.destroyAllWindows() From 11beab7bbe9a18afbbc0e075c633d6fd77984be3 Mon Sep 17 00:00:00 2001 From: Ryan Lee Date: Tue, 5 Mar 2024 11:53:22 -0500 Subject: [PATCH 8/8] More formatting changes --- models/object_tracking_vittrack/demo.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/models/object_tracking_vittrack/demo.cpp b/models/object_tracking_vittrack/demo.cpp index 84cd0325..c8ccb26b 100644 --- a/models/object_tracking_vittrack/demo.cpp +++ b/models/object_tracking_vittrack/demo.cpp @@ -12,7 +12,8 @@ struct TrackingResult float score; }; -class VitTrack { +class VitTrack +{ public: VitTrack(const string& model_path, int backend_id = 0, int target_id = 0) @@ -92,7 +93,8 @@ int main(int argc, char** argv) bool save = parser.get("save"); bool vis = parser.get("vis"); - vector> backend_target_pairs = { + vector> backend_target_pairs = + { {DNN_BACKEND_OPENCV, DNN_TARGET_CPU}, {DNN_BACKEND_CUDA, DNN_TARGET_CUDA}, {DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16},