diff --git a/models/object_tracking_vittrack/CMakeLists.txt b/models/object_tracking_vittrack/CMakeLists.txt new file mode 100644 index 00000000..cf20b70e --- /dev/null +++ b/models/object_tracking_vittrack/CMakeLists.txt @@ -0,0 +1,32 @@ +cmake_minimum_required(VERSION 3.24) +set(project_name "opencv_zoo_object_tracking_vittrack") + +PROJECT (${project_name}) + +set(OPENCV_VERSION "4.9.0") +set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation") +find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH}) +# Find OpenCV, you may need to set OpenCV_DIR variable +# to the absolute path to the directory containing OpenCVConfig.cmake file +# via the command line or GUI + +file(GLOB SourceFile + "demo.cpp") +# If the package has been found, several variables will +# be set, you can find the full list with descriptions +# in the OpenCVConfig.cmake file. +# Print some message showing some of them +message(STATUS "OpenCV library status:") +message(STATUS " config: ${OpenCV_DIR}") +message(STATUS " version: ${OpenCV_VERSION}") +message(STATUS " libraries: ${OpenCV_LIBS}") +message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}") + +# Declare the executable target built from your sources +add_executable(${project_name} ${SourceFile}) + +# Set C++ compilation standard to C++11 +set(CMAKE_CXX_STANDARD 11) + +# Link your application with OpenCV libraries +target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS}) diff --git a/models/object_tracking_vittrack/README.md b/models/object_tracking_vittrack/README.md index 25c54dbf..ad3f0a3e 100644 --- a/models/object_tracking_vittrack/README.md +++ b/models/object_tracking_vittrack/README.md @@ -11,14 +11,34 @@ This model is contributed by [Pengyu Liu](https://github.com/lpylpy0514) in GSoC **NOTE: OpenCV > 4.8.0 is required. 
Build from source with instructions from https://opencv.org/get-started/.** # Demo - +## Python ```bash +# tracking on camera input +python demo.py + # tracking on video python demo.py --input /path/to/video # get help regarding various parameters python demo.py --help ``` +## C++ +Install latest OpenCV and CMake >= 3.24.0 to get started. + +```shell +# A typical and default installation path of OpenCV is /usr/local +cmake -B build -D OPENCV_INSTALLATION_PATH=/path/to/opencv/installation . +cmake --build build + +# tracking on camera input +./build/opencv_zoo_object_tracking_vittrack + +# tracking on video +./build/opencv_zoo_object_tracking_vittrack -i=/path/to/video + +# get help messages +./build/opencv_zoo_object_tracking_vittrack -h +``` # Example outputs diff --git a/models/object_tracking_vittrack/demo.cpp b/models/object_tracking_vittrack/demo.cpp new file mode 100644 index 00000000..c8ccb26b --- /dev/null +++ b/models/object_tracking_vittrack/demo.cpp @@ -0,0 +1,210 @@ +#include <iostream> +#include <opencv2/opencv.hpp> + +using namespace std; +using namespace cv; +using namespace dnn; + +struct TrackingResult +{ + bool isLocated; + Rect bbox; + float score; +}; + +class VitTrack +{ +public: + + VitTrack(const string& model_path, int backend_id = 0, int target_id = 0) + { + params.net = model_path; + params.backend = backend_id; + params.target = target_id; + model = TrackerVit::create(params); + } + + void init(const Mat& image, const Rect& roi) + { + model->init(image, roi); + } + + TrackingResult infer(const Mat& image) + { + TrackingResult result; + result.isLocated = model->update(image, result.bbox); + result.score = model->getTrackingScore(); + return result; + } + +private: + TrackerVit::Params params; + Ptr<TrackerVit> model; +}; + +Mat visualize(const Mat& image, const Rect& bbox, float score, bool isLocated, double fps = -1.0, + const Scalar& box_color = Scalar(0, 255, 0), const Scalar& text_color = Scalar(0, 255, 0), + double fontScale = 1.0, int fontSize = 1) +{ + Mat output = 
image.clone(); + int h = output.rows; + int w = output.cols; + + if (fps >= 0) + { + putText(output, "FPS: " + to_string(fps), Point(0, 30), FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize); + } + + if (isLocated && score >= 0.3) + { + rectangle(output, bbox, box_color, 2); + putText(output, format("%.2f", score), Point(bbox.x, bbox.y + 25), + FONT_HERSHEY_DUPLEX, fontScale, text_color, fontSize); + } + else + { + Size text_size = getTextSize("Target lost!", FONT_HERSHEY_DUPLEX, fontScale, fontSize, nullptr); + int text_x = (w - text_size.width) / 2; + int text_y = (h - text_size.height) / 2; + putText(output, "Target lost!", Point(text_x, text_y), FONT_HERSHEY_DUPLEX, fontScale, Scalar(0, 0, 255), fontSize); + } + + return output; +} + +int main(int argc, char** argv) +{ + CommandLineParser parser(argc, argv, + "{help h | | Print help message. }" + "{input i | |Set path to the input video. Omit for using default camera.}" + "{model_path |object_tracking_vittrack_2023sep.onnx |Set model path}" + "{backend_target bt |0 |Choose backend-target pair: 0 - OpenCV implementation + CPU, 1 - CUDA + GPU (CUDA), 2 - CUDA + GPU (CUDA FP16), 3 - TIM-VX + NPU, 4 - CANN + NPU}" + "{save s |false |Specify to save a file with results.}" + "{vis v |true |Specify to open a new window to show results.}"); + if (parser.has("help")) + { + parser.printMessage(); + return 0; + } + + string input = parser.get<string>("input"); + string model_path = parser.get<string>("model_path"); + int backend_target = parser.get<int>("backend_target"); + bool save = parser.get<bool>("save"); + bool vis = parser.get<bool>("vis"); + + vector<vector<int>> backend_target_pairs = + { + {DNN_BACKEND_OPENCV, DNN_TARGET_CPU}, + {DNN_BACKEND_CUDA, DNN_TARGET_CUDA}, + {DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16}, + {DNN_BACKEND_TIMVX, DNN_TARGET_NPU}, + {DNN_BACKEND_CANN, DNN_TARGET_NPU} + }; + + int backend_id = backend_target_pairs[backend_target][0]; + int target_id = backend_target_pairs[backend_target][1]; + + // Create VitTrack tracker + VitTrack 
tracker(model_path, backend_id, target_id); + + // Open video capture + VideoCapture video; + if (input.empty()) + { + video.open(0); // Default camera + } + else + { + video.open(input); + } + + if (!video.isOpened()) + { + cerr << "Error: Could not open video source" << endl; + return -1; + } + + // Select an object + Mat first_frame; + video >> first_frame; + + if (first_frame.empty()) + { + cerr << "No frames grabbed!" << endl; + return -1; + } + + Mat first_frame_copy = first_frame.clone(); + putText(first_frame_copy, "1. Drag a bounding box to track.", Point(0, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0)); + putText(first_frame_copy, "2. Press ENTER to confirm", Point(0, 50), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0)); + Rect roi = selectROI("VitTrack Demo", first_frame_copy); + + if (roi.area() == 0) + { + cerr << "No ROI is selected! Exiting..." << endl; + return -1; + } + else + { + cout << "Selected ROI: " << roi << endl; + } + + // Create VideoWriter if save option is specified + VideoWriter output_video; + if (save) + { + Size frame_size = first_frame.size(); + output_video.open("output.mp4", VideoWriter::fourcc('m', 'p', '4', 'v'), video.get(CAP_PROP_FPS), frame_size); + if (!output_video.isOpened()) + { + cerr << "Error: Could not create output video stream" << endl; + return -1; + } + } + + // Initialize tracker with ROI + tracker.init(first_frame, roi); + + // Track frame by frame + TickMeter tm; + while (waitKey(1) < 0) + { + video >> first_frame; + if (first_frame.empty()) + { + cout << "End of video" << endl; + break; + } + + // Inference + tm.start(); + TrackingResult result = tracker.infer(first_frame); + tm.stop(); + + // Visualize + Mat frame = first_frame.clone(); + frame = visualize(frame, result.bbox, result.score, result.isLocated, tm.getFPS()); + + if (save) + { + output_video.write(frame); + } + + if (vis) + { + imshow("VitTrack Demo", frame); + } + tm.reset(); + } + + if (save) + { + output_video.release(); + } + + 
video.release(); + destroyAllWindows(); + + return 0; +} diff --git a/models/object_tracking_vittrack/demo.py b/models/object_tracking_vittrack/demo.py index 6230d0db..0111126f 100644 --- a/models/object_tracking_vittrack/demo.py +++ b/models/object_tracking_vittrack/demo.py @@ -35,12 +35,11 @@ {:d}: TIM-VX + NPU, {:d}: CANN + NPU '''.format(*[x for x in range(len(backend_target_pairs))])) -parser.add_argument('--save', '-s', action='store_true', - help='Usage: Specify to save a file with results. Invalid in case of camera input.') -parser.add_argument('--vis', '-v', action='store_true', - help='Usage: Specify to open a new window to show results. Invalid in case of camera input.') +parser.add_argument('--save', '-s', action='store_true', default=False, + help='Usage: Specify to save a file with results.') +parser.add_argument('--vis', '-v', action='store_true', default=True, + help='Usage: Specify to open a new window to show results.') args = parser.parse_args() - def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),text_color=(0, 255, 0), fontScale = 1, fontSize = 1): output = image.copy() h, w, _ = output.shape @@ -80,16 +79,21 @@ def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),tex print('No frames grabbed!') exit() first_frame_copy = first_frame.copy() - cv.putText(first_frame_copy, "1. Drag a bounding box to track.", (0, 15), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0)) - cv.putText(first_frame_copy, "2. Press ENTER to confirm", (0, 35), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0)) - roi = cv.selectROI('vitTrack Demo', first_frame_copy) + cv.putText(first_frame_copy, "1. Drag a bounding box to track.", (0, 25), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0)) + cv.putText(first_frame_copy, "2. Press ENTER to confirm", (0, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0)) + roi = cv.selectROI('VitTrack Demo', first_frame_copy) if np.all(np.array(roi) == 0): - print("No roi is selected! Exiting ...") + print("No ROI is selected! 
Exiting ...") exit() else: print("Selected ROI: {}".format(roi)) + if args.save: + fps = video.get(cv.CAP_PROP_FPS) + frame_size = (first_frame.shape[1], first_frame.shape[0]) + output_video = cv.VideoWriter('output.mp4', cv.VideoWriter_fourcc(*'mp4v'), fps, frame_size) + # Init tracker with ROI model.init(first_frame, roi) @@ -106,5 +110,15 @@ def visualize(image, bbox, score, isLocated, fps=None, box_color=(0, 255, 0),tex tm.stop() # Visualize frame = visualize(frame, bbox, score, isLocated, fps=tm.getFPS()) - cv.imshow('VitTrack Demo', frame) + if args.save: + output_video.write(frame) + + if args.vis: + cv.imshow('VitTrack Demo', frame) tm.reset() + + if args.save: + output_video.release() + + video.release() + cv.destroyAllWindows()