diff --git a/python/OpenVINO_EP/tiny_yolo_v2_object_detection/README.md b/python/OpenVINO_EP/tiny_yolo_v2_object_detection/README.md
index 276b21fa0..30e40b71f 100644
--- a/python/OpenVINO_EP/tiny_yolo_v2_object_detection/README.md
+++ b/python/OpenVINO_EP/tiny_yolo_v2_object_detection/README.md
@@ -9,13 +9,16 @@ The source code for this sample is available [here](https://github.com/microsoft
 # How to build

 ## Prerequisites
-1. [The Intel® Distribution of OpenVINO toolkit](https://docs.openvinotoolkit.org/latest/index.html)
+1. [The Intel® Distribution of OpenVINO toolkit](https://docs.openvino.ai/latest/openvino_docs_install_guides_install_runtime.html)
+   Please select "Install OpenVINO Runtime using an installer".
+2. Please also check the documentation for the [installer](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_linux.html#doxid-openvino-docs-install-guides-installing-openvino-linux)
 2. Download the latest tinyYOLOv2 model from the ONNX Model Zoo.
    This model was adapted from [ONNX Model Zoo](https://github.com/onnx/models).Download the latest version of the [tinyYOLOv2](https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/tiny-yolov2) model from here.

 ## Install ONNX Runtime for OpenVINO Execution Provider
+Please install the onnxruntime-openvino Python package from [here](https://github.com/intel/onnxruntime/releases/)

-## Build steps
+## Optional Build steps for ONNX Runtime
 [build instructions](https://onnxruntime.ai/docs/build/eps.html#openvino)

 ## Reference Documentation
@@ -26,18 +29,21 @@ The source code for this sample is available [here](https://github.com/microsoft
 * numpy version 1.19.5+
 * opencv 4.5.1+
 * python 3+
-* use any sample video with objects as test input to this sample
-* Download the tinyYOLOv2 model from the ONNX Model Zoo
+* Use any sample video with objects as test input to this sample: [Download sample videos](https://github.com/intel-iot-devkit/sample-videos)
+* Download the tinyYOLOv2 model from the [ONNX Model Zoo](https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/tiny-yolov2)

 Note: For all the python package dependencies requirements, check 'requirements.txt' file in the sample directory.
 You may also install these dependencies with:
 ```bash
 pip3 install -r requirements.txt
 ```
-
+### How to run the sample
+```bash
+python3 tiny_yolov2_obj_detection_sample.py --h
+```
 ## Running the ONNXRuntime OpenVINO Execution Provider sample
 ```bash
-python3 tiny_yolov2_obj_detection_sample.py
+python3 tiny_yolov2_obj_detection_sample.py --video face-demographics-walking-and-pause.mp4 --model tinyyolov2.onnx --device CPU_FP32
 ```

 ## To stop the sample from running
diff --git a/python/OpenVINO_EP/tiny_yolo_v2_object_detection/tiny_yolov2_obj_detection_sample.py b/python/OpenVINO_EP/tiny_yolo_v2_object_detection/tiny_yolov2_obj_detection_sample.py
index e4dbbe291..ef978462d 100644
--- a/python/OpenVINO_EP/tiny_yolo_v2_object_detection/tiny_yolov2_obj_detection_sample.py
+++ b/python/OpenVINO_EP/tiny_yolo_v2_object_detection/tiny_yolov2_obj_detection_sample.py
@@ -1,5 +1,5 @@
 '''
-Copyright (C) 2021, Intel Corporation
+Copyright (C) 2021-2022, Intel Corporation
 SPDX-License-Identifier: Apache-2.0
 '''

@@ -8,15 +8,36 @@
 import cv2
 import time
 import os
+import argparse
+
+# color look up table for different classes for object detection sample
+clut = [(0,0,0),(255,0,0),(255,0,255),(0,0,255),(0,255,0),(0,255,128),
+        (128,255,0),(128,128,0),(0,128,255),(128,0,128),
+        (255,0,128),(128,0,255),(255,128,128),(128,255,128),(255,255,0),
+        (255,128,128),(128,128,255),(255,128,128),(128,255,128),(128,255,128)]
+
+# 20 labels that the tiny-yolov2 model can do the object_detection on
+label = ["aeroplane","bicycle","bird","boat","bottle",
+         "bus","car","cat","chair","cow","diningtable",
+         "dog","horse","motorbike","person","pottedplant",
+         "sheep","sofa","train","tvmonitor"]
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description='Object Detection using YOLOv2 in OPENCV using OpenVINO Execution Provider for ONNXRuntime')
+    parser.add_argument('--device', default='CPU_FP32', help="Device to perform inference on 'cpu (MLAS)' or on devices supported by OpenVINO-EP [CPU_FP32, GPU_FP32, GPU_FP16, MYRIAD_FP16, VAD-M_FP16].")
+    parser.add_argument('--video', help='Path to video file.')
+    parser.add_argument('--model', help='Path to model.')
+    args = parser.parse_args()
+    return args

 def sigmoid(x, derivative=False):
     return x*(1-x) if derivative else 1/(1+np.exp(-x))

 def softmax(x):
-    scoreMatExp = np.exp(np.asarray(x))
-    return scoreMatExp / scoreMatExp.sum(0)
+    score_mat_exp = np.exp(np.asarray(x))
+    return score_mat_exp / score_mat_exp.sum(0)

-def checkModelExtension(fp):
+def check_model_extension(fp):
     # Split the extension from the path and normalise it to lowercase.
     ext = os.path.splitext(fp)[-1].lower()

@@ -27,7 +48,7 @@ def checkModelExtension(fp):
     if not os.path.exists(fp):
         raise Exception("[ ERROR ] Path of the onnx model file is Invalid")

-def checkVideoFileExtension(fp):
+def check_video_file_extension(fp):
     # Split the extension from the path and normalise it to lowercase.
     ext = os.path.splitext(fp)[-1].lower()
     # Now we can simply use != to check for inequality, no need for wildcards.
@@ -40,154 +61,157 @@ def checkVideoFileExtension(fp):
     if not os.path.exists(fp):
         raise Exception("[ ERROR ] Path of the video file is Invalid")

-# color look up table for different classes for object detection sample
-clut = [(0,0,0),(255,0,0),(255,0,255),(0,0,255),(0,255,0),(0,255,128),
-        (128,255,0),(128,128,0),(0,128,255),(128,0,128),
-        (255,0,128),(128,0,255),(255,128,128),(128,255,128),(255,255,0),
-        (255,128,128),(128,128,255),(255,128,128),(128,255,128),(128,255,128)]
-
-# 20 labels that the tiny-yolov2 model can do the object_detection on
-label = ["aeroplane","bicycle","bird","boat","bottle",
-         "bus","car","cat","chair","cow","diningtable",
-         "dog","horse","motorbike","person","pottedplant",
-         "sheep","sofa","train","tvmonitor"]
-
-model_file_path = "tiny_yolo_v2_zoo_model.onnx"
-# TODO: You need to modify the path to the input onnx model based on where it is located on your device after downloading it from ONNX Model zoo.
-
-# Validate model file path
-checkModelExtension(model_file_path)
-
-device = 'CPU_FP32'
-# Set OpenVINO as the Execution provider to infer this model and load the model
-sess = rt.InferenceSession(model_file_path, providers=['OpenVINOExecutionProvider'], provider_options=[{'device_type' : device}])
-
-# Get the input name of the model
-input_name = sess.get_inputs()[0].name
-
-'''
-other 'device_type' options are: (Any hardware target can be assigned if you have the access to it)
-
-'CPU_FP32', 'GPU_FP32', 'GPU_FP16', 'MYRIAD_FP16', 'VAD-M_FP16', 'VAD-F_FP32',
-'HETERO:MYRIAD,CPU', 'MULTI:MYRIAD,GPU,CPU'
+def image_preprocess(frame):
+    in_frame = cv2.resize(frame, (416, 416))
+    preprocessed_image = np.asarray(in_frame)
+    preprocessed_image = preprocessed_image.astype(np.float32)
+    preprocessed_image = preprocessed_image.transpose(2,0,1)
+    #Reshaping the input array to align with the input shape of the model
+    preprocessed_image = preprocessed_image.reshape(1,3,416,416)
+    return preprocessed_image
+
+def postprocess_output(out, frame, x_scale, y_scale, i):
+    out = out[0][0]
+    num_classes = 20
+    anchors = [1.08, 1.19, 3.42, 4.41, 6.63, 11.38, 9.42, 5.11, 16.62, 10.52]
+    existing_labels = {l: [] for l in label}
+
+    #Inside this loop we compute the bounding box b for grid cell (cy, cx)
+    for cy in range(0,13):
+        for cx in range(0,13):
+            for b in range(0,5):
+                # First we read the tx, ty, width(tw), and height(th) for the bounding box from the out array, as well as the confidence score
+                channel = b*(num_classes+5)
+                tx = out[channel  ][cy][cx]
+                ty = out[channel+1][cy][cx]
+                tw = out[channel+2][cy][cx]
+                th = out[channel+3][cy][cx]
+                tc = out[channel+4][cy][cx]
+
+                x = (float(cx) + sigmoid(tx))*32
+                y = (float(cy) + sigmoid(ty))*32
+                w = np.exp(tw) * 32 * anchors[2*b]
+                h = np.exp(th) * 32 * anchors[2*b+1]
+
+                #calculating the confidence score
+                confidence = sigmoid(tc) # The confidence value for the bounding box is given by tc
+                classes = np.zeros(num_classes)
+                for c in range(0,num_classes):
+                    classes[c] = out[channel + 5 +c][cy][cx]
+                # we take the softmax to turn the array into a probability distribution. And then we pick the class with the largest score as the winner.
+                classes = softmax(classes)
+                detected_class = classes.argmax()
+                # Now we can compute the final score for this bounding box and we only want to keep the ones whose combined score is over a certain threshold
+                if 0.60 < classes[detected_class]*confidence:
+                    color = clut[detected_class]
+                    x = (x - w/2)*x_scale
+                    y = (y - h/2)*y_scale
+                    w *= x_scale
+                    h *= y_scale
+
+                    labelX = int((x+x+w)/2)
+                    labelY = int((y+y+h)/2)
+                    addLabel = True
+                    lab_threshold = 100
+                    for point in existing_labels[label[detected_class]]:
+                        if labelX < point[0] + lab_threshold and labelX > point[0] - lab_threshold and \
+                           labelY < point[1] + lab_threshold and labelY > point[1] - lab_threshold:
+                            addLabel = False
+                    #Adding class labels to the output of the frame and also drawing a rectangular bounding box around the object detected.
+                    if addLabel:
+                        cv2.rectangle(frame, (int(x),int(y)),(int(x+w),int(y+h)),color,2)
+                        cv2.rectangle(frame, (int(x),int(y-13)),(int(x)+9*len(label[detected_class]),int(y)),color,-1)
+                        cv2.putText(frame,label[detected_class],(int(x)+2,int(y)-3),cv2.FONT_HERSHEY_COMPLEX,0.4,(255,255,255),1)
+                        existing_labels[label[detected_class]].append((labelX,labelY))
+                        print('{} detected in frame {}'.format(label[detected_class],i))
+
-'''
+def show_bbox(device, frame, inference_time):
+    cv2.putText(frame,device,(10,20),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
+    cv2.putText(frame,'FPS: {}'.format(1.0/inference_time),(10,40),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
+    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+    cv2.imshow('frame',frame)

-#Path to video file has to be provided
-video_file_path = "sample_demo_video.mp4"
-# TODO: You need to specify the path to your own sample video based on where it is located on your device.
-
-#validate video file input path
-checkVideoFileExtension(video_file_path)
-
-#Path to video file has to be provided
-cap = cv2.VideoCapture(video_file_path)
-
-# capturing different metrics of the image from the video
-fps = cap.get(cv2.CAP_PROP_FPS)
-width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-x_scale = float(width)/416.0 #In the document of tino-yolo-v2, input shape of this network is (1,3,416,416).
-y_scale = float(height)/416.0
-
-# writing the inferencing output as a video to the local disk
-fourcc = cv2.VideoWriter_fourcc(*'XVID')
-output_video_name = device + "_output.avi"
-output_video = cv2.VideoWriter(output_video_name,fourcc, float(17.0), (640,360))
-
-# capturing one frame at a time from the video feed and performing the inference
-i = 0
-while cap.isOpened():
-    l_start = time.time()
-    ret, frame = cap.read()
-    if not ret:
-        break
-    initial_w = cap.get(3)
-    initial_h = cap.get(4)
+def main():
+
+    # Process arguments
+    args = parse_arguments()
+
+    # Validate model file path
+    check_model_extension(args.model)
+    so = rt.SessionOptions()
+    so.log_severity_level = 3
+    if (args.device == 'cpu'):
+        print("Device type selected is 'cpu' which is the default CPU Execution Provider (MLAS)")
+        #Specify the path to the ONNX model on your machine and register the CPU EP
+        sess = rt.InferenceSession(args.model, so, providers=['CPUExecutionProvider'])
+    elif (args.device == 'CPU_FP32' or args.device == 'GPU_FP32' or args.device == 'GPU_FP16' or args.device == 'MYRIAD_FP16' or args.device == 'VAD-M_FP16'):
+        #Specify the path to the ONNX model on your machine and register the OpenVINO EP
+        sess = rt.InferenceSession(args.model, so, providers=['OpenVINOExecutionProvider'], provider_options=[{'device_type' : args.device}])
+        print("Device type selected is: " + args.device + " using the OpenVINO Execution Provider")
+        '''
+        other 'device_type' options are: (Any hardware target can be assigned if you have the access to it)
+        'CPU_FP32', 'GPU_FP32', 'GPU_FP16', 'MYRIAD_FP16', 'VAD-M_FP16'
+        '''
+    else:
+        print("Device type selected is not [cpu, CPU_FP32, GPU_FP32, GPU_FP16, MYRIAD_FP16, VAD-M_FP16]")
+
+    # Get the input name of the model
+    input_name = sess.get_inputs()[0].name
+
+    #validate video file input path
+    check_video_file_extension(args.video)
+
+    #Path to video file has to be provided
+    cap = cv2.VideoCapture(args.video)
+
+    # capturing different metrics of the image from the video
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    x_scale = float(width)/416.0 #In the document of tiny-yolo-v2, input shape of this network is (1,3,416,416).
+    y_scale = float(height)/416.0
+
+    # writing the inferencing output as a video to the local disk
+    fourcc = cv2.VideoWriter_fourcc(*'XVID')
+    output_video_name = args.device + "_output.avi"
+    output_video = cv2.VideoWriter(output_video_name,fourcc, float(17.0), (640,360))
+
+    # capturing one frame at a time from the video feed and performing the inference
+    i = 0
+    while cv2.waitKey(1) < 0:
+        l_start = time.time()
+        ret, frame = cap.read()
+        if not ret:
+            break
+        initial_w = cap.get(3)
+        initial_h = cap.get(4)

-    # preprocessing the input frame and reshaping it.
-    #In the document of tino-yolo-v2, input shape of this network is (1,3,416,416). so we resize the model frame w.r.t that size.
-    in_frame = cv2.resize(frame, (416, 416))
-    X = np.asarray(in_frame)
-    X = X.astype(np.float32)
-    X = X.transpose(2,0,1)
-    # Reshaping the input array to align with the input shape of the model
-    X = X.reshape(1,3,416,416)
+        # preprocessing the input frame and reshaping it.
+        #In the document of tiny-yolo-v2, input shape of this network is (1,3,416,416). so we resize the model frame w.r.t that size.
+        preprocessed_image = image_preprocess(frame)
+
+        start = time.time()
+        #Running the session by passing in the input data of the model
+        out = sess.run(None, {input_name: preprocessed_image})
+        end = time.time()
+        inference_time = end - start
+
+        #Get the output
+        postprocess_output(out, frame, x_scale, y_scale, i)
+
+        #Show the Output
+        output_video.write(frame)
+        show_bbox(args.device, frame, inference_time)

-    start = time.time()
-    #Running the session by passing in the input data of the model
-    out = sess.run(None, {input_name: X})
-    end = time.time()
-    inference_time = end - start
-    out = out[0][0]
-
-    numClasses = 20
-    anchors = [1.08, 1.19, 3.42, 4.41, 6.63, 11.38, 9.42, 5.11, 16.62, 10.52]
-
-    existingLabels = {l: [] for l in label}
-
-    #Inside this loop we compute the bounding box b for grid cell (cy, cx)
-    for cy in range(0,13):
-        for cx in range(0,13):
-            for b in range(0,5):
-                # First we read the tx, ty, width(tw), and height(th) for the bounding box from the out array, as well as the confidence score
-                channel = b*(numClasses+5)
-                tx = out[channel  ][cy][cx]
-                ty = out[channel+1][cy][cx]
-                tw = out[channel+2][cy][cx]
-                th = out[channel+3][cy][cx]
-                tc = out[channel+4][cy][cx]
-
-                x = (float(cx) + sigmoid(tx))*32
-                y = (float(cy) + sigmoid(ty))*32
-
-                w = np.exp(tw) * 32 * anchors[2*b ]
-                h = np.exp(th) * 32 * anchors[2*b+1]
-
-                #calculating the confidence score
-                confidence = sigmoid(tc) # The confidence value for the bounding box is given by tc
-
-                classes = np.zeros(numClasses)
-                for c in range(0,numClasses):
-                    classes[c] = out[channel + 5 +c][cy][cx]
-                # we take the softmax to turn the array into a probability distribution. And then we pick the class with the largest score as the winner.
-                classes = softmax(classes)
-                detectedClass = classes.argmax()
-
-                # Now we can compute the final score for this bounding box and we only want to keep the ones whose combined score is over a certain threshold
-                if 0.45< classes[detectedClass]*confidence:
-                    color =clut[detectedClass]
-                    x = (x - w/2)*x_scale
-                    y = (y - h/2)*y_scale
-                    w *= x_scale
-                    h *= y_scale
-
-                    labelX = int((x+x+w)/2)
-                    labelY = int((y+y+h)/2)
-                    addLabel = True
-                    labThreshold = 40
-                    for point in existingLabels[label[detectedClass]]:
-                        if labelX < point[0] + labThreshold and labelX > point[0] - labThreshold and \
-                           labelY < point[1] + labThreshold and labelY > point[1] - labThreshold:
-                            addLabel = False
-                    #Adding class labels to the output of the frame and also drawing a rectangular bounding box around the object detected.
-                    if addLabel:
-                        cv2.rectangle(frame, (int(x),int(y)),(int(x+w),int(y+h)),color,2)
-                        cv2.rectangle(frame, (int(x),int(y-13)),(int(x)+9*len(label[detectedClass]),int(y)),color,-1)
-                        cv2.putText(frame,label[detectedClass],(int(x)+2,int(y)-3),cv2.FONT_HERSHEY_COMPLEX,0.4,(255,255,255),1)
-                        existingLabels[label[detectedClass]].append((labelX,labelY))
-                        print('{} detected in frame {}'.format(label[detectedClass],i))
-    output_video.write(frame)
-    cv2.putText(frame,device,(10,20),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
-    cv2.putText(frame,'FPS: {}'.format(1.0/inference_time),(10,40),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
-    cv2.imshow('frame',frame)
-
-    #Press 'q' to quit the process
-    if cv2.waitKey(1) & 0xFF == ord('q'):
-        break
-    print('Processed Frame {}'.format(i))
-    i += 1
-    l_end = time.time()
-    print('Loop Time = {}'.format(l_end - l_start))
-output_video.release()
-cv2.destroyAllWindows()
\ No newline at end of file
+        #Press any key to quit the process
+        print('Processed Frame {}'.format(i))
+        i += 1
+        l_end = time.time()
+        print('Loop Time = {}'.format(l_end - l_start))
+
+    output_video.release()
+    cv2.destroyAllWindows()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/python/OpenVINO_EP/yolov4_object_detection/README.md b/python/OpenVINO_EP/yolov4_object_detection/README.md
index ffe9c1ca8..988e26d6e 100644
--- a/python/OpenVINO_EP/yolov4_object_detection/README.md
+++ b/python/OpenVINO_EP/yolov4_object_detection/README.md
@@ -16,13 +16,15 @@ The source code for this sample is available [here](https://github.com/microsoft
 # How to build

 ## Prerequisites
-1. [The Intel® Distribution of OpenVINO toolkit](https://docs.openvinotoolkit.org/latest/index.html)
-2. Download the latest tinyYOLOv2 model from the ONNX Model Zoo.
-   This model was adapted from [ONNX Model Zoo](https://github.com/onnx/models).Download the latest version of the [YOLOv4](https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/yolov4) model from here.
+1. [The Intel® Distribution of OpenVINO toolkit](https://docs.openvino.ai/latest/openvino_docs_install_guides_install_runtime.html)
+   Please select "Install OpenVINO Runtime using an installer".
+2. Please also check the documentation for the [installer](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_linux.html#doxid-openvino-docs-install-guides-installing-openvino-linux)
+3. Download the latest version of the [YOLOv4](https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/yolov4) model from here.

 ## Install ONNX Runtime for OpenVINO Execution Provider
+Please install the onnxruntime-openvino Python package from [here](https://github.com/intel/onnxruntime/releases/)

-## Build steps
+## Optional Build steps for ONNX Runtime
 [build instructions](https://onnxruntime.ai/docs/build/eps.html#openvino)

 ## Reference Documentation
@@ -33,9 +35,8 @@ The source code for this sample is available [here](https://github.com/microsoft
 * numpy version 1.19.5+
 * opencv 4.5.1+
 * python 3+
-* use any sample video with objects as test input to this sample
-* Download the tinyYOLOv2 model from the ONNX Model Zoo
-[Download Sample videos](https://github.com/intel-iot-devkit/sample-videos)
+* Use any sample video with objects as test input to this sample: [Download sample videos](https://github.com/intel-iot-devkit/sample-videos)
+* Download the YOLOv4 model from the [ONNX Model Zoo](https://github.com/onnx/models/tree/main/vision/object_detection_segmentation/yolov4)
 Note: For all the python package dependencies requirements, check 'requirements.txt' file in the sample directory.
 You may also install these dependencies with:
 ```bash
@@ -51,28 +52,28 @@ python3 yolov4.py --h

 ### Run the sample on OpenVINO EP
 ```bash
-python3 yolov4.py --device CPU_FP32 --video bottle-detection.mp4
+python3 yolov4.py --device CPU_FP32 --video classroom.mp4 --model yolov4.onnx
 ```
 Note: You can pick different device options to run on OpenVINO EP like GPU_FP32, GPU_FP16 and MYRIAD_FP16.

 ### Run the sample on default CPU EP (MLAS)
 ```bash
-python3 yolov4.py --device cpu --video bottle-detection.mp4
+python3 yolov4.py --device cpu --video classroom.mp4 --model yolov4.onnx
 ```

 ### Run the sample with video as Input
 ```bash
-python3 yolov4.py --device CPU_FP32 --video bottle-detection.mp4
+python3 yolov4.py --device CPU_FP32 --video classroom.mp4 --model yolov4.onnx
 ```

 ### Run the sample with Image as Input
 ```bash
-python3 yolov4.py --device CPU_FP32 --image cat.jpg
+python3 yolov4.py --device CPU_FP32 --image cat.jpg --model yolov4.onnx
 ```

 ### Run the sample with Live Input stream Like webcam
 ```bash
-python3 yolov4.py --device CPU_FP32
+python3 yolov4.py --device CPU_FP32 --model yolov4.onnx
 ```

 ## To stop the sample from running
diff --git a/python/OpenVINO_EP/yolov4_object_detection/yolov4.py b/python/OpenVINO_EP/yolov4_object_detection/yolov4.py
index c519e9cf9..526289381 100644
--- a/python/OpenVINO_EP/yolov4_object_detection/yolov4.py
+++ b/python/OpenVINO_EP/yolov4_object_detection/yolov4.py
@@ -1,17 +1,17 @@
 '''
-Copyright (C) 2021, Intel Corporation
+Copyright (C) 2021-2022, Intel Corporation
 SPDX-License-Identifier: Apache-2.0

 Major Portions of this code are copyright of their respective authors and released under the Apache License Version 2.0:
-- onnx, Copyright 2021. For licensing see https://github.com/onnx/models/blob/master/LICENSE
+- onnx, Copyright 2021-2022. For licensing see https://github.com/onnx/models/blob/master/LICENSE
 '''
 import cv2
 import numpy as np
 from onnx import numpy_helper
 import onnx
+import onnxruntime as rt
 import os
 from PIL import Image
-import onnxruntime as rt
 from scipy import special
 import colorsys
 import random
@@ -19,12 +19,6 @@ import sys
 import time

-parser = argparse.ArgumentParser(description='Object Detection using YOLOv4 in OPENCV using OpenVINO Execution Provider for ONNXRuntime')
-parser.add_argument('--device', default='cpu', help="Device to perform inference on 'cpu (MLAS)' or on devices supported by OpenVINO-EP [CPU_FP32, GPU_FP32, GPU_FP16, MYRIAD_FP16, VAD-M_FP16].")
-parser.add_argument('--image', help='Path to image file.')
-parser.add_argument('--video', help='Path to video file.')
-args = parser.parse_args()
-
 def image_preprocess(image, target_size, gt_boxes=None):

     ih, iw = target_size
@@ -47,60 +41,7 @@ def image_preprocess(image, target_size, gt_boxes=None):
         gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] * scale + dh
         return image_padded, gt_boxes

-# Process inputs
-winName = 'Object detection using ONNXRuntime OpenVINO Execution Provider using YoloV4 model'
-cv2.namedWindow(winName, cv2.WINDOW_NORMAL)
-
-outputFile = "yolo_out_py.avi"
-if (args.image):
-    # Open the image file
-    if not os.path.isfile(args.image):
-        print("Input image file ", args.image, " doesn't exist")
-        sys.exit(1)
-    cap = cv2.VideoCapture(args.image)
-    outputFile = args.image[:-4]+'_yolo_out_py.jpg'
-elif (args.video):
-    # Open the video file
-    if not os.path.isfile(args.video):
-        print("Input video file ", args.video, " doesn't exist")
-        sys.exit(1)
-    cap = cv2.VideoCapture(args.video)
-    outputFile = args.video[:-4]+'_yolo_out_py.avi'
-else:
-    # Webcam input
-    cap = cv2.VideoCapture(0)
-
-# Get the video writer initialized to save the output video
-if (not args.image):
-    vid_writer = cv2.VideoWriter(outputFile, cv2.VideoWriter_fourcc('M','J','P','G'), 30, (round(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))))
-
-device = args.device
-
-if(args.device == 'cpu'):
-    print("Device type selected is 'cpu' which is the default CPU Execution Provider (MLAS)")
-    #Specify the path to the ONNX model on your machine and register the CPU EP
-    sess = rt.InferenceSession("yolov4/yolov4.onnx", providers=['CPUExecutionProvider'])
-else:
-    #Specify the path to the ONNX model on your machine and register the OpenVINO EP
-    sess = rt.InferenceSession("yolov4/yolov4.onnx", providers=['OpenVINOExecutionProvider'], provider_options=[{'device_type' : device}])
-    print("Device type selected is: " + device + " using the OpenVINO Execution Provider")
-    '''
-    other 'device_type' options are: (Any hardware target can be assigned if you have the access to it)
-    'CPU_FP32', 'GPU_FP32', 'GPU_FP16', 'MYRIAD_FP16', 'VAD-M_FP16'
-    '''
-
-outputs = sess.get_outputs()
-output_names = list(map(lambda output: output.name, outputs))
-input_name = sess.get_inputs()[0].name
-
-def get_anchors(anchors_path, tiny=False):
-    '''loads the anchors from a file'''
-    with open(anchors_path) as f:
-        anchors = f.readline()
-        anchors = np.array(anchors.split(','), dtype=np.float32)
-    return anchors.reshape(3, 3, 2)
-
-def postprocess_bbbox(pred_bbox, ANCHORS, STRIDES, XYSCALE=[1,1,1]):
+def postprocess_bbbox(pred_bbox):
     '''define anchor boxes'''
     for i, pred in enumerate(pred_bbox):
         conv_shape = pred.shape
@@ -111,7 +52,7 @@ def postprocess_bbbox(pred_bbox, ANCHORS, STRIDES, XYSCALE=[1,1,1]):
         xy_grid = np.expand_dims(np.stack(xy_grid, axis=-1), axis=2)
         xy_grid = np.tile(np.expand_dims(xy_grid, axis=0), [1, 1, 1, 3, 1])
-        xy_grid = xy_grid.astype(np.float)
+        xy_grid = xy_grid.astype(float)

         pred_xy = ((special.expit(conv_raw_dxdy) * XYSCALE[i]) - 0.5 * (XYSCALE[i] - 1) + xy_grid) * STRIDES[i]
         pred_wh = (np.exp(conv_raw_dwdh) * ANCHORS[i])
@@ -258,54 +199,139 @@ def draw_bbox(image, bboxes, classes=read_class_names("coco.names"), show_label=
     return image

+def get_anchors(anchors_path, tiny=False):
+    '''loads the anchors from a file'''
+    with open(anchors_path) as f:
+        anchors = f.readline()
+        anchors = np.array(anchors.split(','), dtype=np.float32)
+    return anchors.reshape(3, 3, 2)
+
 #Specify the path to anchors file on your machine
-ANCHORS = "./yolov4_anchors.txt"
+ANCHORS = "./yolov4_anchors.txt"
 STRIDES = [8, 16, 32]
 XYSCALE = [1.2, 1.1, 1.05]
-
 ANCHORS = get_anchors(ANCHORS)
 STRIDES = np.array(STRIDES)

-while cv2.waitKey(1) < 0:
-
-    # get frame from the video
-    hasFrame, frame = cap.read()
-
-    # Stop the program if reached end of video
-    if not hasFrame:
-        print("Done processing !!!")
-        print("Output file is stored as ", outputFile)
-        cv2.waitKey(3000)
-        # Release device
-        cap.release()
-        break
-
-    input_size = 416
-
-    original_image = frame
-    original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
-    original_image_size = original_image.shape[:2]
-
-    image_data = image_preprocess(np.copy(original_image), [input_size, input_size])
-    image_data = image_data[np.newaxis, ...].astype(np.float32)
+def parse_arguments():
+    parser = argparse.ArgumentParser(description='Object Detection using YOLOv4 in OPENCV using OpenVINO Execution Provider for ONNXRuntime')
+    parser.add_argument('--device', default='CPU_FP32', help="Device to perform inference on 'cpu (MLAS)' or on devices supported by OpenVINO-EP [CPU_FP32, GPU_FP32, GPU_FP16, MYRIAD_FP16, VAD-M_FP16].")
+    parser.add_argument('--image', help='Path to image file.')
+    parser.add_argument('--video', help='Path to video file.')
+    parser.add_argument('--model', help='Path to model.')
+    args = parser.parse_args()
+    return args

-    start = time.time()
-    detections = sess.run(output_names, {input_name: image_data})
-    end = time.time()
-    inference_time = end - start
+def check_model_extension(fp):
+    # Split the extension from the path and normalise it to lowercase.
+    ext = os.path.splitext(fp)[-1].lower()

-    pred_bbox = postprocess_bbbox(detections, ANCHORS, STRIDES, XYSCALE)
-    bboxes = postprocess_boxes(pred_bbox, original_image_size, input_size, 0.25)
-    bboxes = nms(bboxes, 0.213, method='nms')
-    image = draw_bbox(original_image, bboxes)
+    # Now we can simply use != to check for inequality, no need for wildcards.
+    if(ext != ".onnx"):
+        raise Exception(fp, "is an unknown file format. Use the model ending with .onnx format")
+
+    if not os.path.exists(fp):
+        raise Exception("[ ERROR ] Path of the onnx model file is Invalid")

-    cv2.putText(image,device,(10,20),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
-    cv2.putText(image,'FPS: {}'.format(1.0/inference_time),(10,40),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
+def main():

-    # Write the frame with the detection boxes
+    # Process arguments
+    args = parse_arguments()
+
+    # Validate model file path
+    check_model_extension(args.model)
+
+    # Process inputs
+    win_name = 'Object detection using ONNXRuntime OpenVINO Execution Provider using YoloV4 model'
+    cv2.namedWindow(win_name, cv2.WINDOW_NORMAL)
+
+    output_file = "yolo_out_py.avi"
     if (args.image):
-        cv2.imwrite(outputFile, frame.astype(np.uint8))
+        # Open the image file
+        if not os.path.isfile(args.image):
+            print("Input image file ", args.image, " doesn't exist")
+            sys.exit(1)
+        cap = cv2.VideoCapture(args.image)
+        output_file = args.image[:-4]+'_yolo_out_py.jpg'
+    elif (args.video):
+        # Open the video file
+        if not os.path.isfile(args.video):
+            print("Input video file ", args.video, " doesn't exist")
+            sys.exit(1)
+        cap = cv2.VideoCapture(args.video)
+        output_file = args.video[:-4]+'_yolo_out_py.avi'
     else:
-        vid_writer.write(frame.astype(np.uint8))
+        # Webcam input
+        cap = cv2.VideoCapture(0)

-    cv2.imshow(winName, image)
\ No newline at end of file
+    # Get the video writer initialized to save the output video
+    if (not args.image):
+        vid_writer = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc('M','J','P','G'), 30, (round(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))))
+
+    # Check the device information and create a session
+    device = args.device
+    so = rt.SessionOptions()
+    so.log_severity_level = 3
+    if(args.device == 'cpu'):
+        print("Device type selected is 'cpu' which is the default CPU Execution Provider (MLAS)")
+        #Specify the path to the ONNX model on your machine and register the CPU EP
+        sess = rt.InferenceSession(args.model, so, providers=['CPUExecutionProvider'])
+    else:
+        #Specify the path to the ONNX model on your machine and register the OpenVINO EP
+        sess = rt.InferenceSession(args.model, so, providers=['OpenVINOExecutionProvider'], provider_options=[{'device_type' : device}])
+        print("Device type selected is: " + device + " using the OpenVINO Execution Provider")
+        '''
+        other 'device_type' options are: (Any hardware target can be assigned if you have the access to it)
+        'CPU_FP32', 'GPU_FP32', 'GPU_FP16', 'MYRIAD_FP16', 'VAD-M_FP16'
+        '''
+
+    input_name = sess.get_inputs()[0].name
+
+    while cv2.waitKey(1) < 0:
+        # get frame from the video
+        has_frame, frame = cap.read()
+        # Stop the program if reached end of video
+        if not has_frame:
+            print("Done processing !!!")
+            print("Output file is stored as ", output_file)
+            has_frame=False
+            cv2.waitKey(3000)
+            # Release device
+            cap.release()
+            break
+
+        input_size = 416
+        original_image = frame
+        original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
+        original_image_size = original_image.shape[:2]
+
+        image_data = image_preprocess(np.copy(original_image), [input_size, input_size])
+        image_data = image_data[np.newaxis, ...].astype(np.float32)
+
+        outputs = sess.get_outputs()
+        output_names = list(map(lambda output: output.name, outputs))
+
+        start = time.time()
+        detections = sess.run(output_names, {input_name: image_data})
+        end = time.time()
+        inference_time = end - start
+
+        pred_bbox = postprocess_bbbox(detections)
+        bboxes = postprocess_boxes(pred_bbox, original_image_size, input_size, 0.25)
+        bboxes = nms(bboxes, 0.213, method='nms')
+        image = draw_bbox(original_image, bboxes)
+
+        cv2.putText(image,device,(10,20),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
+        cv2.putText(image,'FPS: {}'.format(1.0/inference_time),(10,40),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
+
+        # Write the frame with the detection boxes
+        if (args.image):
+            cv2.imwrite(output_file, frame.astype(np.uint8))
+        else:
+            vid_writer.write(frame.astype(np.uint8))
+
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        cv2.imshow(win_name, image)
+
+if __name__ == "__main__":
+    main()