8 changes: 5 additions & 3 deletions python/OpenVINO_EP/tiny_yolo_v2_object_detection/README.md
@@ -9,13 +9,15 @@ The source code for this sample is available [here](https://github.com/microsoft
# How to build

## Prerequisites
1. [The Intel<sup>®</sup> Distribution of OpenVINO toolkit](https://docs.openvinotoolkit.org/latest/index.html)
1. [The Intel<sup>®</sup> Distribution of OpenVINO toolkit](https://docs.openvino.ai/latest/openvino_docs_install_guides_install_runtime.html)
Please select the "Install OpenVINO Runtime using an installer" option.

2. Download the latest tinyYOLOv2 model from the ONNX Model Zoo.
This model was adapted from the [ONNX Model Zoo](https://github.com/onnx/models). Download the latest version of the [tinyYOLOv2](https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/tiny-yolov2) model from that page; a minimal download sketch follows below.
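A minimal download sketch (not part of the original README); the raw file URL and file name below are assumptions based on the ONNX Model Zoo layout, so check the model page for the current link:

```python
# Hypothetical download helper; update MODEL_URL if the zoo layout or opset version changes.
import urllib.request

MODEL_URL = ("https://github.com/onnx/models/raw/master/vision/"
             "object_detection_segmentation/tiny-yolov2/model/tinyyolov2-8.onnx")
urllib.request.urlretrieve(MODEL_URL, "tinyyolov2.onnx")
```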

## Install ONNX Runtime for OpenVINO Execution Provider
Please install the onnxruntime-openvino python package from [here](https://github.com/intel/onnxruntime/releases/tag/v4.0)
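As a quick sanity check (a minimal sketch, not part of the original instructions), you can confirm that the installed package exposes the OpenVINO Execution Provider:

```python
# Verify that onnxruntime-openvino is installed and the OpenVINO EP is available.
import onnxruntime as rt

print(rt.__version__)
print(rt.get_available_providers())  # should include 'OpenVINOExecutionProvider'
```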

## Build steps
## Optional Build steps for ONNX Runtime
Follow the [build instructions](https://onnxruntime.ai/docs/build/eps.html#openvino) if you want to build ONNX Runtime with the OpenVINO Execution Provider from source.

## Reference Documentation
@@ -37,7 +39,7 @@ pip3 install -r requirements.txt

## Running the ONNXRuntime OpenVINO Execution Provider sample
Reviewer: Also we need to add a --h option to this sample.
Author: Resolved.

Reviewer: Can we also make passing device type options configurable with the sample? By default it should pick OpenVINO-EP CPU, but the user should also have the option to run on other device_types in OpenVINO-EP and also on MLAS (the default CPU-EP).
Author: Resolved.

```bash
python3 tiny_yolov2_obj_detection_sample.py
python3 tiny_yolov2_obj_detection_sample.py --video bottle-detection.mp4 --model tinyyolov2.onnx
```
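For reference, here is a minimal sketch (not part of the README) of how the sample maps the `--device` flag to an execution provider; `create_session` is an illustrative helper name, and the actual logic lives inside `main()` in `tiny_yolov2_obj_detection_sample.py`:

```python
import onnxruntime as rt

def create_session(model_path, device):
    # 'cpu' selects the default CPU Execution Provider (MLAS); any other supported value
    # (e.g. CPU_FP32, GPU_FP16) is passed to the OpenVINO Execution Provider as device_type.
    if device == 'cpu':
        return rt.InferenceSession(model_path, providers=['CPUExecutionProvider'])
    return rt.InferenceSession(model_path,
                               providers=['OpenVINOExecutionProvider'],
                               provider_options=[{'device_type': device}])
```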

## To stop the sample from running
@@ -8,15 +8,36 @@
import cv2
import time
import os
import argparse

# color look up table for different classes for object detection sample
clut = [(0,0,0),(255,0,0),(255,0,255),(0,0,255),(0,255,0),(0,255,128),
(128,255,0),(128,128,0),(0,128,255),(128,0,128),
(255,0,128),(128,0,255),(255,128,128),(128,255,128),(255,255,0),
(255,128,128),(128,128,255),(255,128,128),(128,255,128),(128,255,128)]

# 20 labels that the tiny-yolov2 model can do the object_detection on
label = ["aeroplane","bicycle","bird","boat","bottle",
"bus","car","cat","chair","cow","diningtable",
"dog","horse","motorbike","person","pottedplant",
"sheep","sofa","train","tvmonitor"]

def parse_arguments():
    parser = argparse.ArgumentParser(description='Object Detection using YOLOv2 in OPENCV using OpenVINO Execution Provider for ONNXRuntime')
    parser.add_argument('--device', default='CPU_FP32', help="Device to perform inference on 'cpu (MLAS)' or on devices supported by OpenVINO-EP [CPU_FP32, GPU_FP32, GPU_FP16, MYRIAD_FP16, VAD-M_FP16].")
    parser.add_argument('--video', help='Path to video file.')
    parser.add_argument('--model', help='Path to model.')
    args = parser.parse_args()
    return args

def sigmoid(x, derivative=False):
    # With derivative=True, x is assumed to already be sigmoid(x), so s*(1-s) is returned.
    return x*(1-x) if derivative else 1/(1+np.exp(-x))

def softmax(x):
    scoreMatExp = np.exp(np.asarray(x))
    return scoreMatExp / scoreMatExp.sum(0)
    score_mat_exp = np.exp(np.asarray(x))
    return score_mat_exp / score_mat_exp.sum(0)

def checkModelExtension(fp):
def check_model_extension(fp):
    # Split the extension from the path and normalise it to lowercase.
    ext = os.path.splitext(fp)[-1].lower()

@@ -27,7 +48,7 @@ def checkModelExtension(fp):
    if not os.path.exists(fp):
        raise Exception("[ ERROR ] Path of the onnx model file is Invalid")

def checkVideoFileExtension(fp):
def check_video_file_extension(fp):
    # Split the extension from the path and normalise it to lowercase.
    ext = os.path.splitext(fp)[-1].lower()
    # Now we can simply use != to check for inequality, no need for wildcards.
@@ -40,154 +61,155 @@ def checkVideoFileExtension(fp):
    if not os.path.exists(fp):
        raise Exception("[ ERROR ] Path of the video file is Invalid")

# color look up table for different classes for object detection sample
clut = [(0,0,0),(255,0,0),(255,0,255),(0,0,255),(0,255,0),(0,255,128),
(128,255,0),(128,128,0),(0,128,255),(128,0,128),
(255,0,128),(128,0,255),(255,128,128),(128,255,128),(255,255,0),
(255,128,128),(128,128,255),(255,128,128),(128,255,128),(128,255,128)]

# 20 labels that the tiny-yolov2 model can do the object_detection on
label = ["aeroplane","bicycle","bird","boat","bottle",
"bus","car","cat","chair","cow","diningtable",
"dog","horse","motorbike","person","pottedplant",
"sheep","sofa","train","tvmonitor"]

model_file_path = "tiny_yolo_v2_zoo_model.onnx"
# TODO: You need to modify the path to the input onnx model based on where it is located on your device after downloading it from ONNX Model zoo.

# Validate model file path
checkModelExtension(model_file_path)

device = 'CPU_FP32'
# Set OpenVINO as the Execution provider to infer this model and load the model
sess = rt.InferenceSession(model_file_path, providers=['OpenVINOExecutionProvider'], provider_options=[{'device_type' : device}])

# Get the input name of the model
input_name = sess.get_inputs()[0].name

'''
other 'device_type' options are: (Any hardware target can be assigned if you have the access to it)

'CPU_FP32', 'GPU_FP32', 'GPU_FP16', 'MYRIAD_FP16', 'VAD-M_FP16', 'VAD-F_FP32',
'HETERO:MYRIAD,CPU', 'MULTI:MYRIAD,GPU,CPU'

'''

#Path to video file has to be provided
video_file_path = "sample_demo_video.mp4"
# TODO: You need to specify the path to your own sample video based on where it is located on your device.

#validate video file input path
checkVideoFileExtension(video_file_path)

#Path to video file has to be provided
cap = cv2.VideoCapture(video_file_path)

# capturing different metrics of the image from the video
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
x_scale = float(width)/416.0 #In the document of tino-yolo-v2, input shape of this network is (1,3,416,416).
y_scale = float(height)/416.0

# writing the inferencing output as a video to the local disk
fourcc = cv2.VideoWriter_fourcc(*'XVID')
output_video_name = device + "_output.avi"
output_video = cv2.VideoWriter(output_video_name,fourcc, float(17.0), (640,360))

# capturing one frame at a time from the video feed and performing the inference
i = 0
while cap.isOpened():
l_start = time.time()
ret, frame = cap.read()
if not ret:
break
initial_w = cap.get(3)
initial_h = cap.get(4)
def image_preprocess(frame):
    in_frame = cv2.resize(frame, (416, 416))
    preprocessed_image = np.asarray(in_frame)
    preprocessed_image = preprocessed_image.astype(np.float32)
    preprocessed_image = preprocessed_image.transpose(2,0,1)
    #Reshaping the input array to align with the input shape of the model
    preprocessed_image = preprocessed_image.reshape(1,3,416,416)
    return preprocessed_image

def postprocess_output(out, frame, x_scale, y_scale):
    out = out[0][0]
    num_classes = 20
    anchors = [1.08, 1.19, 3.42, 4.41, 6.63, 11.38, 9.42, 5.11, 16.62, 10.52]
    existing_labels = {l: [] for l in label}

    # After the indexing above, 'out' has shape (125, 13, 13): a 13x13 grid with 5 anchor
    # boxes per cell and 25 channels per box (tx, ty, tw, th, confidence + 20 class scores).
    #Inside this loop we compute the bounding box b for grid cell (cy, cx)
    for cy in range(0,13):
        for cx in range(0,13):
            for b in range(0,5):
                # First we read the tx, ty, width(tw), and height(th) for the bounding box from the out array, as well as the confidence score
                channel = b*(num_classes+5)
                tx = out[channel  ][cy][cx]
                ty = out[channel+1][cy][cx]
                tw = out[channel+2][cy][cx]
                th = out[channel+3][cy][cx]
                tc = out[channel+4][cy][cx]

                x = (float(cx) + sigmoid(tx))*32
                y = (float(cy) + sigmoid(ty))*32
                w = np.exp(tw) * 32 * anchors[2*b]
                h = np.exp(th) * 32 * anchors[2*b+1]

                #calculating the confidence score
                confidence = sigmoid(tc) # The confidence value for the bounding box is given by tc
                classes = np.zeros(num_classes)
                for c in range(0,num_classes):
                    classes[c] = out[channel + 5 + c][cy][cx]
                # we take the softmax to turn the array into a probability distribution. And then we pick the class with the largest score as the winner.
                classes = softmax(classes)
                detected_class = classes.argmax()
                # Now we can compute the final score for this bounding box and we only want to keep the ones whose combined score is over a certain threshold
                if 0.45 < classes[detected_class]*confidence:
                    color = clut[detected_class]
                    x = (x - w/2)*x_scale
                    y = (y - h/2)*y_scale
                    w *= x_scale
                    h *= y_scale

                    labelX = int((x+x+w)/2)
                    labelY = int((y+y+h)/2)
                    addLabel = True
                    lab_threshold = 40
                    for point in existing_labels[label[detected_class]]:
                        if labelX < point[0] + lab_threshold and labelX > point[0] - lab_threshold and \
                           labelY < point[1] + lab_threshold and labelY > point[1] - lab_threshold:
                            addLabel = False
                    #Adding class labels to the output of the frame and also drawing a rectangular bounding box around the object detected.
                    if addLabel:
                        cv2.rectangle(frame, (int(x),int(y)),(int(x+w),int(y+h)),color,2)
                        cv2.rectangle(frame, (int(x),int(y-13)),(int(x)+9*len(label[detected_class]),int(y)),color,-1)
                        cv2.putText(frame,label[detected_class],(int(x)+2,int(y)-3),cv2.FONT_HERSHEY_COMPLEX,0.4,(255,255,255),1)
                        existing_labels[label[detected_class]].append((labelX,labelY))
                        # The frame counter lives in main() and is not visible here, so only the class name is reported.
                        print('{} detected'.format(label[detected_class]))

def show_bbox(device, frame, inference_time):
    cv2.putText(frame,device,(10,20),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
    cv2.putText(frame,'FPS: {}'.format(1.0/inference_time),(10,40),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
    cv2.imshow('frame',frame)

def main():

    # Process arguments
    args = parse_arguments()

    # Validate model file path
    check_model_extension(args.model)

    if (args.device == 'cpu'):
        print("Device type selected is 'cpu' which is the default CPU Execution Provider (MLAS)")
        #Specify the path to the ONNX model on your machine and register the CPU EP
        sess = rt.InferenceSession(args.model, providers=['CPUExecutionProvider'])
    elif args.device in ['CPU_FP32', 'GPU_FP32', 'GPU_FP16', 'MYRIAD_FP16', 'VAD-M_FP16']:
        #Specify the path to the ONNX model on your machine and register the OpenVINO EP
        sess = rt.InferenceSession(args.model, providers=['OpenVINOExecutionProvider'], provider_options=[{'device_type' : args.device}])
        print("Device type selected is: " + args.device + " using the OpenVINO Execution Provider")
        '''
        other 'device_type' options are: (Any hardware target can be assigned if you have the access to it)
        'CPU_FP32', 'GPU_FP32', 'GPU_FP16', 'MYRIAD_FP16', 'VAD-M_FP16'
        '''
    else:
        print("Device type selected is not one of [cpu, CPU_FP32, GPU_FP32, GPU_FP16, MYRIAD_FP16, VAD-M_FP16]")
        return  # exit early: no InferenceSession was created for this device string

    # Get the input name of the model
    input_name = sess.get_inputs()[0].name

    #validate video file input path
    check_video_file_extension(args.video)

    #Path to video file has to be provided
    cap = cv2.VideoCapture(args.video)

    # capturing different metrics of the image from the video
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # writing the inferencing output as a video to the local disk
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    output_video_name = args.device + "_output.avi"
    output_video = cv2.VideoWriter(output_video_name,fourcc, float(17.0), (640,360))

    # capturing one frame at a time from the video feed and performing the inference
    i = 0
    while cap.isOpened():
        l_start = time.time()
        ret, frame = cap.read()
        if not ret:
            break
        initial_w = cap.get(3)
        initial_h = cap.get(4)

# preprocessing the input frame and reshaping it.
#In the document of tino-yolo-v2, input shape of this network is (1,3,416,416). so we resize the model frame w.r.t that size.
in_frame = cv2.resize(frame, (416, 416))
X = np.asarray(in_frame)
X = X.astype(np.float32)
X = X.transpose(2,0,1)
# Reshaping the input array to align with the input shape of the model
X = X.reshape(1,3,416,416)
        # preprocessing the input frame and reshaping it.
        # In the documentation of tiny-yolo-v2, the input shape of this network is (1,3,416,416), so we resize the frame to that size.
        preprocessed_image = image_preprocess(frame)

        start = time.time()
        #Running the session by passing in the input data of the model
        out = sess.run(None, {input_name: preprocessed_image})
        end = time.time()
        inference_time = end - start

        #Get the output
        x_scale = float(width)/416.0 #In the documentation of tiny-yolo-v2, the input shape of this network is (1,3,416,416).
        y_scale = float(height)/416.0
        out = postprocess_output(out, frame, x_scale, y_scale)

        #Show the Output
        output_video.write(frame)
        show_bbox(args.device, frame, inference_time)

start = time.time()
#Running the session by passing in the input data of the model
out = sess.run(None, {input_name: X})
end = time.time()
inference_time = end - start
out = out[0][0]

numClasses = 20
anchors = [1.08, 1.19, 3.42, 4.41, 6.63, 11.38, 9.42, 5.11, 16.62, 10.52]

existingLabels = {l: [] for l in label}

#Inside this loop we compute the bounding box b for grid cell (cy, cx)
for cy in range(0,13):
for cx in range(0,13):
for b in range(0,5):
# First we read the tx, ty, width(tw), and height(th) for the bounding box from the out array, as well as the confidence score
channel = b*(numClasses+5)
tx = out[channel ][cy][cx]
ty = out[channel+1][cy][cx]
tw = out[channel+2][cy][cx]
th = out[channel+3][cy][cx]
tc = out[channel+4][cy][cx]

x = (float(cx) + sigmoid(tx))*32
y = (float(cy) + sigmoid(ty))*32

w = np.exp(tw) * 32 * anchors[2*b ]
h = np.exp(th) * 32 * anchors[2*b+1]

#calculating the confidence score
confidence = sigmoid(tc) # The confidence value for the bounding box is given by tc

classes = np.zeros(numClasses)
for c in range(0,numClasses):
classes[c] = out[channel + 5 +c][cy][cx]
# we take the softmax to turn the array into a probability distribution. And then we pick the class with the largest score as the winner.
classes = softmax(classes)
detectedClass = classes.argmax()

# Now we can compute the final score for this bounding box and we only want to keep the ones whose combined score is over a certain threshold
if 0.45< classes[detectedClass]*confidence:
color =clut[detectedClass]
x = (x - w/2)*x_scale
y = (y - h/2)*y_scale
w *= x_scale
h *= y_scale

labelX = int((x+x+w)/2)
labelY = int((y+y+h)/2)
addLabel = True
labThreshold = 40
for point in existingLabels[label[detectedClass]]:
if labelX < point[0] + labThreshold and labelX > point[0] - labThreshold and \
labelY < point[1] + labThreshold and labelY > point[1] - labThreshold:
addLabel = False
#Adding class labels to the output of the frame and also drawing a rectangular bounding box around the object detected.
if addLabel:
cv2.rectangle(frame, (int(x),int(y)),(int(x+w),int(y+h)),color,2)
cv2.rectangle(frame, (int(x),int(y-13)),(int(x)+9*len(label[detectedClass]),int(y)),color,-1)
cv2.putText(frame,label[detectedClass],(int(x)+2,int(y)-3),cv2.FONT_HERSHEY_COMPLEX,0.4,(255,255,255),1)
existingLabels[label[detectedClass]].append((labelX,labelY))
print('{} detected in frame {}'.format(label[detectedClass],i))
output_video.write(frame)
cv2.putText(frame,device,(10,20),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
cv2.putText(frame,'FPS: {}'.format(1.0/inference_time),(10,40),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
cv2.imshow('frame',frame)

#Press 'q' to quit the process
if cv2.waitKey(1) & 0xFF == ord('q'):
break
print('Processed Frame {}'.format(i))
i += 1
l_end = time.time()
print('Loop Time = {}'.format(l_end - l_start))
output_video.release()
cv2.destroyAllWindows()
        #Press 'q' to quit the process
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        print('Processed Frame {}'.format(i))
        i += 1
        l_end = time.time()
        print('Loop Time = {}'.format(l_end - l_start))
    output_video.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()
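# Example invocation (mirrors the README):
#   python3 tiny_yolov2_obj_detection_sample.py --video bottle-detection.mp4 --model tinyyolov2.onnx --device CPU_FP32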