diff --git a/predict.py b/predict.py index 9dceed406..09778d9c5 100644 --- a/predict.py +++ b/predict.py @@ -5,19 +5,51 @@ 3、如果想要获得框的坐标,可以进入detect_image函数,读取top,left,bottom,right这四个值。 4、如果想要截取下目标,可以利用获取到的top,left,bottom,right这四个值在原图上利用矩阵的方式进行截取。 ''' +import cv2 from PIL import Image +import numpy as np from yolo import YOLO +from IPython import embed + yolo = YOLO() -while True: - img = input('Input image filename:') - try: - image = Image.open(img) - except: - print('Open Error! Try again!') - continue - else: - r_image = yolo.detect_image(image) - r_image.show() +image = Image.open('./img/view2.jpg')# 返回PIL.img对象 +uncroped_image = cv2.imread("./img/view2.jpg") + +r_image,boxes = yolo.detect_image(image) + +# 进行裁剪 +box = boxes + +for i in range(boxes.shape[0]): + # top, left, bottom, right = boxes[i] + # 或者用下面这句等价 + top = boxes[i][0] + left = boxes[i][1] + bottom = boxes[i][2] + right = boxes[i][3] + + top = top - 5 + left = left - 5 + bottom = bottom + 5 + right = right + 5 + + # 左上角点的坐标 + top = int(max(0, np.floor(top + 0.5).astype('int32'))) + + left = int(max(0, np.floor(left + 0.5).astype('int32'))) + # 右下角点的坐标 + bottom = int(min(np.shape(image)[0], np.floor(bottom + 0.5).astype('int32'))) + right = int(min(np.shape(image)[1], np.floor(right + 0.5).astype('int32'))) + + # embed() + + # 问题出在这里:不能用这个方法,看两个参数是长和宽,是从图像的原点开始裁剪的,这样肯定是不对的 + croped_region = uncroped_image[top:bottom,left:right]# 先高后宽 + # 将裁剪好的目标保存到本地 + cv2.imwrite("./output/croped_view2_img_"+str(i)+".jpg",croped_region) + +# embed() +r_image.show() diff --git a/yolo.py b/yolo.py index 7d439cbdd..f46d8f2d4 100644 --- a/yolo.py +++ b/yolo.py @@ -16,6 +16,7 @@ from utils.utils import (DecodeBox, bbox_iou, letterbox_image, non_max_suppression, yolo_correct_boxes) +from IPython import embed #--------------------------------------------# # 使用自己训练好的模型预测需要修改2个参数 @@ -28,10 +29,10 @@ class YOLO(object): "model_path" : 'model_data/yolo4_weights.pth', "anchors_path" : 'model_data/yolo_anchors.txt', "classes_path" : 'model_data/coco_classes.txt', - "model_image_size" : (416, 416, 3), - "confidence" : 0.5, + "model_image_size" : (416, 416, 3),#这里的model_image_size是什么,不会跟图像size产生冲突吗,为什么不可以改???? + "confidence" : 0.3, "iou" : 0.3, - "cuda" : True + "cuda" : False } @classmethod @@ -96,7 +97,7 @@ def generate(self): #---------------------------------------------------# # 建立三个特征层解码用的工具 #---------------------------------------------------# - self.yolo_decodes = [] + self.yolo_decodes = []# 创建数组,将三个解码器放到数组中 for i in range(3): self.yolo_decodes.append(DecodeBox(self.anchors[i], len(self.class_names), (self.model_image_size[1], self.model_image_size[0]))) @@ -114,39 +115,52 @@ def generate(self): # 检测图片 #---------------------------------------------------# def detect_image(self, image): + # embed() image_shape = np.array(np.shape(image)[0:2]) + num_class = len(self.class_names)# 有80类 + # embed() #---------------------------------------------------------# - # 给图像增加灰条,实现不失真的resize + # 给图像增加灰条(什么是灰条),实现不失真的resize #---------------------------------------------------------# + # 复制image return new_image crop_img = np.array(letterbox_image(image, (self.model_image_size[1],self.model_image_size[0]))) - photo = np.array(crop_img,dtype = np.float32) / 255.0 - photo = np.transpose(photo, (2, 0, 1)) + photo = np.array(crop_img,dtype = np.float32) / 255.0# 归一化? + photo = np.transpose(photo, (2, 0, 1))# 转置:将Image.open(img)得到的[H,W,C]格式转换permute为pytorch可以处理的[C,H,W]格式 #---------------------------------------------------------# # 添加上batch_size维度 #---------------------------------------------------------# - images = [photo] + images = [photo]# 将photo变为list类型 - with torch.no_grad(): - images = torch.from_numpy(np.asarray(images)) + with torch.no_grad():# disabled gradient calculation,reduce memory consumption for computations + images = torch.from_numpy(np.asarray(images))# Creates a Tensor from a numpy.ndarray,此时images的shape为[1, 3, 416, 416] if self.cuda: images = images.cuda() #---------------------------------------------------------# # 将图像输入网络当中进行预测! #---------------------------------------------------------# + # embed() + # 从这里开始处理 + # 特征提取 + # 输出outputs为tuple,len=3,每个tensor的shape分别为 第一个特征层[1, 255, 13, 13],第二个特征层[1, 255, 26, 26],第三个特征层[1, 255, 52, 52] outputs = self.net(images) + # embed() output_list = [] - for i in range(3): - output_list.append(self.yolo_decodes[i](outputs[i])) + for i in range(3):# 为什么是3 + # 有三个特征层,每个特征层对应自己的decode解码器 + output_list.append(self.yolo_decodes[i](outputs[i]))# 在这里打几个断点看看 #---------------------------------------------------------# # 将预测框进行堆叠,然后进行非极大抑制 #---------------------------------------------------------# - output = torch.cat(output_list, 1) + # torch.cat()对矩阵按行进行拼接得到向量 + output = torch.cat(output_list, 1)# 这里也打几个断点 + # output就是predictions,格式为[batch_size, num_anchors, 85] batch_detections = non_max_suppression(output, len(self.class_names), conf_thres=self.confidence, nms_thres=self.iou) + # embed() #---------------------------------------------------------# # 如果没有检测出物体,返回原图 @@ -159,17 +173,25 @@ def detect_image(self, image): #---------------------------------------------------------# # 对预测框进行得分筛选 #---------------------------------------------------------# + # coordinates = []# bboxes的坐标 + top_index = batch_detections[:,4] * batch_detections[:,5] > self.confidence top_conf = batch_detections[top_index,4]*batch_detections[top_index,5] top_label = np.array(batch_detections[top_index,-1],np.int32) top_bboxes = np.array(batch_detections[top_index,:4]) + + # 得到坐标点 top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1) + # coordinates.append((top_xmin,top_xmax,top_ymin,top_ymax))# 把四个坐标点看做一个整体 + #-----------------------------------------------------------------# # 在图像传入网络预测前会进行letterbox_image给图像周围添加灰条 # 因此生成的top_bboxes是相对于有灰条的图像的 # 我们需要对其进行修改,去除灰条的部分。 #-----------------------------------------------------------------# + + # boxes存放各目标的坐标 boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape) font = ImageFont.truetype(font='model_data/simhei.ttf',size=np.floor(3e-2 * np.shape(image)[1] + 0.5).astype('int32')) @@ -177,6 +199,7 @@ def detect_image(self, image): thickness = max((np.shape(image)[0] + np.shape(image)[1]) // self.model_image_size[0], 1) for i, c in enumerate(top_label): + # embed() predicted_class = self.class_names[c] score = top_conf[i] @@ -186,8 +209,10 @@ def detect_image(self, image): bottom = bottom + 5 right = right + 5 + # 左上角点的坐标 top = max(0, np.floor(top + 0.5).astype('int32')) left = max(0, np.floor(left + 0.5).astype('int32')) + # 右下角点的坐标 bottom = min(np.shape(image)[0], np.floor(bottom + 0.5).astype('int32')) right = min(np.shape(image)[1], np.floor(right + 0.5).astype('int32')) @@ -207,10 +232,10 @@ def detect_image(self, image): draw.rectangle( [left + i, top + i, right - i, bottom - i], outline=self.colors[self.class_names.index(predicted_class)]) - draw.rectangle( + draw.rectangle(# 画框框 [tuple(text_origin), tuple(text_origin + label_size)], fill=self.colors[self.class_names.index(predicted_class)]) draw.text(text_origin, str(label,'UTF-8'), fill=(0, 0, 0), font=font) del draw - return image + return image,boxes# 将boxes返回