|
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
| 3 | + |
| 4 | +import argparse |
| 5 | +import json |
| 6 | +from webbrowser import BackgroundBrowser |
| 7 | +import yaml |
| 8 | +import shutil |
| 9 | +import glob |
| 10 | +import os |
| 11 | +from pathlib import Path |
| 12 | + |
| 13 | +import cv2 |
| 14 | +from tqdm import tqdm |
| 15 | + |
| 16 | + |
def read_txt(txt_path):
    """Read a UTF-8 text file and return its lines without trailing newlines."""
    with open(str(txt_path), 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]
| 21 | + |
| 22 | + |
def mkdir(dir_path):
    """Create *dir_path* (and any missing parents); no-op if it already exists."""
    os.makedirs(str(dir_path), exist_ok=True)
| 25 | + |
| 26 | + |
def verify_exists(file_path):
    """Raise FileNotFoundError if *file_path* (str or Path) does not exist.

    Args:
        file_path: Path to check; coerced to ``pathlib.Path``.

    Raises:
        FileNotFoundError: when the path is missing.
    """
    file_path = Path(file_path)
    if not file_path.exists():
        # Fixed message grammar ("is not exists" -> "does not exist").
        raise FileNotFoundError(f'The {file_path} does not exist!!!')
| 31 | + |
class YOLOV5CFG2COCO(object):
    """Convert a YOLOv5 dataset, described by a data-config YAML, to COCO format.

    The YAML follows YOLOv5's convention: an optional ``path`` dataset root,
    ``train``/``val``/``test`` entries (each a directory, an image-list txt
    file, or a list of those), a class count ``nc`` and optional class
    ``names``. Images are copied (or re-encoded) into
    ``<cfg stem>_COCO_format/{train2017,val2017}`` and the YOLO txt labels are
    rewritten as ``annotations/instances_*.json``.
    """

    def __init__(self, cfg_file):
        ROOT = Path(cfg_file).resolve().parent
        with open(cfg_file, 'r', encoding="UTF-8") as f:
            data_cfg = yaml.safe_load(f)
        path = Path(data_cfg.get('path') or '')  # optional 'path' root, defaults to cfg dir
        if not path.is_absolute():
            path = (ROOT / path).resolve()
        for k in 'train', 'val', 'test':
            if data_cfg.get(k):  # prepend the dataset root to every split entry
                data_cfg[k] = str(path / data_cfg[k]) if isinstance(data_cfg[k], str) \
                    else [str(path / x) for x in data_cfg[k]]
        if 'names' not in data_cfg:
            # Assign placeholder class names when the cfg omits them.
            data_cfg['names'] = [f'class{i}' for i in range(data_cfg['nc'])]
        self.train_path, self.val_path, self.test_path = (data_cfg.get(x) for x in ('train', 'val', 'test'))
        nc = data_cfg['nc']
        self.names = data_cfg['names']
        assert len(self.names) == nc, f'{len(self.names)} names found for nc={nc} dataset in {cfg_file}'  # check

        # Lay out the COCO-style directory tree next to the cfg file.
        self.dst = ROOT / f"{Path(cfg_file).stem}_COCO_format"
        self.coco_train = "train2017"
        self.coco_val = "val2017"
        self.coco_annotation = "annotations"
        self.coco_train_json = self.dst / self.coco_annotation / \
            f'instances_{self.coco_train}.json'
        self.coco_val_json = self.dst / self.coco_annotation / \
            f'instances_{self.coco_val}.json'

        mkdir(self.dst)
        mkdir(self.dst / self.coco_train)
        mkdir(self.dst / self.coco_val)
        mkdir(self.dst / self.coco_annotation)

        # Static parts of the output JSON structure.
        self.type = 'instances'
        self.categories = []
        self._get_category()
        self.annotation_id = 1  # running id, shared across train and val splits

        self.info = {
            'year': 2021,
            'version': '1.0',
            'description': 'For object detection',
            'date_created': '2021',
        }

        self.licenses = [{
            'id': 1,
            'name': 'Apache License v2.0',
            'url': 'https://github.com/RapidAI/YOLO2COCO/LICENSE',
        }]

    def _get_category(self):
        """Populate ``self.categories`` with 1-based COCO category entries."""
        for i, category in enumerate(self.names, 1):
            self.categories.append({
                'supercategory': category,
                'id': i,
                'name': category,
            })

    def generate(self):
        """Convert the train and val splits and print the output directory."""
        self.train_files = self.getfiles(self.train_path)
        self.valid_files = self.getfiles(self.val_path)

        train_dest_dir = Path(self.dst) / self.coco_train
        self.gen_dataset(self.train_files, train_dest_dir,
                         self.coco_train_json, mode='train')

        val_dest_dir = Path(self.dst) / self.coco_val
        self.gen_dataset(self.valid_files, val_dest_dir,
                         self.coco_val_json, mode='val')

        print(f"The output directory is: {str(self.dst)}")

    def getfiles(self, path):
        """Collect sorted image paths from a directory, an image-list txt file,
        or a list of either (mirrors YOLOv5's dataloader file discovery).

        Raises:
            Exception: when an entry is neither an existing file nor directory.
        """
        IMG_FORMATS = 'bmp', 'dng', 'jpeg', 'jpg', 'mpo', 'png', 'tif', 'tiff', 'webp'  # include image suffixes
        f = []
        for p in path if isinstance(path, list) else [path]:
            p = Path(p)  # os-agnostic
            if p.is_dir():  # dir
                f += glob.glob(str(p / '**' / '*.*'), recursive=True)
            elif p.is_file():  # txt file listing one image path per line
                with open(p) as t:
                    t = t.read().strip().splitlines()
                    parent = str(p.parent) + os.sep
                    # Turn './'-relative entries into global paths; keep others as-is.
                    f += [x.replace('./', parent) if x.startswith('./') else x for x in t]
            else:
                raise Exception(f'{p} does not exist')
        im_files = sorted(x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in IMG_FORMATS)
        return im_files

    def gen_dataset(self, img_paths, target_img_path, target_json, mode):
        """Copy one split's images into ``target_img_path`` (renamed to COCO's
        zero-padded numeric ids) and write its annotations to ``target_json``.

        See https://cocodataset.org/#format-data for the JSON schema.

        Raises:
            FileNotFoundError: when an image or its label file is missing.
            ValueError: when a label file exists but yields no valid boxes.
        """
        images = []
        annotations = []
        # Derive the label path by swapping /images/ for /labels/ (YOLOv5 layout).
        sa, sb = os.sep + 'images' + os.sep, os.sep + 'labels' + os.sep
        for img_id, img_path in enumerate(tqdm(img_paths, desc=mode), 1):
            label_path = sb.join(img_path.rsplit(sa, 1)).rsplit('.', 1)[0] + '.txt'
            img_path = Path(img_path)

            verify_exists(img_path)
            # NOTE: removed leftover debug print(img_path); tqdm already shows progress.
            imgsrc = cv2.imread(str(img_path))
            height, width = imgsrc.shape[:2]

            dest_file_name = f'{img_id:012d}.jpg'
            save_img_path = target_img_path / dest_file_name

            if img_path.suffix.lower() == ".jpg":
                shutil.copyfile(img_path, save_img_path)
            else:
                # Re-encode non-JPEG sources so the output is uniformly .jpg.
                cv2.imwrite(str(save_img_path), imgsrc)

            images.append({
                'date_captured': '2021',
                'file_name': dest_file_name,
                'id': img_id,
                'height': height,
                'width': width,
            })

            if Path(label_path).exists():
                new_anno = self.read_annotation(label_path, img_id,
                                                height, width)
                if len(new_anno) > 0:
                    annotations.extend(new_anno)
                else:
                    raise ValueError(f'{label_path} is empty')
            else:
                raise FileNotFoundError(f'{label_path} not exists')

        json_data = {
            'info': self.info,
            'images': images,
            'licenses': self.licenses,
            'type': self.type,
            'annotations': annotations,
            'categories': self.categories,
        }
        with open(target_json, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, ensure_ascii=False)

    def read_annotation(self, txt_file, img_id, height, width):
        """Parse one YOLO label file into a list of COCO annotation dicts.

        Each line is ``class cx cy w h`` with normalized coordinates; lines
        with fewer than five fields are skipped.
        """
        annotation = []
        all_info = read_txt(txt_file)
        for label_info in all_info:
            # One annotated object per line.
            label_info = label_info.split()  # split() tolerates repeated whitespace (was split(" "))
            if len(label_info) < 5:
                continue

            category_id, vertex_info = label_info[0], label_info[1:]
            segmentation, bbox, area = self._get_annotation(vertex_info,
                                                            height, width)
            annotation.append({
                'segmentation': segmentation,
                'area': area,
                'iscrowd': 0,
                'image_id': img_id,
                'bbox': bbox,
                'category_id': int(category_id) + 1,  # YOLO ids are 0-based, COCO 1-based
                'id': self.annotation_id,
            })
            self.annotation_id += 1
        return annotation

    @staticmethod
    def _get_annotation(vertex_info, height, width):
        """Convert a normalized YOLO box (cx, cy, w, h) into COCO segmentation,
        bbox ``[x, y, w, h]`` in pixels, and area; corners clamped to the image.
        """
        cx, cy, w, h = [float(i) for i in vertex_info]

        cx = cx * width
        cy = cy * height
        box_w = w * width
        box_h = h * height

        # Top-left corner, clamped into the image.
        x0 = max(cx - box_w / 2, 0)
        y0 = max(cy - box_h / 2, 0)

        # Bottom-right corner, clamped into the image.
        x1 = min(x0 + box_w, width)
        y1 = min(y0 + box_h, height)

        segmentation = [[x0, y0, x1, y0, x1, y1, x0, y1]]
        bbox = [x0, y0, box_w, box_h]
        area = box_w * box_h
        return segmentation, bbox, area
| 230 | + |
| 231 | + |
if __name__ == "__main__":
    # CLI entry point: parse the data-config path and run the conversion.
    arg_parser = argparse.ArgumentParser('Datasets converter from YOLOV5 to COCO')
    arg_parser.add_argument('--cfg_file', type=str, default='datasets/YOLOV5',
                            help='Dataset cfg file')
    cli_args = arg_parser.parse_args()
    YOLOV5CFG2COCO(cli_args.cfg_file).generate()