|
| 1 | +#!/usr/bin/env python |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +""" |
| 5 | +Name: label_to_csv.py |
| 6 | +Author: Justin Ruan |
| 7 | + |
| 8 | +Time: 2021.02.06 |
| 9 | +""" |
| 10 | + |
| 11 | +import os |
| 12 | +import argparse |
| 13 | +import codecs |
| 14 | + |
| 15 | +import pandas as pd |
| 16 | + |
| 17 | + |
def txt2csv(location, training_dir, path_prefix, labels=None):
    """Convert YOLO-style .txt label files in *location* to AutoML CSV rows.

    Each line of a label file is "<class_idx> <x_center> <y_center> <width>
    <height>" with coordinates already normalized to [0, 1].

    Args:
        location: Directory containing the per-image .txt label files.
        training_dir: Dataset split name for the first CSV column.
        path_prefix: Cloud prefix; rows reference "{path_prefix}/{image}.jpg".
        labels: Optional class-index -> class-name sequence.  Defaults to the
            module-level ``class_labels`` loaded by the CLI entry point, so
            existing 3-argument callers keep working.

    Returns:
        List of 11-element rows:
        [set, path, label, x_min, y_min, "", "", x_max, y_max, "", ""].
    """
    if labels is None:
        # Backward-compatible fallback to the global built in __main__.
        labels = class_labels

    rows = []

    # sorted() makes the row order deterministic across platforms
    # (os.listdir order is arbitrary).
    for file in sorted(os.listdir(location)):
        # Skip non-label files and the class-name listing itself.
        # (Original used bitwise `|` on booleans; `or` is the idiom.)
        if not file.endswith(".txt") or file == "classes.txt":
            continue

        # Read the whitespace-separated label file.
        df_txt = pd.read_csv(f"{location}/{file}", sep=" ", header=None)

        # gs://prefix/name/{image_name} — one image per label file.
        cloud_path = f"{path_prefix}/{os.path.splitext(file)[0]}.jpg"

        for _, row in df_txt.iterrows():
            # Convert center/size to corner coordinates, clamped to [0, 1].
            x_min = min(max(0.0, row[1] - row[3] / 2), 1.0)
            y_min = min(max(0.0, row[2] - row[4] / 2), 1.0)
            x_max = min(max(0.0, row[1] + row[3] / 2), 1.0)
            y_max = min(max(0.0, row[2] + row[4] / 2), 1.0)

            # Lower-left / upper-right corners are optional -> left blank.
            rows.append([str(training_dir), cloud_path,
                         labels[int(row[0])],
                         x_min, y_min, "", "",
                         x_max, y_max, "", ""])

    return rows
| 68 | + |
| 69 | + |
def xml2csv(location, training_dir, path_prefix):
    """Convert Pascal-VOC .xml annotation files in *location* to CSV rows.

    Pixel coordinates from each <bndbox> are normalized by the image
    <size> so the output matches the txt converter's [0, 1] range.

    Args:
        location: Directory containing the .xml annotation files.
        training_dir: Dataset split name for the first CSV column.
        path_prefix: Cloud prefix; rows reference "{path_prefix}/{image}.jpg".

    Returns:
        List of 11-element rows:
        [set, path, label, x_min, y_min, "", "", x_max, y_max, "", ""].
    """
    # Imported lazily: only the xml mode needs the parser.
    import xml.etree.ElementTree as ET

    rows = []

    for entry in os.listdir(location):
        if not entry.endswith(".xml"):
            continue

        root = ET.parse(f"{location}/{entry}").getroot()

        # Image dimensions used to normalize the pixel bounding boxes.
        size_node = root.find("size")
        img_w = float(size_node.find("width").text)
        img_h = float(size_node.find("height").text)

        # gs://prefix/name/{image_name}
        image_uri = f"{path_prefix}/{os.path.splitext(entry)[0]}.jpg"

        for obj in root.findall("object"):
            box = obj.find("bndbox")
            # Lower-left / upper-right corners are optional -> left blank.
            rows.append([
                str(training_dir),
                image_uri,
                obj.find("name").text,
                float(box.find("xmin").text) / img_w,
                float(box.find("ymin").text) / img_h,
                "", "",
                float(box.find("xmax").text) / img_w,
                float(box.find("ymax").text) / img_h,
                "", "",
            ])

    return rows
| 130 | + |
| 131 | + |
if __name__ == "__main__":
    # CLI: walk a <location>/<split>/<class>/ directory tree of label files
    # and emit one CSV in AutoML Vision object-detection import format.
    arg_p = argparse.ArgumentParser()
    arg_p.add_argument("-p", "--prefix",
                       required=True,
                       type=str,
                       help="Bucket of the cloud storage path")
    arg_p.add_argument("-l", "--location",
                       type=str,
                       required=True,
                       help="Location of the label files")
    arg_p.add_argument("-m", "--mode",
                       type=str,
                       required=True,
                       help="'xml' for converting from xml and 'txt' for converting from txt")
    arg_p.add_argument("-o", "--output",
                       type=str,
                       default="res.csv",
                       help="Output name of csv file")
    arg_p.add_argument("-c", "--classes",
                       type=str,
                       default=os.path.join("..", "data", "predefined_classes.txt"),
                       help="Label classes path")
    args = vars(arg_p.parse_args())

    # BUGFIX: validate --mode up front.  The original only checked it inside
    # the directory loop, so a bad mode passed silently on an empty tree.
    if args["mode"] not in ("txt", "xml"):
        print("Wrong argument for convert mode.\n"
              "'xml' for converting from xml to csv\n"
              "'txt' for converting from txt to csv")
        exit(1)

    # Load the predefined classes (index -> label) used by txt mode.
    class_labels = []
    if os.path.exists(args["classes"]):
        with codecs.open(args["classes"], 'r', 'utf8') as f:
            for line in f:
                class_labels.append(line.strip())
    else:  # Exit if errors occurred
        print(f"File: {args['classes']} not exists")
        exit(1)

    # Prefix of the cloud storage, e.g. gs://bucket
    ori_prefix = f"gs://{args['prefix']}"

    # Rows for the final csv file
    res = []
    for training_type_dir in os.listdir(args["location"]):
        dir_name = f"{args['location']}/{training_type_dir}"

        # Skip stray files at the split level
        if not os.path.isdir(dir_name):
            continue

        for class_type_dir in os.listdir(dir_name):
            class_dir = f"{dir_name}/{class_type_dir}"

            # BUGFIX: the original re-checked dir_name (the parent) here,
            # so stray files inside a split directory were not skipped.
            if not os.path.isdir(class_dir):
                continue

            # Images for this class live under gs://bucket/<class>/
            prefix = f"{ori_prefix}/{class_type_dir}"

            # Convert the chosen extension to csv
            if args["mode"] == "txt":
                res.extend(txt2csv(class_dir, training_type_dir, prefix))
            else:  # "xml" — validated above
                res.extend(xml2csv(class_dir, training_type_dir, prefix))

    # Column names only label the in-memory frame (duplicates are the
    # 4-corner AutoML layout); header=False keeps them out of the file.
    res_csv = pd.DataFrame(res,
                           columns=["set", "path", "label",
                                    "x_min", "y_min",
                                    "x_max", "y_min",
                                    "x_max", "y_max",
                                    "x_min", "y_max"])
    # BUGFIX: honour --output instead of the hard-coded "res.csv".
    res_csv.to_csv(args["output"], index=False, header=False)
0 commit comments