
Commit 5893f56

Evaluation metrics (#4957)
1 parent 6de0976 commit 5893f56

10 files changed: +879 -9 lines changed


application/backend/app/services/datumaro_converter.py

Lines changed: 22 additions & 6 deletions
@@ -7,9 +7,9 @@
 
 import numpy as np
 import polars as pl
-from datumaro.experimental import Dataset, Sample, bbox_field, image_path_field, label_field
+from datumaro.experimental import Dataset, Sample, bbox_field, image_info_field, image_path_field, label_field
 from datumaro.experimental.categories import LabelCategories
-from datumaro.experimental.fields import polygon_field
+from datumaro.experimental.fields import ImageInfo, polygon_field
 
 from app.core.models.task_type import TaskType
 from app.db.schema import DatasetItemDB
@@ -25,22 +25,26 @@
 
 class DetectionSample(Sample):
     image: str = image_path_field()
+    image_info: ImageInfo = image_info_field()
     bboxes: np.ndarray[Any, Any] = bbox_field(dtype=pl.Int32)
     label: np.ndarray[Any, Any] = label_field(dtype=pl.Int32, is_list=True)
 
 
 class ClassificationSample(Sample):
     image: str = image_path_field()
+    image_info: ImageInfo = image_info_field()
     label: int = label_field(dtype=pl.Int32, is_list=False)
 
 
 class MultilabelClassificationSample(Sample):
     image: str = image_path_field()
-    label: np.ndarray[Any, Any] = label_field(dtype=pl.Int32, is_list=True)
+    image_info: ImageInfo = image_info_field()
+    label: np.ndarray[Any, Any] = label_field(dtype=pl.Int32, multi_label=True)
 
 
 class InstanceSegmentationSample(Sample):
     image: str = image_path_field()
+    image_info: ImageInfo = image_info_field()
     polygons: np.ndarray[Any, Any] = polygon_field(dtype=pl.Float32)
     label: np.ndarray[Any, Any] = label_field(dtype=pl.Int32, is_list=True)
 
@@ -137,6 +141,7 @@ def _convert_sample(
             return None
         return DetectionSample(
             image=image_path,
+            image_info=ImageInfo(width=dataset_item.width, height=dataset_item.height),
             bboxes=np.array(coords),
             label=np.array(labels_indexes),
         )
@@ -176,7 +181,11 @@ def _convert_sample(
             DatasetItemAnnotation.model_validate(annotation) for annotation in dataset_item.annotation_data
         )
         try:
-            return ClassificationSample(image=image_path, label=project_labels_ids.index(annotation.labels[0].id))
+            return ClassificationSample(
+                image=image_path,
+                image_info=ImageInfo(width=dataset_item.width, height=dataset_item.height),
+                label=project_labels_ids.index(annotation.labels[0].id),
+            )
         except ValueError:
             logger.error("Unable to find one of dataset item %s labels in project", dataset_item.id)
             return None
@@ -220,7 +229,11 @@ def _convert_sample(
         except ValueError:
             logger.error("Unable to find one of dataset item %s labels in project", dataset_item.id)
             return None
-        return MultilabelClassificationSample(image=image_path, label=np.array(labels_indexes))
+        return MultilabelClassificationSample(
+            image=image_path,
+            image_info=ImageInfo(width=dataset_item.width, height=dataset_item.height),
+            label=np.array(labels_indexes),
+        )
 
     return _convert_dataset(
         sample_type=MultilabelClassificationSample,
@@ -267,7 +280,10 @@ def _convert_sample(
             logger.error("Unable to find one of dataset item %s labels in project", dataset_item.id)
             return None
         return InstanceSegmentationSample(
-            image=image_path, polygons=np.array(polygons, dtype=np.float32), label=np.array(labels_indexes)
+            image=image_path,
+            image_info=ImageInfo(width=dataset_item.width, height=dataset_item.height),
+            polygons=np.array(polygons, dtype=np.float32),
+            label=np.array(labels_indexes),
         )
 
     return _convert_dataset(
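
For context, a minimal sketch of constructing one of the updated samples after this change, assuming the classes as defined above; the path, image dimensions, box coordinates, and label index below are hypothetical placeholder values (boxes are x1, y1, x2, y2, matching the unpacking in the COCO conversion added later in this commit):

import numpy as np

from app.services.datumaro_converter import DetectionSample
from datumaro.experimental.fields import ImageInfo

# Hypothetical values, for illustration only.
sample = DetectionSample(
    image="images/0001.jpg",                      # hypothetical path
    image_info=ImageInfo(width=640, height=480),  # field added in this commit
    bboxes=np.array([[10, 20, 110, 220]], dtype=np.int32),
    label=np.array([0], dtype=np.int32),
)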
Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from .evaluators import (
+    AveragingMethod,
+    DetectionEvaluator,
+    Evaluator,
+    InstanceSegmentationEvaluator,
+    MultiClassClassificationEvaluator,
+    MultiLabelClassificationEvaluator,
+)
+from .factory import EvaluatorFactory
+
+__all__ = [
+    "AveragingMethod",
+    "DetectionEvaluator",
+    "Evaluator",
+    "EvaluatorFactory",
+    "InstanceSegmentationEvaluator",
+    "MultiClassClassificationEvaluator",
+    "MultiLabelClassificationEvaluator",
+]
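
For reference, a sketch of how these re-exports might be consumed; the package path below is an assumption, since this page does not capture the new file's location:

# Assumed import path; the diff does not show where this __init__.py lives.
from app.services.evaluation import AveragingMethod, EvaluatorFactory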
Lines changed: 254 additions & 0 deletions

@@ -0,0 +1,254 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import ABCMeta, abstractmethod
+from enum import StrEnum
+
+import numpy as np
+from datumaro.experimental import Dataset
+from faster_coco_eval import COCO, COCOeval_faster
+from numpy.typing import NDArray
+from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
+from sklearn.preprocessing import MultiLabelBinarizer
+
+from app.services.datumaro_converter import DetectionSample
+
+
+def datumaro_dataset_to_coco(dataset: Dataset) -> dict:
+    """
+    Convert Datumaro Dataset to COCO format.
+
+    Supports detection (DetectionSample) and instance segmentation (InstanceSegmentationSample) datasets.
+
+    Args:
+        dataset (Dataset): Datumaro Dataset to convert.
+    Returns:
+        dict: COCO formatted dataset.
+    """
+    coco_dataset_dict: dict[str, list] = {"images": [], "annotations": [], "categories": []}
+
+    # Add categories
+    for label_idx, label in enumerate(dataset.schema.attributes["label"].categories.labels):
+        coco_dataset_dict["categories"].append(
+            {
+                "id": label_idx,
+                "name": label,
+            }
+        )
+
+    annotation_id = 1  # COCOeval ignores annotation ID 0
+    for image_id, sample in enumerate(dataset):
+        # Add image entry
+        coco_dataset_dict["images"].append(
+            {
+                "id": image_id,
+                "file_name": sample.image,
+                "width": sample.image_info.width,
+                "height": sample.image_info.height,
+            }
+        )
+
+        # Detection
+        if hasattr(sample, "bboxes") and sample.bboxes is not None:
+            for bbox, label_idx in zip(sample.bboxes, sample.label):
+                x1, y1, x2, y2 = bbox
+                width = x2 - x1
+                height = y2 - y1
+                coco_dataset_dict["annotations"].append(
+                    {
+                        "id": annotation_id,
+                        "image_id": image_id,
+                        "category_id": int(label_idx),
+                        "bbox": [float(x1), float(y1), float(width), float(height)],
+                        "score": 1.0,
+                    }
+                )
+                annotation_id += 1
+
+        # Instance Segmentation
+        if hasattr(sample, "polygons") and sample.polygons is not None:
+            for polygon, label_idx in zip(sample.polygons, sample.label):
+                flattened_polygon = [coord for point in polygon for coord in point]
+                x_coords = [point[0] for point in polygon]
+                y_coords = [point[1] for point in polygon]
+                x_min, x_max = min(x_coords), max(x_coords)
+                y_min, y_max = min(y_coords), max(y_coords)
+                width = x_max - x_min
+                height = y_max - y_min
+                coco_dataset_dict["annotations"].append(
+                    {
+                        "id": annotation_id,
+                        "image_id": image_id,
+                        "category_id": int(label_idx),
+                        "segmentation": [flattened_polygon],
+                        "bbox": [float(x_min), float(y_min), float(width), float(height)],
+                        "score": 1.0,
+                    }
+                )
+                annotation_id += 1
+
+    return coco_dataset_dict
+
+
+class AveragingMethod(StrEnum):
+    MICRO = "micro"
+    MACRO = "macro"
+    WEIGHTED = "weighted"
+    SAMPLES = "samples"
+
+
+class EvaluatorBase(metaclass=ABCMeta):
+    """Base class for all evaluators."""
+
+    def __init__(self, predictions_dataset: Dataset, ground_truth_dataset: Dataset):
+        self.predictions_dataset = predictions_dataset
+        self.ground_truth_dataset = ground_truth_dataset
+
+
+class EvaluatorWithLabelArrays(EvaluatorBase):
+    """Base evaluator for tasks that use label arrays."""
+
+    def __init__(self, predictions_dataset: Dataset, ground_truth_dataset: Dataset):
+        super().__init__(predictions_dataset=predictions_dataset, ground_truth_dataset=ground_truth_dataset)
+        self.__pred_labels: NDArray[np.int_] | None = None
+        self.__gt_labels: NDArray[np.int_] | None = None
+
+    @abstractmethod
+    def _build_label_arrays(self) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
+        """Set up the prediction and ground truth label arrays."""
+
+    @property
+    def _pred_labels(self) -> NDArray[np.int_]:
+        if self.__pred_labels is None:
+            self.__gt_labels, self.__pred_labels = self._build_label_arrays()
+        return self.__pred_labels
+
+    @property
+    def _gt_labels(self) -> NDArray[np.int_]:
+        if self.__gt_labels is None:
+            self.__gt_labels, self.__pred_labels = self._build_label_arrays()
+        return self.__gt_labels
+
+
+class AccuracyEvaluator(EvaluatorWithLabelArrays):
+    """Evaluator for accuracy, precision, recall, and F1 metrics."""
+
+    def __init__(self, predictions_dataset: Dataset, ground_truth_dataset: Dataset):
+        super().__init__(predictions_dataset=predictions_dataset, ground_truth_dataset=ground_truth_dataset)
+
+    def precision(self, averaging_method: AveragingMethod = AveragingMethod.MACRO) -> float:
+        return precision_score(y_true=self._gt_labels, y_pred=self._pred_labels, average=averaging_method.value)
+
+    def recall(self, averaging_method: AveragingMethod = AveragingMethod.MACRO) -> float:
+        return recall_score(y_true=self._gt_labels, y_pred=self._pred_labels, average=averaging_method.value)
+
+    def accuracy(self) -> float:
+        return accuracy_score(y_true=self._gt_labels, y_pred=self._pred_labels)
+
+    def f1_score(self, averaging_method: AveragingMethod = AveragingMethod.MACRO) -> float:
+        return f1_score(y_true=self._gt_labels, y_pred=self._pred_labels, average=averaging_method.value)
+
+
+class ConfusionMatrixEvaluator(EvaluatorWithLabelArrays):
+    """Evaluator for confusion matrix computation."""
+
+    def __init__(self, predictions_dataset: Dataset, ground_truth_dataset: Dataset):
+        super().__init__(predictions_dataset=predictions_dataset, ground_truth_dataset=ground_truth_dataset)
+
+    def confusion_matrix(self) -> np.ndarray:
+        """Compute the confusion matrix."""
+        return confusion_matrix(y_true=self._gt_labels, y_pred=self._pred_labels)
+
+
+class MeanAveragePrecisionEvaluator(EvaluatorBase):
+    """Evaluator for mean average precision (mAP) metrics."""
+
+    def __init__(self, predictions_dataset: Dataset, ground_truth_dataset: Dataset):
+        super().__init__(predictions_dataset=predictions_dataset, ground_truth_dataset=ground_truth_dataset)
+        self.__gt_coco_dict: dict | None = None
+        self.__pred_coco_dict: dict | None = None
+
+    @property
+    def _gt_coco_dict(self) -> dict:
+        if self.__gt_coco_dict is None:
+            self.__gt_coco_dict = datumaro_dataset_to_coco(self.ground_truth_dataset)
+        return self.__gt_coco_dict
+
+    @property
+    def _pred_coco_dict(self) -> dict:
+        if self.__pred_coco_dict is None:
+            self.__pred_coco_dict = datumaro_dataset_to_coco(self.predictions_dataset)
+        return self.__pred_coco_dict
+
+    def mean_average_precision(self) -> dict:
+        gt_coco = COCO(self._gt_coco_dict)
+        pred_coco = gt_coco.loadRes(self._pred_coco_dict["annotations"])
+        coco_evaluator = COCOeval_faster(
+            cocoGt=gt_coco,
+            cocoDt=pred_coco,
+            iouType="bbox" if self.predictions_dataset.dtype is DetectionSample else "segm",
+        )
+        coco_evaluator.run()
+        return coco_evaluator.stats_as_dict
+
+
+class MultiClassClassificationEvaluator(AccuracyEvaluator, ConfusionMatrixEvaluator):
+    """Evaluator for multi-class classification tasks."""
+
+    def __init__(self, predictions_dataset: Dataset, ground_truth_dataset: Dataset):
+        if (
+            predictions_dataset.schema.attributes["label"].annotation.multi_label
+            or ground_truth_dataset.schema.attributes["label"].annotation.multi_label
+        ):
+            raise ValueError(f"{self.__class__.__name__} should not be used for multi-label classification datasets")
+
+        AccuracyEvaluator.__init__(
+            self, predictions_dataset=predictions_dataset, ground_truth_dataset=ground_truth_dataset
+        )
+        ConfusionMatrixEvaluator.__init__(
+            self, predictions_dataset=predictions_dataset, ground_truth_dataset=ground_truth_dataset
+        )
+
+    def _build_label_arrays(self) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
+        pred_labels = np.array([sample.label for sample in self.predictions_dataset], dtype=int)
+        gt_labels = np.array([sample.label for sample in self.ground_truth_dataset], dtype=int)
+        return gt_labels, pred_labels
+
+
+class MultiLabelClassificationEvaluator(AccuracyEvaluator):
+    """Evaluator for multi-label classification tasks."""
+
+    def __init__(self, predictions_dataset: Dataset, ground_truth_dataset: Dataset):
+        if not (
+            predictions_dataset.schema.attributes["label"].annotation.multi_label
+            and ground_truth_dataset.schema.attributes["label"].annotation.multi_label
+        ):
+            raise ValueError(f"{self.__class__.__name__} should only be used for multi-label classification datasets")
+
+        AccuracyEvaluator.__init__(
+            self, predictions_dataset=predictions_dataset, ground_truth_dataset=ground_truth_dataset
+        )
+
+    def _build_label_arrays(self) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
+        mlb = MultiLabelBinarizer()
+        gt_labels_list = [s.label for s in self.ground_truth_dataset]
+        pred_labels_list = [s.label for s in self.predictions_dataset]
+        gt_labels = mlb.fit_transform(gt_labels_list)
+        pred_labels = mlb.transform(pred_labels_list)
+        return gt_labels, pred_labels
+
+
+class DetectionEvaluator(MeanAveragePrecisionEvaluator):
+    """Evaluator for object detection tasks."""
+
+
+class InstanceSegmentationEvaluator(MeanAveragePrecisionEvaluator):
+    """Evaluator for instance segmentation tasks."""
+
+
+Evaluator = (
+    MultiClassClassificationEvaluator
+    | MultiLabelClassificationEvaluator
+    | DetectionEvaluator
+    | InstanceSegmentationEvaluator
+)
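
To illustrate the intended call pattern, a minimal usage sketch; `gt_dataset`, `pred_dataset`, `det_gt`, and `det_pred` are hypothetical names for datumaro Datasets produced by the converters in datumaro_converter.py:

# Classification metrics; datasets assumed to be single-label.
clf_evaluator = MultiClassClassificationEvaluator(
    predictions_dataset=pred_dataset, ground_truth_dataset=gt_dataset
)
print(clf_evaluator.accuracy())
print(clf_evaluator.f1_score(averaging_method=AveragingMethod.MICRO))
print(clf_evaluator.confusion_matrix())

# Detection mAP via faster-coco-eval; datasets assumed to hold DetectionSamples.
det_evaluator = DetectionEvaluator(
    predictions_dataset=det_pred, ground_truth_dataset=det_gt
)
print(det_evaluator.mean_average_precision())  # COCO stats as a dict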
