diff --git a/mmocr/evaluation/evaluator/multi_datasets_evaluator.py b/mmocr/evaluation/evaluator/multi_datasets_evaluator.py
index f01aa70f6..e0c0d8b1f 100644
--- a/mmocr/evaluation/evaluator/multi_datasets_evaluator.py
+++ b/mmocr/evaluation/evaluator/multi_datasets_evaluator.py
@@ -92,9 +92,32 @@ def evaluate(self, size: int) -> dict:
                 metrics_results.update(metric_results)
             metric.results.clear()
         if is_main_process():
-            metrics_results = [metrics_results]
+            averaged_results = [self.average_results(metrics_results)]
         else:
-            metrics_results = [None]  # type: ignore
+            averaged_results = [None]
+
+        metrics_results = [metrics_results]
         broadcast_object_list(metrics_results)
+        broadcast_object_list(averaged_results)
+        results = {
+            'metric_results': metrics_results[0],
+            'averaged_results': averaged_results[0]
+        }
 
-        return metrics_results[0]
+        return results
+
+    def average_results(self, metrics_results: dict) -> dict:
+        """Compute the average of metric results across all datasets.
+
+        Args:
+            metrics_results (dict): Evaluation results of all metrics.
+
+        Returns:
+            dict: Averaged evaluation results of all metrics.
+        """
+        averaged_results = {}
+        num_datasets = len(self.dataset_prefixes)
+        for metric_name, metric_result in metrics_results.items():
+            metric_avg = metric_result / num_datasets
+            averaged_results[metric_name] = metric_avg
+        return averaged_results
diff --git a/tests/test_evaluation/test_evaluator/test_multi_datasets_evaluator.py b/tests/test_evaluation/test_evaluator/test_multi_datasets_evaluator.py
index 0b2ae3d73..f9b0f1410 100644
--- a/tests/test_evaluation/test_evaluator/test_multi_datasets_evaluator.py
+++ b/tests/test_evaluation/test_evaluator/test_multi_datasets_evaluator.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import math
+from collections import OrderedDict
 from typing import Dict, List, Optional
 from unittest import TestCase
 
@@ -75,7 +76,7 @@ def generate_test_results(size, batch_size, pred, label):
         predictions = [
             BaseDataElement(pred=pred, label=label) for _ in range(bs)
         ]
-        yield (data_batch, predictions)
+        yield data_batch, predictions
 
 
 class TestMultiDatasetsEvaluator(TestCase):
@@ -96,11 +97,11 @@ def test_composed_metrics(self):
                 size, batch_size, pred=1, label=1):
             evaluator.process(predictions, data_samples)
 
-        metrics = evaluator.evaluate(size=size)
+        metrics_results = evaluator.evaluate(size=size)['metric_results']
 
-        self.assertAlmostEqual(metrics['Fake/Toy/accuracy'], 1.0)
-        self.assertAlmostEqual(metrics['Fake/Toy/mAP'], 0.0)
-        self.assertEqual(metrics['Fake/Toy/size'], size)
+        self.assertAlmostEqual(metrics_results['Fake/Toy/accuracy'], 1.0)
+        self.assertAlmostEqual(metrics_results['Fake/Toy/mAP'], 0.0)
+        self.assertEqual(metrics_results['Fake/Toy/size'], size)
         with self.assertWarns(Warning):
             evaluator.evaluate(size=0)
@@ -123,6 +124,26 @@ def test_composed_metrics(self):
         for data_samples, predictions in generate_test_results(
                 size, batch_size, pred=1, label=1):
             evaluator.process(predictions, data_samples)
-        metrics = evaluator.evaluate(size=size)
-        self.assertIn('Fake/Toy/accuracy', metrics)
-        self.assertIn('Fake/accuracy', metrics)
+        metrics_results = evaluator.evaluate(size=size)['metric_results']
+        self.assertIn('Fake/Toy/accuracy', metrics_results)
+        self.assertIn('Fake/accuracy', metrics_results)
+
+        metrics_results = OrderedDict({
+            'dataset1/metric1/accuracy': 0.9,
+            'dataset1/metric2/f1_score': 0.8,
+            'dataset2/metric1/accuracy': 0.85,
+            'dataset2/metric2/f1_score': 0.75
+        })
+
+        evaluator = MultiDatasetsEvaluator(
+            cfg, dataset_prefixes=['dataset1', 'dataset2'])
+        averaged_results = evaluator.average_results(metrics_results)
+
+        expected_averaged_results = {
+            'dataset1/metric1/accuracy': 0.45,
+            'dataset1/metric2/f1_score': 0.4,
+            'dataset2/metric1/accuracy': 0.425,
+            'dataset2/metric2/f1_score': 0.375
+        }
+
+        self.assertEqual(averaged_results, expected_averaged_results)