Skip to content

Commit c4bb8d6

Browse files
committed
refactor: change metric update function to fit evaluation agent expectations
1 parent 01f868b commit c4bb8d6

File tree

3 files changed

+25
-9
lines changed

3 files changed

+25
-9
lines changed

.github/actions/setup-uv-project/action.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,4 @@ runs:
1212
github-token: ${{ github.token }}
1313

1414
- shell: bash
15-
run: uv sync --extra dev --extra evalharness
15+
run: uv sync --extra dev --extra lmharness

src/pruna/evaluation/metrics/metric_evalharness.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,14 @@
1616

1717
from typing import Any, List, Tuple
1818

19+
import torch
1920
from lm_eval.api import metrics # noqa: F401 # needed to register lm-eval metrics
2021
from lm_eval.api import registry as lm_registry
2122

2223
from pruna.evaluation.metrics.metric_stateful import StatefulMetric
2324
from pruna.evaluation.metrics.registry import MetricRegistry
2425
from pruna.evaluation.metrics.result import MetricResult
26+
from pruna.evaluation.metrics.utils import metric_data_processor
2527
from pruna.logging.logger import pruna_logger
2628

2729
METRIC_EVALHARNESS = "lm_eval_metric"
@@ -71,12 +73,26 @@ def __init__(self, metric_name: str, call_type: str = "y_gt") -> None:
7173

7274
pruna_logger.info(f"LMEvalMetric initialized: {metric_name} (higher_is_better={self.higher_is_better})")
7375

74-
def update(self, preds, refs) -> None:
75-
"""Accumulate predictions and references for later aggregation."""
76-
if len(preds) != len(refs):
77-
raise ValueError(f"Preds and refs length mismatch: {len(preds)} vs {len(refs)}")
78-
79-
for ref, pred in zip(refs, preds):
76+
def update(
77+
self,
78+
x: List[Any] | torch.Tensor,
79+
gt: List[Any] | torch.Tensor,
80+
outputs: List[Any] | torch.Tensor,
81+
) -> None:
82+
"""
83+
Accumulate predictions and references for later aggregation.
84+
85+
Parameters
86+
----------
87+
x : List[Any] | torch.Tensor
88+
Input data.
89+
gt : List[Any] | torch.Tensor
90+
Ground truth data.
91+
outputs : List[Any] | torch.Tensor
92+
Output data.
93+
"""
94+
inputs = metric_data_processor(x, gt, outputs, self.call_type)
95+
for ref, pred in zip(inputs[0], inputs[1]):
8096
raw_item = self.metric_fn((ref, pred))
8197
self.pairs.append(raw_item)
8298

tests/evaluation/test_evalharness_metrics.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def test_lm_eval_metric_bleu_like():
1818
preds = ["the cat is on mat", "a quick brown fox"]
1919

2020
metric = LMEvalMetric(metric_name="bleu")
21-
metric.update(preds, refs)
21+
metric.update(refs, refs, preds)
2222
result = metric.compute()
2323

2424
assert isinstance(result, MetricResult)
@@ -49,7 +49,7 @@ def test_lm_eval_metric_length_mismatch():
4949
preds = ["a", "b"]
5050

5151
with pytest.raises(ValueError, match="Preds and refs length mismatch"):
52-
metric.update(preds, refs)
52+
metric.update(refs, refs, preds)
5353

5454

5555
@pytest.mark.cpu

0 commit comments

Comments
 (0)