Add the ability to write Evals

mrinalwadhwa · mrinalwadhwa · commit 6762f0679d41 · 2025-08-11T08:31:27.000-07:00
diff --git a/source/python/python/autonomy/__init__.py b/source/python/python/autonomy/__init__.py
@@ -1,8 +1,15 @@
+from .evals import Eval, Metric, TestCase, TestOk, TestError
 from .models import Model
 
 __doc__ = ""
 
 __all__ = [
+  # from .evals: the Eval runner, the Metric protocol, and the test case/result types
+  "Eval",
+  "Metric",
+  "TestCase",
+  "TestOk",
+  "TestError",
   # from .models
   "Model"
 ]
diff --git a/source/python/python/autonomy/evals/__init__.py b/source/python/python/autonomy/evals/__init__.py
@@ -0,0 +1,9 @@
+from .eval import Eval, Metric, TestCase, TestOk, TestError
+
+__all__ = [  # public names of the evals package; all of them are imported from .eval
+  "Eval",
+  "Metric",
+  "TestCase",
+  "TestOk",
+  "TestError",
+]
diff --git a/source/python/python/autonomy/evals/eval.py b/source/python/python/autonomy/evals/eval.py
@@ -0,0 +1,115 @@
+from dataclasses import dataclass, field
+from typing import Protocol, Any, Callable, Dict, List, Optional, Union
+
+
+class Metric(Protocol):
+  """Structural interface for a named score computed per test case.
+
+  `Eval.run` uses `name` as the key for per-case and aggregated scores,
+  calls `calculate` for each test case whose subject call succeeded, and
+  calls `aggregate` with the list of per-case values of this metric.
+  """
+
+  @property
+  def name(self) -> str: ...
+
+  def calculate(self, expected_output: Any, observed_output: Any) -> float: ...
+
+  def aggregate(self, values: List[float]) -> float: ...
+
+
+@dataclass
+class TestCase:
+  """An input for the subject paired with the output expected for it."""
+
+  # The class name starts with "Test", so pytest would try to collect it (and
+  # warn about its __init__) in any test module that imports it; opt out.
+  __test__ = False
+
+  input: Any
+  expected_output: Any
+
+
+@dataclass
+class TestOk:
+  """Result `Eval.run` records when the subject and every metric ran without raising."""
+
+  __test__ = False  # not a pytest test class despite the "Test" prefix
+
+  test_case: TestCase
+  output: Any
+  metrics: Dict[str, float] = field(default_factory=dict)  # metric name -> value
+
+
+@dataclass
+class TestError:
+  """Result `Eval.run` records when the subject or a metric raised an exception."""
+
+  __test__ = False  # not a pytest test class despite the "Test" prefix
+
+  test_case: TestCase
+  error: str  # str() of the caught exception, or its repr() when str() is empty
+
+
+class Eval:
+  """Runs a subject over test cases and scores its outputs with metrics."""
+
+  def run(
+    self, subject: Callable, cases: List[TestCase], metrics: List[Metric]
+  ) -> tuple[Dict[str, float], List[Union[TestOk, TestError]]]:
+    """Call `subject` on every case's input and score the outputs.
+
+    Args:
+      subject: Callable invoked as `subject(case.input)`.
+      cases: Test cases to run; must not be empty.
+      metrics: Metrics calculated for every case; must not be empty.
+
+    Returns:
+      A pair `(aggregated, results)`. `aggregated` maps each metric name to
+      that metric's `aggregate` over the values from successful cases, or
+      NaN when no case succeeded. `results` has one `TestOk` or `TestError`
+      per case, in the order of `cases`.
+
+    Raises:
+      ValueError: If `subject` is not callable, or if `cases` or `metrics`
+        is empty.
+    """
+    if not callable(subject):
+      raise ValueError("subject is not callable")
+
+    if not cases:
+      raise ValueError("test case list is empty")
+
+    if not metrics:
+      raise ValueError("list of metrics is empty")
+
+    results: List[Union[TestOk, TestError]] = []
+    for case in cases:
+      try:
+        observed_output = subject(case.input)
+
+        # Calculate all metrics for this test case
+        calculated = {}
+        for m in metrics:
+          calculated[m.name] = m.calculate(case.expected_output, observed_output)
+
+        results.append(TestOk(test_case=case, output=observed_output, metrics=calculated))
+      except Exception as e:
+        # A failure in the subject or in any metric fails the whole case.
+        # Exceptions raised without a message stringify to "", so fall back
+        # to repr() to keep at least the exception type in the report.
+        results.append(TestError(test_case=case, error=str(e) or repr(e)))
+
+    # Aggregate each metric across all test cases
+    aggregated = {}
+    for m in metrics:
+      values = [
+        result.metrics[m.name]
+        for result in results
+        if isinstance(result, TestOk) and m.name in result.metrics
+      ]
+      # Nothing to aggregate when no case succeeded: report NaN instead of
+      # calling aggregate() on an empty list.
+      aggregated[m.name] = m.aggregate(values) if values else float("nan")
+
+    return aggregated, results