
Commit 4c57007

feat(evals): add run_batched_evaluation (#1436)

1 parent: 94b0211
16 files changed: +3242 -230 lines

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion

@@ -88,7 +88,7 @@ jobs:
       - name: Setup node (for langfuse server)
         uses: actions/setup-node@v3
         with:
-          node-version: 20
+          node-version: 24

       - name: Cache langfuse server dependencies
         uses: actions/cache@v3

.pre-commit-config.yaml

Lines changed: 2 additions & 1 deletion

@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.2
+    rev: v0.14.4
     hooks:
       # Run the linter and fix
       - id: ruff
@@ -10,6 +10,7 @@ repos:
       # Run the formatter.
       - id: ruff-format
         types_or: [python, pyi, jupyter]
+        args: [--config=ci.ruff.toml]

   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.18.2

langfuse/__init__.py

Lines changed: 14 additions & 0 deletions

@@ -1,5 +1,13 @@
 """.. include:: ../README.md"""

+from langfuse.batch_evaluation import (
+    BatchEvaluationResult,
+    BatchEvaluationResumeToken,
+    CompositeEvaluatorFunction,
+    EvaluatorInputs,
+    EvaluatorStats,
+    MapperFunction,
+)
 from langfuse.experiment import Evaluation

 from ._client import client as _client_module
@@ -41,6 +49,12 @@
     "LangfuseRetriever",
     "LangfuseGuardrail",
     "Evaluation",
+    "EvaluatorInputs",
+    "MapperFunction",
+    "CompositeEvaluatorFunction",
+    "EvaluatorStats",
+    "BatchEvaluationResumeToken",
+    "BatchEvaluationResult",
     "experiment",
     "api",
 ]
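
With this change the batch-evaluation helpers become part of the package's public surface, so they can be imported from the package root rather than from langfuse.batch_evaluation. A minimal sketch of what the export list above enables; only the import path is confirmed by the diff, and the trailing comments describe the roles the names suggest rather than documented behavior:

from langfuse import (
    BatchEvaluationResult,       # presumably the result object of a batched evaluation run
    BatchEvaluationResumeToken,  # presumably a token for resuming an interrupted run
    CompositeEvaluatorFunction,  # type for evaluators that combine item-level evaluations
    EvaluatorInputs,             # per the datasets.py docstring below: input/output/expected_output/metadata
    EvaluatorStats,              # presumably per-evaluator statistics
    MapperFunction,              # presumably maps fetched items to EvaluatorInputs
)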

langfuse/_client/client.py

Lines changed: 296 additions & 1 deletion
Large diffs are not rendered by default.
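
The bulk of the feature (296 added lines) is in this file, but the commit page does not render it; per the commit title it adds run_batched_evaluation to the client. Below is a rough, hypothetical sketch of how the exported pieces might fit together. The method name comes from the commit title; the keyword arguments, the EvaluatorInputs fields, and the Evaluation(name=..., value=...) constructor are assumptions for illustration, not taken from the unrendered diff:

from langfuse import Evaluation, EvaluatorInputs, Langfuse

client = Langfuse()

# Hypothetical mapper: turn a fetched item (e.g. a trace) into evaluator inputs.
# Field names are assumed from the docstrings elsewhere in this commit.
def to_inputs(*, item, **kwargs):
    return EvaluatorInputs(
        input=item.input,
        output=item.output,
        expected_output=None,
        metadata=item.metadata,
    )

# Hypothetical item-level evaluator: score 1.0 when the output is non-empty.
def non_empty(*, input, output, expected_output, metadata, **kwargs):
    return Evaluation(name="non_empty", value=1.0 if output else 0.0)

# Hypothetical call; the keyword names here are guesses based on the exported types.
result = client.run_batched_evaluation(
    mapper=to_inputs,
    evaluators=[non_empty],
)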

langfuse/_client/datasets.py

Lines changed: 7 additions & 0 deletions

@@ -4,6 +4,7 @@

 from opentelemetry.util._decorator import _agnosticcontextmanager

+from langfuse.batch_evaluation import CompositeEvaluatorFunction
 from langfuse.experiment import (
     EvaluatorFunction,
     ExperimentResult,
@@ -204,6 +205,7 @@ def run_experiment(
         description: Optional[str] = None,
         task: TaskFunction,
         evaluators: List[EvaluatorFunction] = [],
+        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
         run_evaluators: List[RunEvaluatorFunction] = [],
         max_concurrency: int = 50,
         metadata: Optional[Dict[str, Any]] = None,
@@ -234,6 +236,10 @@ def run_experiment(
                 .metadata attributes. Signature should be: task(*, item, **kwargs) -> Any
             evaluators: List of functions to evaluate each item's output individually.
                 These will have access to the item's expected_output for comparison.
+            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
+                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
+                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
+                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
             run_evaluators: List of functions to evaluate the entire experiment run.
                 Useful for computing aggregate statistics across all dataset items.
             max_concurrency: Maximum number of concurrent task executions (default: 50).
@@ -411,6 +417,7 @@ def content_diversity(*, item_results, **kwargs):
             data=self.items,
             task=task,
             evaluators=evaluators,
+            composite_evaluator=composite_evaluator,
             run_evaluators=run_evaluators,
             max_concurrency=max_concurrency,
             metadata=metadata,
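
The new composite_evaluator hook is described in the docstring above: it receives the same inputs as the item-level evaluators plus the list of their evaluations. A short sketch of the kind of function that description implies; the keyword names follow the docstring's wording, and the Evaluation(name=..., value=...) constructor is an assumption:

from langfuse import Evaluation

# Hypothetical composite evaluator: collapse item-level evaluations into one pass/fail score.
def pass_fail(*, input, output, expected_output, metadata, evaluations, **kwargs):
    scores = [e.value for e in evaluations if isinstance(e.value, (int, float))]
    # Illustrative rule: pass only if every numeric item-level score is at least 0.5.
    passed = bool(scores) and all(s >= 0.5 for s in scores)
    return Evaluation(name="pass_fail", value=1.0 if passed else 0.0)

It would then be wired in through the new parameter shown in the final hunk above, e.g. dataset.run_experiment(task=..., evaluators=[...], composite_evaluator=pass_fail).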

langfuse/_client/observe.py

Lines changed: 6 additions & 2 deletions

@@ -589,7 +589,9 @@ def __next__(self) -> Any:
             raise  # Re-raise StopIteration

         except Exception as e:
-            self.span.update(level="ERROR", status_message=str(e) or type(e).__name__).end()
+            self.span.update(
+                level="ERROR", status_message=str(e) or type(e).__name__
+            ).end()

             raise

@@ -654,6 +656,8 @@ async def __anext__(self) -> Any:

             raise  # Re-raise StopAsyncIteration
         except Exception as e:
-            self.span.update(level="ERROR", status_message=str(e) or type(e).__name__).end()
+            self.span.update(
+                level="ERROR", status_message=str(e) or type(e).__name__
+            ).end()

             raise
