
Commit 1585ddb

new: evaluation mode

1 parent 8499cbb

File tree: 13 files changed, +869 −60 lines

README.md

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ Nerve is an ADK ( _Agent Development Kit_ ) designed to be a simple yet powerful
 - Define agents as simple YAML files.
 - Simple CLI for creating, installing, and running agents with step-by-step guidance.
 - Comes with a library of predefined, built-in tools for common tasks.
-- Easily integrate a vast amount of [MCP servers](https://github.com/punkpeye/awesome-mcp-servers), or create your own custom tools.
+- Seamlessly [integrated with MCP](https://github.com/evilsocket/nerve/blob/main/docs/mcp.md).
 - Support for [any model provider](https://docs.litellm.ai/docs/providers).
 
 ## Quick Start

docs/evaluation.md

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
# Evaluation Mode

Nerve provides an evaluation mode that allows you to test your agent's performance against a set of predefined test cases. This is useful for:

- Validating agent behavior during development
- Regression testing after making changes
- Benchmarking different models
- Collecting metrics on agent performance

An evaluation consists of an agent and a corresponding set of test cases. These cases can be defined in a `cases.yml` file, stored in a `cases.parquet` file, or organized as individual entries within separate folders.

Regardless of how you organize the evaluation cases, the agent will be executed for each one, with a specified number of runs per case. Task completion data and runtime statistics will be collected and saved to an output file.

```bash
nerve eval path/to/evaluation --output results.json
```
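
The three layouts described below are looked up in a fixed order: a `cases` folder first, then `cases.yml`, then `cases.parquet`. As a minimal sketch of that lookup, mirroring `CaseIterator` from `nerve/cli/eval.py` in this commit (the `detect_cases_source` helper itself is hypothetical):

```python
# Sketch: which source of evaluation cases nerve will pick for a given folder,
# following the same precedence as CaseIterator in this commit.
import pathlib


def detect_cases_source(eval_path: pathlib.Path) -> str:
    if (eval_path / "cases").exists():
        return "folder"
    if (eval_path / "cases.yml").exists():
        return "yaml"
    if (eval_path / "cases.parquet").exists():
        return "parquet"
    raise FileNotFoundError(f"no cases found in {eval_path}")


print(detect_cases_source(pathlib.Path("path/to/evaluation")))
```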

## YAML

You can place a `cases.yml` file in the agent folder with the different test cases. For instance, this is used in the [ab evaluation](https://github.com/evilsocket/eval-ab), where the evaluation cases look like:

```yaml
- level1:
    program: "A# #A"
- level2:
    program: "A# #B B# #A"
# ... and so on
```

These cases are interpolated in the agent prompt:

```yaml
task: >
  ## Problem

  Now, consider the following program:

  {{ program }}

  Fully compute it, step by step and then submit the final result.
```
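
The `{{ program }}` placeholder is Jinja-style templating, the same syntax as the `{% for %}` loop in the Parquet example below. As a rough sketch of what the interpolation amounts to, using the `jinja2` library directly rather than nerve's own renderer (which may differ):

```python
# Sketch only: render the task template for a single case; nerve performs an
# equivalent substitution internally when it runs each evaluation case.
from jinja2 import Template

task_template = Template(
    "## Problem\n\n"
    "Now, consider the following program:\n\n"
    "{{ program }}\n\n"
    "Fully compute it, step by step and then submit the final result."
)

# the case's variables (e.g. `program` from cases.yml) are the template context
print(task_template.render(program="A# #A"))
```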

## Parquet

For more complex test suites you can use a `cases.parquet` file. An example of this is [this MMLU evaluation](https://github.com/evilsocket/eval-mmlu), which loads data from the [MMLU (dev) dataset](https://huggingface.co/datasets/cais/mmlu) and uses it in the agent prompt:

```yaml
task: >
  ## Question

  {{ question }}

  Use the select_choice tool to select the correct answer from this list of possible answers:

  {% for choice in choices %}
  - [{{ loop.index0 }}] {{ choice }}
  {% endfor %}
```
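
Each Parquet column becomes a variable available to the prompt (the loader in this commit reads the file with `fastparquet` and turns every row into a dict). A sketch of producing such a file with pandas; the column names and rows here are invented for illustration:

```python
# Sketch: write a cases.parquet file whose columns ("question", "expected")
# become template variables like {{ question }}; adapt the schema to your agent.
import pandas as pd

cases = pd.DataFrame(
    [
        {"question": "What is 2 + 2?", "expected": "4"},
        {"question": "What is the capital of France?", "expected": "Paris"},
    ]
)

cases.to_parquet("cases.parquet", index=False)
```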

## Folders

You can also split your cases into a `cases` folder, as in [the regex evaluation](https://github.com/evilsocket/eval-regex), where each input file is organized as `cases/level0`, `cases/level1`, and so on, and [read at runtime](https://github.com/evilsocket/eval-regex/blob/main/tools.py#L11) by the tools.
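
For folder-based cases, this commit exposes each entry's name and absolute path to the run as the `CASE_NAME` and `CASE_PATH` variables (see `CaseIterator._from_folder` in `nerve/cli/eval.py`). A sketch of generating such a layout; the file contents below are invented for illustration:

```python
# Sketch: create a folder-based evaluation layout (cases/level0, cases/level1, ...).
# Each entry becomes one case, and nerve passes its name/path to the run as
# CASE_NAME and CASE_PATH.
import pathlib

cases_dir = pathlib.Path("path/to/evaluation/cases")
cases_dir.mkdir(parents=True, exist_ok=True)

for level, payload in enumerate(["ab+", "a|b", "[0-9]+"]):
    (cases_dir / f"level{level}").write_text(payload)
```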

nerve/cli/create.py

Lines changed: 0 additions & 3 deletions
@@ -104,9 +104,6 @@ def _collect_user_prompts() -> list[str]:
     return sorted(prompts)
 
 
-# TODO: create a doc page for this.
-
-
 async def create_agent(path: pathlib.Path, task: str | None = None, default: bool = False) -> None:
     if path.exists():
         print(f"❌ {path} already exists.")

nerve/cli/eval.py

Lines changed: 196 additions & 41 deletions
@@ -1,18 +1,22 @@
 import asyncio
-import json
 import pathlib
 import time
 import typing as t
+from enum import Enum
 
-from termcolor import colored
 import typer
+from fastparquet import ParquetFile  # type: ignore[import-untyped]
 from loguru import logger
+from natsort import natsorted
+from pydantic import BaseModel
+from pydantic_yaml import parse_yaml_file_as
+from termcolor import colored
 from typer_di import Depends, TyperDI
 
 import nerve
 from nerve.cli.defaults import DEFAULT_EVAL_RUNS
 from nerve.cli.utils import _get_run_args
-from nerve.models import Configuration
+from nerve.models import Configuration, Evaluation
 from nerve.runtime import logging
 from nerve.server.runner import Arguments, Output, Runner
 
@@ -23,16 +27,109 @@
 )
 
 
+class CaseIterator:
+    class Mode(Enum):
+        # cases have their own individual folders
+        FOLDER = 0
+        # cases are listed in a single file
+        YAML = 1
+        # parquet file
+        PARQUET = 2
+
+    class Case(BaseModel):
+        name: str
+        input_state: dict[str, t.Any]
+
+    def _from_folder(self, cases_folder: pathlib.Path) -> None:
+        logger.info(f"📊 loading evaluation cases from folder {cases_folder}")
+        self._mode = self.Mode.FOLDER
+        for path in natsorted(cases_folder.glob("*")):
+            self._cases.append(
+                CaseIterator.Case(
+                    name=path.name,
+                    input_state={
+                        "CASE_NAME": path.name,
+                        "CASE_PATH": path.absolute().as_posix(),
+                    },
+                )
+            )
+
+    def _from_yaml(self, cases_file: pathlib.Path) -> None:
+        logger.info(f"📊 loading evaluation cases from file {cases_file}")
+        self._mode = self.Mode.YAML
+        for case in parse_yaml_file_as(list[dict[str, dict[str, t.Any]]], cases_file):  # type: ignore[type-var]
+            for case_name, input_state in case.items():
+                self._cases.append(CaseIterator.Case(name=case_name, input_state=input_state))
+
+    def _from_parquet(self, cases_file: pathlib.Path) -> None:
+        logger.info(f"📊 loading evaluation cases from parquet file {cases_file}")
+        self._mode = self.Mode.PARQUET
+        pf = ParquetFile(cases_file)
+        df = pf.to_pandas()
+        num_rows = len(df)
+        for index, row in df.iterrows():
+            self._cases.append(
+                CaseIterator.Case(
+                    name=f"case_{index}_of_{num_rows}",
+                    input_state=row.to_dict(),
+                )
+            )
+
+    def __init__(self, eval_path: pathlib.Path):
+        self._eval_path = eval_path
+        self._cases: list[CaseIterator.Case] = []
+        self._mode = self.Mode.FOLDER
+
+        cases_folder = self._eval_path / "cases"
+        cases_file_yml = self._eval_path / "cases.yml"
+        cases_file_parquet = self._eval_path / "cases.parquet"
+
+        if cases_folder.exists():
+            self._from_folder(cases_folder)
+
+        elif cases_file_yml.exists():
+            self._from_yaml(cases_file_yml)
+
+        elif cases_file_parquet.exists():
+            self._from_parquet(cases_file_parquet)
+
+        if not self._cases:
+            logger.error(f"no cases found in {self._eval_path}")
+            raise typer.Abort()
+
+    def __iter__(self) -> t.Iterator["CaseIterator.Case"]:
+        return iter(self._cases)
+
+    def __len__(self) -> int:
+        return len(self._cases)
+
+
+def _get_output_path(args: Arguments) -> pathlib.Path:
+    output_name = f"{args.generator}-{args.input_path.name}"
+    sanitized = ""
+    for char in output_name:
+        if char.isalnum() or char in "._- ":
+            sanitized += char
+        else:
+            sanitized += "-"
+
+    while "--" in sanitized:
+        sanitized = sanitized.replace("--", "-")
+
+    return pathlib.Path(f"{sanitized}.json")
+
+
 @cli.command(
     context_settings={"allow_extra_args": True, "ignore_unknown_options": True, "help_option_names": ["-h", "--help"]},
     help="Execute an agent or a workflow in evaluation mode.",
 )
 def eval(
     args: Arguments = Depends(_get_run_args),
     runs: t.Annotated[int, typer.Option("--runs", "-r", help="Number of runs per case.")] = DEFAULT_EVAL_RUNS,
-    output: t.Annotated[pathlib.Path, typer.Option("--output", "-o", help="Path to save the output.")] = pathlib.Path(
-        "eval.json"
-    ),
+    output: t.Annotated[
+        pathlib.Path | None,
+        typer.Option("--output", "-o", help="Path to save the output. If not set will be auto generated."),
+    ] = None,
 ) -> None:
     logging.init(
         args.log_path,
@@ -41,61 +138,119 @@ def eval(
     )
     logger.info(f"🧠 nerve v{nerve.__version__}")
 
-    # validate and collect inputs from the agent
-    if not Configuration.is_agent_config(args.input_path):
-        logger.error(f"path '{args.input_path}' is not a valid agent configuration")
-        raise typer.Abort()
+    try:
+        config = Configuration.from_path(args.input_path)
+    except Exception as e:
+        logger.error(f"path '{args.input_path}' is not a valid agent configuration: {e}")
+        raise typer.Abort() from e
 
-    cases_path = args.input_path / "cases"
-    if not cases_path.exists():
-        logger.error(f"cases path {cases_path} does not exist")
-        raise typer.Abort()
+    output = output or _get_output_path(args)
+    cases = CaseIterator(args.input_path)
+    new_runs = False
 
-    result = {
-        "started_at": time.time(),
-        "args": args.to_serializable(),
-        "cases": {},
-    }
+    if config.limits:
+        if config.limits.runs:
+            runs = config.limits.runs
+        if config.limits.max_steps:
+            args.max_steps = config.limits.max_steps
+        if config.limits.max_cost:
+            args.max_cost = config.limits.max_cost
+        if config.limits.timeout:
+            args.timeout = config.limits.timeout
 
-    cases = sorted(cases_path.glob("*"))
     eval_name = colored(args.input_path.name, "green", attrs=["bold"])
     logger.info(f"📊 {args.generator} vs {eval_name} | cases: {len(cases)} | runs: {runs}")
 
-    for case_path in cases:
-        result["cases"][case_path.name] = {
-            "started_at": time.time(),
-            "runs": [],
-        }
+    if output.exists():
+        logger.info(f"📊 loading evaluation results from {output}")
+        eval_result = Evaluation.load_from(output)
+    else:
+        logger.info(f"📊 saving evaluation results to {output}")
+        eval_result = Evaluation.build(args, runs, len(cases))
+
+    for case in cases:
+        if case.name not in eval_result.cases:
+            eval_result.cases[case.name] = Evaluation.Case(started_at=time.time())
+            new_runs = True
 
         for run in range(runs):
-            logger.debug(f"running {case_path.name} ({run + 1}/{runs})")
-            run_output = asyncio.run(_run_case(args, case_path))
-            result["cases"][case_path.name]["runs"].append(run_output.model_dump())
+            num_runs_done = len(eval_result.cases[case.name].runs)
+            do_run = num_runs_done < (run + 1)
+            if not do_run:
+                # check that the run has been completed
+                if eval_result.cases[case.name].runs[run].steps == 0:
+                    do_run = True
+                    logger.warning(f"run {run} for {case.name} has not been completed, re-running")
+
+            logger.debug(f"got {num_runs_done} runs for {case.name}")
+
+            if not do_run:
+                logger.debug(f"skipping {case.name} ({run + 1}/{runs})")
+                run_output = eval_result.cases[case.name].runs[run]
+            else:
+                logger.debug(f"running {case.name} ({run + 1}/{runs})")
+                run_output = asyncio.run(_run_case(args, case))
+                eval_result.add_run(case.name, run_output)
+                new_runs = True
+
+            usage = run_output.usage
             if run_output.task_success:
                 logger.success(
-                    f" {eval_name} / {case_path.name} ({run + 1}/{runs}): {run_output.steps} steps, {run_output.time}s, {run_output.usage}"
+                    f" [{run + 1}/{runs}] {eval_name} / {case.name} : {run_output.steps} steps | {run_output.time:.1f} s | {usage.get('total_tokens', 0)} tokens | {usage.get('cost', 0.0)} $"
                 )
             else:
                 logger.error(
-                    f" {eval_name} / {case_path.name} ({run + 1}/{runs}): {run_output.steps} steps, {run_output.time}s, {run_output.usage}"
+                    f" [{run + 1}/{runs}] {eval_name} / {case.name} : {run_output.steps} steps | {run_output.time:.1f} s | {usage.get('total_tokens', 0)} tokens | {usage.get('cost', 0.0)} $"
                 )
 
-                break
+            if do_run:
+                # save at each run so we can restore later
+                eval_result.save_to(output)
+
+    logger.debug(f"evaluation results: {eval_result}")
+
+    # save if we did any runs
+    if new_runs:
+        eval_result.save_to(output)
+        logger.info(f"📊 evaluation results saved to {output}")
+
+    _show_results(eval_result)
+
+
+def _show_results(eval: Evaluation) -> None:
+    print()
+    logger.info("📊 Results")
+    logger.info(f"Model: {eval.args['generator']}")
+    logger.info(f"Cases: {eval.stats.cases}")
+    logger.info(f"Runs: {eval.stats.runs}")
+    logger.info(f"Pass: {eval.stats.passed}")
+    logger.info(f"Fail: {eval.stats.failed}")
+
+    total_cost = 0.0
+    # total_tokens = 0
+    total_steps = 0
+    total_time = 0.0
+    total_tests = eval.stats.passed + eval.stats.failed
+    score = eval.stats.passed / total_tests * 100
 
-            break
+    for _case_name, case in eval.cases.items():
+        for run in case.runs:
+            total_cost += run.usage.get("cost", 0.0)
+            # total_tokens += run.usage.get("total_tokens", 0)
+            total_steps += run.steps
+            total_time += run.time
 
-    logger.debug(f"evaluation results: {result}")
+    logger.info(f"Total cost: {total_cost:.2f} $")
+    logger.info(f"Total time: {total_time:.2f} s")
+    logger.info(f"Avg time: {total_time / total_tests:.2f} s")
+    logger.info(f"Avg steps: {total_steps / total_tests:.2f}")
+    logger.info(f"Avg cost: {total_cost / total_tests} $")
 
-    result["finished_at"] = time.time()
-    output.write_text(json.dumps(result))
-    logger.info(f"evaluation results saved to {output}")
+    logger.info(f"Score: {score:.2f} %")
 
 
-async def _run_case(args: Arguments, case_path: pathlib.Path) -> Output:
+async def _run_case(args: Arguments, case: CaseIterator.Case) -> Output:
     return await Runner(
         args,
-        {
-            "CASE_NAME": case_path.name,
-            "CASE_PATH": case_path.absolute().as_posix(),
-        },
+        case.input_state,
     ).run()

nerve/generation/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -41,7 +41,7 @@ def __init__(
         for tool_name, tool_fn in self.tools.items():
             if not tool_fn.__doc__:
                 raise ValueError(f"Tool {tool_name} has no docstring")
-            self.tools_schemas.append(get_tool_schema(tool_fn))
+            self.tools_schemas.append(get_tool_schema(self.generator_id, tool_fn))
 
     def _parse_generator_params(self) -> None:
         if "?" in self.generator_id:
@@ -81,7 +81,7 @@ def _get_extended_tooling_schema(self, extra_tools: dict[str, t.Callable[..., t.
                 raise ValueError(f"Tool {tool_name} already exists")
             else:
                 logger.debug(f"adding extra tool: {tool_name} / {tool_fn.__name__}")
-                extra_schemas.append(get_tool_schema(tool_fn))
+                extra_schemas.append(get_tool_schema(self.generator_id, tool_fn))
 
         tools_schemas.extend(extra_schemas)