Skip to content

Commit e3c2183

Browse files
committed
fix: various fixes and refactorings
1 parent a1bc7a1 commit e3c2183

File tree

17 files changed

+259
-214
lines changed

17 files changed

+259
-214
lines changed

nerve/cli/agents.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from termcolor import colored
77

88
import nerve
9-
from nerve.cli.defaults import (
9+
from nerve.defaults import (
1010
DEFAULT_AGENTS_LOAD_PATH,
1111
)
1212
from nerve.models import Configuration, Workflow

nerve/cli/create.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pydantic_yaml import to_yaml_str
99

1010
import nerve
11-
from nerve.cli.defaults import (
11+
from nerve.defaults import (
1212
DEFAULT_AGENT_PATH,
1313
DEFAULT_AGENT_SYSTEM_PROMPT,
1414
DEFAULT_AGENT_TASK,

nerve/cli/eval.py

Lines changed: 40 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,19 @@
11
import asyncio
22
import pathlib
3-
import time
43
import typing as t
5-
from enum import Enum
64

75
import typer
8-
from fastparquet import ParquetFile # type: ignore[import-untyped]
96
from loguru import logger
10-
from natsort import natsorted
11-
from pydantic import BaseModel
12-
from pydantic_yaml import parse_yaml_file_as
137
from termcolor import colored
148
from typer_di import Depends, TyperDI
159

1610
import nerve
17-
from nerve.cli.defaults import DEFAULT_EVAL_RUNS
1811
from nerve.cli.utils import _get_run_args
19-
from nerve.models import Configuration, Evaluation
12+
from nerve.defaults import DEFAULT_EVAL_RUNS
13+
from nerve.models import Configuration
2014
from nerve.runtime import logging
21-
from nerve.server.runner import Arguments, Output, Runner
15+
from nerve.runtime.eval import Case, Cases, Evaluation
16+
from nerve.runtime.runner import Arguments, Output, Runner
2217

2318
cli = TyperDI(
2419
no_args_is_help=True,
@@ -27,83 +22,6 @@
2722
)
2823

2924

30-
class CaseIterator:
31-
class Mode(Enum):
32-
# cases have their own individual folders
33-
FOLDER = 0
34-
# cases are listed in a single file
35-
YAML = 1
36-
# parquet file
37-
PARQUET = 2
38-
39-
class Case(BaseModel):
40-
name: str
41-
input_state: dict[str, t.Any]
42-
43-
def _from_folder(self, cases_folder: pathlib.Path) -> None:
44-
logger.info(f"📊 loading evaluation cases from folder {cases_folder}")
45-
self._mode = self.Mode.FOLDER
46-
for path in natsorted(cases_folder.glob("*")):
47-
self._cases.append(
48-
CaseIterator.Case(
49-
name=path.name,
50-
input_state={
51-
"CASE_NAME": path.name,
52-
"CASE_PATH": path.absolute().as_posix(),
53-
},
54-
)
55-
)
56-
57-
def _from_yaml(self, cases_file: pathlib.Path) -> None:
58-
logger.info(f"📊 loading evaluation cases from file {cases_file}")
59-
self._mode = self.Mode.YAML
60-
for case in parse_yaml_file_as(list[dict[str, dict[str, t.Any]]], cases_file): # type: ignore[type-var]
61-
for case_name, input_state in case.items():
62-
self._cases.append(CaseIterator.Case(name=case_name, input_state=input_state))
63-
64-
def _from_parquet(self, cases_file: pathlib.Path) -> None:
65-
logger.info(f"📊 loading evaluation cases from parquet file {cases_file}")
66-
self._mode = self.Mode.PARQUET
67-
pf = ParquetFile(cases_file)
68-
df = pf.to_pandas()
69-
num_rows = len(df)
70-
for index, row in df.iterrows():
71-
self._cases.append(
72-
CaseIterator.Case(
73-
name=f"case_{index}_of_{num_rows}",
74-
input_state=row.to_dict(),
75-
)
76-
)
77-
78-
def __init__(self, eval_path: pathlib.Path):
79-
self._eval_path = eval_path
80-
self._cases: list[CaseIterator.Case] = []
81-
self._mode = self.Mode.FOLDER
82-
83-
cases_folder = self._eval_path / "cases"
84-
cases_file_yml = self._eval_path / "cases.yml"
85-
cases_file_parquet = self._eval_path / "cases.parquet"
86-
87-
if cases_folder.exists():
88-
self._from_folder(cases_folder)
89-
90-
elif cases_file_yml.exists():
91-
self._from_yaml(cases_file_yml)
92-
93-
elif cases_file_parquet.exists():
94-
self._from_parquet(cases_file_parquet)
95-
96-
if not self._cases:
97-
logger.error(f"no cases found in {self._eval_path}")
98-
raise typer.Abort()
99-
100-
def __iter__(self) -> t.Iterator["CaseIterator.Case"]:
101-
return iter(self._cases)
102-
103-
def __len__(self) -> int:
104-
return len(self._cases)
105-
106-
10725
def _get_output_path(args: Arguments) -> pathlib.Path:
10826
output_name = f"{args.generator}-{args.input_path.name}"
10927
sanitized = ""
@@ -145,9 +63,9 @@ def eval(
14563
raise typer.Abort() from e
14664

14765
output = output or _get_output_path(args)
148-
cases = CaseIterator(args.input_path)
149-
new_runs = False
66+
cases = Cases(args.input_path)
15067

68+
# apply limits from the config if available
15169
if config.limits:
15270
if config.limits.runs:
15371
runs = config.limits.runs
@@ -163,58 +81,57 @@ def eval(
16381

16482
if output.exists():
16583
logger.info(f"📊 loading evaluation results from {output}")
166-
eval_result = Evaluation.load_from(output)
84+
evaluation = Evaluation.load_from(output)
16785
else:
16886
logger.info(f"📊 saving evaluation results to {output}")
169-
eval_result = Evaluation.build(args, runs, len(cases))
87+
evaluation = Evaluation.build(args, runs, len(cases))
17088

17189
for case in cases:
172-
if case.name not in eval_result.cases:
173-
eval_result.cases[case.name] = Evaluation.Case(started_at=time.time())
174-
new_runs = True
175-
17690
for run in range(runs):
177-
num_runs_done = len(eval_result.cases[case.name].runs)
178-
do_run = num_runs_done < (run + 1)
179-
if not do_run:
180-
# check that the run has been completed
181-
if eval_result.cases[case.name].runs[run].steps == 0:
91+
do_run = True
92+
if evaluation.num_runs(case.name) >= runs:
93+
# we already have enough runs for this case
94+
do_run = False
95+
if not evaluation.is_run_done(case.name, run):
96+
# we don't have enough runs for this case
18297
do_run = True
18398
logger.warning(f"run {run} for {case.name} has not been completed, re-running")
184-
185-
logger.debug(f"got {num_runs_done} runs for {case.name}")
99+
evaluation.remove_run(case.name, run)
186100

187101
if not do_run:
188102
logger.debug(f"skipping {case.name} ({run + 1}/{runs})")
189-
run_output = eval_result.cases[case.name].runs[run]
103+
run_output = evaluation.get_run(case.name, run)
190104
else:
191105
logger.debug(f"running {case.name} ({run + 1}/{runs})")
192106
run_output = asyncio.run(_run_case(args, case))
193-
eval_result.add_run(case.name, run_output)
194-
new_runs = True
107+
evaluation.add_run(case.name, run_output)
195108

196-
usage = run_output.usage
197-
if run_output.task_success:
198-
logger.success(
199-
f" [{run + 1}/{runs}] {eval_name} / {case.name} : {run_output.steps} steps | {run_output.time:.1f} s | {usage.get('total_tokens', 0)} tokens | {usage.get('cost', 0.0)} $"
200-
)
201-
else:
202-
logger.error(
203-
f" [{run + 1}/{runs}] {eval_name} / {case.name} : {run_output.steps} steps | {run_output.time:.1f} s | {usage.get('total_tokens', 0)} tokens | {usage.get('cost', 0.0)} $"
204-
)
109+
_show_run(run_output, run + 1, runs, eval_name, case.name)
205110

206-
if do_run:
111+
if evaluation.needs_flush():
207112
# save at each run so we can restore later
208-
eval_result.save_to(output)
113+
evaluation.save_to(output)
209114

210-
logger.debug(f"evaluation results: {eval_result}")
115+
logger.debug(f"evaluation results: {evaluation}")
211116

212-
# save if we did any runs
213-
if new_runs:
214-
eval_result.save_to(output)
117+
# save if needed
118+
if evaluation.needs_flush():
119+
evaluation.save_to(output)
215120
logger.info(f"📊 evaluation results saved to {output}")
216121

217-
_show_results(eval_result)
122+
_show_results(evaluation)
123+
124+
125+
def _show_run(output: Output, run: int, runs: int, eval_name: str, case_name: str) -> None:
    """Log a one-line summary of a single evaluation run.

    The summary is emitted with ``logger.success`` when ``output.task_success``
    is truthy and with ``logger.error`` otherwise; the text is the same in
    both cases.

    Args:
        output: Result of the run. ``task_success``, ``steps``, ``time`` and
            ``usage`` (queried with ``.get``) are read from it.
        run: 1-based position of this run within the case. The caller in
            ``eval`` already passes ``run + 1``.
        runs: Total number of runs per case.
        eval_name: Evaluation name used as a label in the message.
        case_name: Case name used as a label in the message.
    """
    usage = output.usage

    # `run` arrives already converted to 1-based by the caller, so it is
    # printed as-is; adding 1 again would show e.g. "[2/1]" for a single run.
    summary = f" [{run}/{runs}] {eval_name} / {case_name} : {output.steps} steps | {output.time:.1f} s | {usage.get('total_tokens', 0)} tokens | {usage.get('cost', 0.0)} $"

    # Only the log level depends on the outcome.
    log = logger.success if output.task_success else logger.error
    log(summary)
218135

219136

220137
def _show_results(eval: Evaluation) -> None:
@@ -233,8 +150,8 @@ def _show_results(eval: Evaluation) -> None:
233150
total_tests = eval.stats.passed + eval.stats.failed
234151
score = eval.stats.passed / total_tests * 100
235152

236-
for _case_name, case in eval.cases.items():
237-
for run in case.runs:
153+
for _case_name, case_runs in eval.runs.items():
154+
for run in case_runs:
238155
total_cost += run.usage.get("cost", 0.0)
239156
# total_tokens += run.usage.get("total_tokens", 0)
240157
total_steps += run.steps
@@ -249,7 +166,7 @@ def _show_results(eval: Evaluation) -> None:
249166
logger.info(f"Score: {score:.2f} %")
250167

251168

252-
async def _run_case(args: Arguments, case: CaseIterator.Case) -> Output:
169+
async def _run_case(args: Arguments, case: Case) -> Output:
253170
return await Runner(
254171
args,
255172
case.input_state,

nerve/cli/install.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import typer
1010

1111
import nerve
12-
from nerve.cli.defaults import (
12+
from nerve.defaults import (
1313
DEFAULT_AGENTS_LOAD_PATH,
1414
)
1515
from nerve.models import Configuration, Workflow

nerve/cli/run.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from nerve.runtime import logging, state
1111
from nerve.runtime.agent import Agent
1212
from nerve.runtime.flow import Flow
13-
from nerve.server.runner import Arguments
13+
from nerve.runtime.runner import Arguments
1414

1515
cli = TyperDI(
1616
no_args_is_help=True,

nerve/cli/serve.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,16 @@
88
from typer_di import Depends, TyperDI
99

1010
import nerve
11-
from nerve.cli.defaults import (
11+
from nerve.cli.utils import _get_run_args
12+
from nerve.defaults import (
1213
DEFAULT_SERVE_HOST,
1314
DEFAULT_SERVE_PORT,
1415
)
15-
from nerve.cli.utils import _get_run_args
1616
from nerve.models import Configuration
1717
from nerve.runtime import Runtime, logging
18+
from nerve.runtime.runner import Arguments
1819
from nerve.server.mcp import create_mcp_server, create_sse_app, serve_stdio_app
1920
from nerve.server.rest import create_rest_api, serve_http_app
20-
from nerve.server.runner import Arguments
2121

2222
cli = TyperDI(
2323
no_args_is_help=True,

nerve/cli/uninstall.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import typer
66

77
import nerve
8-
from nerve.cli.defaults import (
8+
from nerve.defaults import (
99
DEFAULT_AGENTS_LOAD_PATH,
1010
)
1111

nerve/cli/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import typer
66
from loguru import logger
77

8-
from nerve.cli.defaults import (
8+
from nerve.defaults import (
99
DEFAULT_AGENTS_LOAD_PATH,
1010
DEFAULT_CONVERSATION_STRATEGY,
1111
DEFAULT_GENERATOR,
@@ -14,7 +14,7 @@
1414
DEFAULT_TIMEOUT,
1515
)
1616
from nerve.generation import conversation
17-
from nerve.server.runner import Arguments
17+
from nerve.runtime.runner import Arguments
1818

1919

2020
def _resolve_input_path(input_path: pathlib.Path) -> pathlib.Path:

nerve/cli/utils_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
import pytest
55
import typer
66

7-
from nerve.cli.defaults import DEFAULT_AGENTS_LOAD_PATH
87
from nerve.cli.utils import _resolve_input_path
8+
from nerve.defaults import DEFAULT_AGENTS_LOAD_PATH
99

1010

1111
def test_resolve_input_path_exists() -> None:
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
DEFAULT_AGENTS_LOAD_PATH: pathlib.Path = DEFAULT_NERVE_HOME / "agents"
1515
DEFAULT_PROMPTS_LOAD_PATH: pathlib.Path = DEFAULT_NERVE_HOME / "prompts"
16+
DEFAULT_RUNS_PATH: pathlib.Path = DEFAULT_NERVE_HOME / "runs"
1617

1718
DEFAULT_AGENT_PATH: pathlib.Path = pathlib.Path("agent.yml")
1819
DEFAULT_AGENT_SYSTEM_PROMPT: str = "You are a helpful assistant."

0 commit comments

Comments
 (0)