Skip to content

Commit 642c47a

Browse files
committed
fix: fixed evaluation restore logic
1 parent e3c2183 commit 642c47a

File tree

4 files changed

+29
-16
lines changed

4 files changed

+29
-16
lines changed

nerve/cli/agents.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import pathlib
22
import typing as t
33

4+
import natsort
45
import requests
56
import typer
67
from termcolor import colored
@@ -74,7 +75,11 @@ def _show_installed_agents(path: pathlib.Path) -> None:
7475
print()
7576
print(f"📁 Installed in {path.absolute()}:\n")
7677

78+
items = []
7779
for item in path.iterdir():
80+
items.append(item)
81+
82+
for item in natsort.natsorted(items):
7883
if Workflow.is_workflow(item):
7984
workflow = Workflow.from_path(item)
8085
print(

nerve/cli/eval.py

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -77,7 +77,7 @@ def eval(
7777
args.timeout = config.limits.timeout
7878

7979
eval_name = colored(args.input_path.name, "green", attrs=["bold"])
80-
logger.info(f"📊 {args.generator} vs {eval_name} | cases: {len(cases)} | runs: {runs}")
80+
logger.info(f"📊 {args.generator} / {eval_name} / cases: {len(cases)} / runs: {runs}")
8181

8282
if output.exists():
8383
logger.info(f"📊 loading evaluation results from {output}")
@@ -106,7 +106,7 @@ def eval(
106106
run_output = asyncio.run(_run_case(args, case))
107107
evaluation.add_run(case.name, run_output)
108108

109-
_show_run(run_output, run + 1, runs, eval_name, case.name)
109+
_show_run(args, run_output, runs, run, case.name, do_run)
110110

111111
if evaluation.needs_flush():
112112
# save at each run so we can restore later
@@ -122,16 +122,17 @@ def eval(
122122
_show_results(evaluation)
123123

124124

125-
def _show_run(output: Output, run: int, runs: int, eval_name: str, case_name: str) -> None:
125+
def _show_run(args: Arguments, output: Output, runs: int, run: int, case_name: str, live: bool) -> None:
126126
usage = output.usage
127+
one_of = f"[{run + 1}/{runs}]" if live else f"({run + 1}/{runs})"
128+
subject = f"{one_of} {args.generator} / {args.input_path.name} / {case_name}"
129+
stats = (
130+
f"{output.steps} steps, {output.time:.1f} s, {usage.get('total_tokens', 0)} tokens, {usage.get('cost', 0.0)} $"
131+
)
127132
if output.task_success:
128-
logger.success(
129-
f" [{run + 1}/{runs}] {eval_name} / {case_name} : {output.steps} steps | {output.time:.1f} s | {usage.get('total_tokens', 0)} tokens | {usage.get('cost', 0.0)} $"
130-
)
133+
logger.success(f" {subject} : {stats}")
131134
else:
132-
logger.error(
133-
f" [{run + 1}/{runs}] {eval_name} / {case_name} : {output.steps} steps | {output.time:.1f} s | {usage.get('total_tokens', 0)} tokens | {usage.get('cost', 0.0)} $"
134-
)
135+
logger.error(f" {subject} : {stats}")
135136

136137

137138
def _show_results(eval: Evaluation) -> None:

nerve/runtime/eval.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,7 @@ def remove_run(self, case_name: str, run_idx: int) -> None:
7878
self._flush = True
7979

8080
def num_runs(self, case_name: str) -> int:
81-
return len(self.runs[case_name])
81+
return len(self.runs[case_name]) if case_name in self.runs else 0
8282

8383
def num_run_steps(self, case_name: str, run_idx: int) -> int:
8484
return self.runs[case_name][run_idx].steps

nerve/runtime/runner.py

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -208,13 +208,17 @@ def __init__(
208208
)
209209
self._stdout_fn: t.Callable[[str], t.Awaitable[None]] = _default_stdout_fn
210210
self._stderr_fn: t.Callable[[str], t.Awaitable[None]] = _default_stderr_fn
211+
self._process: asyncio.subprocess.Process | None = None
211212

212213
if clean_at_exit:
213214
atexit.register(self._clean_up)
214215

215216
def _clean_up(self) -> None:
216217
if self.events_file.exists():
217218
logger.debug(f"removing events file {self.events_file}")
219+
if self._process is not None:
220+
self._process.kill()
221+
self._process = None
218222
self.events_file.unlink()
219223

220224
def set_stdout_fn(self, fn: t.Callable[[str], t.Awaitable[None]]) -> None:
@@ -247,20 +251,20 @@ async def read_stream(stream: asyncio.StreamReader | None, name: str) -> None:
247251

248252
outerr[name].append(line.decode().rstrip())
249253

250-
process = await asyncio.create_subprocess_exec(
254+
self._process = await asyncio.create_subprocess_exec(
251255
*self.command_line,
252256
stdout=asyncio.subprocess.PIPE,
253257
stderr=asyncio.subprocess.PIPE,
254258
env=os.environ.copy(),
255259
)
256-
stdout_task = asyncio.create_task(read_stream(process.stdout, "stdout"))
257-
stderr_task = asyncio.create_task(read_stream(process.stderr, "stderr"))
260+
stdout_task = asyncio.create_task(read_stream(self._process.stdout, "stdout"))
261+
stderr_task = asyncio.create_task(read_stream(self._process.stderr, "stderr"))
258262

259263
# wait for the process and stdout/stderr readers to complete
260-
await process.wait()
264+
await self._process.wait()
261265
await asyncio.gather(stdout_task, stderr_task)
262266

263-
logger.debug(f"process exited with code {process.returncode}, reading events ...")
267+
logger.debug(f"process exited with code {self._process.returncode}, reading events ...")
264268

265269
# read the events file
266270
events = []
@@ -285,10 +289,13 @@ async def read_stream(stream: asyncio.StreamReader | None, name: str) -> None:
285289

286290
logger.debug(f"output value: {parsed.output_object}")
287291

292+
exit_code = self._process.returncode or 0
293+
self._process = None
294+
288295
return Output(
289296
generated_at=generated_at,
290297
command_line=self.command_line,
291-
exit_code=process.returncode or 0,
298+
exit_code=exit_code,
292299
stdout=outerr["stdout"],
293300
stderr=outerr["stderr"],
294301
events=events,

0 commit comments

Comments
 (0)