 import asyncio
 import pathlib
-import time
 import typing as t
-from enum import Enum

 import typer
-from fastparquet import ParquetFile  # type: ignore[import-untyped]
 from loguru import logger
-from natsort import natsorted
-from pydantic import BaseModel
-from pydantic_yaml import parse_yaml_file_as
 from termcolor import colored
 from typer_di import Depends, TyperDI

 import nerve
-from nerve.cli.defaults import DEFAULT_EVAL_RUNS
 from nerve.cli.utils import _get_run_args
-from nerve.models import Configuration, Evaluation
+from nerve.defaults import DEFAULT_EVAL_RUNS
+from nerve.models import Configuration
 from nerve.runtime import logging
-from nerve.server.runner import Arguments, Output, Runner
+from nerve.runtime.eval import Case, Cases, Evaluation
+from nerve.runtime.runner import Arguments, Output, Runner

 cli = TyperDI(
     no_args_is_help=True,
 )


-class CaseIterator:
-    class Mode(Enum):
-        # cases have their own individual folders
-        FOLDER = 0
-        # cases are listed in a single file
-        YAML = 1
-        # parquet file
-        PARQUET = 2
-
-    class Case(BaseModel):
-        name: str
-        input_state: dict[str, t.Any]
-
-    def _from_folder(self, cases_folder: pathlib.Path) -> None:
-        logger.info(f"📊 loading evaluation cases from folder {cases_folder}")
-        self._mode = self.Mode.FOLDER
-        for path in natsorted(cases_folder.glob("*")):
-            self._cases.append(
-                CaseIterator.Case(
-                    name=path.name,
-                    input_state={
-                        "CASE_NAME": path.name,
-                        "CASE_PATH": path.absolute().as_posix(),
-                    },
-                )
-            )
-
-    def _from_yaml(self, cases_file: pathlib.Path) -> None:
-        logger.info(f"📊 loading evaluation cases from file {cases_file}")
-        self._mode = self.Mode.YAML
-        for case in parse_yaml_file_as(list[dict[str, dict[str, t.Any]]], cases_file):  # type: ignore[type-var]
-            for case_name, input_state in case.items():
-                self._cases.append(CaseIterator.Case(name=case_name, input_state=input_state))
-
-    def _from_parquet(self, cases_file: pathlib.Path) -> None:
-        logger.info(f"📊 loading evaluation cases from parquet file {cases_file}")
-        self._mode = self.Mode.PARQUET
-        pf = ParquetFile(cases_file)
-        df = pf.to_pandas()
-        num_rows = len(df)
-        for index, row in df.iterrows():
-            self._cases.append(
-                CaseIterator.Case(
-                    name=f"case_{index}_of_{num_rows}",
-                    input_state=row.to_dict(),
-                )
-            )
-
-    def __init__(self, eval_path: pathlib.Path):
-        self._eval_path = eval_path
-        self._cases: list[CaseIterator.Case] = []
-        self._mode = self.Mode.FOLDER
-
-        cases_folder = self._eval_path / "cases"
-        cases_file_yml = self._eval_path / "cases.yml"
-        cases_file_parquet = self._eval_path / "cases.parquet"
-
-        if cases_folder.exists():
-            self._from_folder(cases_folder)
-
-        elif cases_file_yml.exists():
-            self._from_yaml(cases_file_yml)
-
-        elif cases_file_parquet.exists():
-            self._from_parquet(cases_file_parquet)
-
-        if not self._cases:
-            logger.error(f"no cases found in {self._eval_path}")
-            raise typer.Abort()
-
-    def __iter__(self) -> t.Iterator["CaseIterator.Case"]:
-        return iter(self._cases)
-
-    def __len__(self) -> int:
-        return len(self._cases)
-
-
 def _get_output_path(args: Arguments) -> pathlib.Path:
     output_name = f"{args.generator}-{args.input_path.name}"
     sanitized = ""
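Note on the deleted `CaseIterator`: it is not dropped but relocated into the runtime, as the new `from nerve.runtime.eval import Case, Cases, Evaluation` import shows. That module is not part of this diff, so the following is only a plausible reconstruction of the relocated loader, inferred from the deleted class and its call sites below (`Cases(args.input_path)`, iteration, `len(cases)`); the stubbed YAML/parquet methods and the `FileNotFoundError` are assumptions.

```python
# Hypothetical reconstruction of nerve/runtime/eval.py (not shown in this diff).
import pathlib
import typing as t

from natsort import natsorted
from pydantic import BaseModel


class Case(BaseModel):
    name: str
    input_state: dict[str, t.Any]


class Cases:
    def __init__(self, eval_path: pathlib.Path):
        self._cases: list[Case] = []
        # dispatch on whichever case source exists, as the deleted class did
        if (eval_path / "cases").exists():
            self._from_folder(eval_path / "cases")
        elif (eval_path / "cases.yml").exists():
            self._from_yaml(eval_path / "cases.yml")
        elif (eval_path / "cases.parquet").exists():
            self._from_parquet(eval_path / "cases.parquet")
        if not self._cases:
            # assumption: a runtime module raises rather than calling typer.Abort()
            raise FileNotFoundError(f"no cases found in {eval_path}")

    def _from_folder(self, folder: pathlib.Path) -> None:
        # one case per entry, keeping the CASE_NAME / CASE_PATH contract
        for path in natsorted(folder.glob("*")):
            self._cases.append(
                Case(
                    name=path.name,
                    input_state={
                        "CASE_NAME": path.name,
                        "CASE_PATH": path.absolute().as_posix(),
                    },
                )
            )

    def _from_yaml(self, path: pathlib.Path) -> None:
        ...  # presumably as in the deleted _from_yaml above

    def _from_parquet(self, path: pathlib.Path) -> None:
        ...  # presumably as in the deleted _from_parquet above

    def __iter__(self) -> t.Iterator[Case]:
        return iter(self._cases)

    def __len__(self) -> int:
        return len(self._cases)
```

Folding the dispatch into `__init__` also lets the `Mode` enum go away: nothing outside the constructor ever consulted it.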
@@ -145,9 +63,9 @@ def eval(
         raise typer.Abort() from e

     output = output or _get_output_path(args)
-    cases = CaseIterator(args.input_path)
-    new_runs = False
+    cases = Cases(args.input_path)

+    # apply limits from the config if available
     if config.limits:
         if config.limits.runs:
             runs = config.limits.runs
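For reference, the `config.limits` read above assumes a shape like the following on `Configuration`; only `runs` appears in the visible lines (the hunk header indicates more limit handling was elided), so this is a minimal sketch and nothing beyond `runs` is confirmed.

```python
# Hypothetical shape of the limits model this hunk reads.
from pydantic import BaseModel


class Limits(BaseModel):
    runs: int | None = None  # overrides the CLI-provided number of runs


class Configuration(BaseModel):
    limits: Limits | None = None
```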
@@ -163,58 +81,57 @@ def eval(
 
     if output.exists():
         logger.info(f"📊 loading evaluation results from {output}")
-        eval_result = Evaluation.load_from(output)
+        evaluation = Evaluation.load_from(output)
     else:
         logger.info(f"📊 saving evaluation results to {output}")
-        eval_result = Evaluation.build(args, runs, len(cases))
+        evaluation = Evaluation.build(args, runs, len(cases))

     for case in cases:
-        if case.name not in eval_result.cases:
-            eval_result.cases[case.name] = Evaluation.Case(started_at=time.time())
-            new_runs = True
-
         for run in range(runs):
-            num_runs_done = len(eval_result.cases[case.name].runs)
-            do_run = num_runs_done < (run + 1)
-            if not do_run:
-                # check that the run has been completed
-                if eval_result.cases[case.name].runs[run].steps == 0:
+            do_run = True
+            if evaluation.num_runs(case.name) >= runs:
+                # we already have enough runs for this case
+                do_run = False
+                if not evaluation.is_run_done(case.name, run):
+                    # this run was recorded but never completed, so redo it
                     do_run = True
                     logger.warning(f"run {run} for {case.name} has not been completed, re-running")
-
-            logger.debug(f"got {num_runs_done} runs for {case.name}")
+                    evaluation.remove_run(case.name, run)

             if not do_run:
                 logger.debug(f"skipping {case.name} ({run + 1}/{runs})")
-                run_output = eval_result.cases[case.name].runs[run]
+                run_output = evaluation.get_run(case.name, run)
             else:
                 logger.debug(f"running {case.name} ({run + 1}/{runs})")
                 run_output = asyncio.run(_run_case(args, case))
-                eval_result.add_run(case.name, run_output)
-                new_runs = True
+                evaluation.add_run(case.name, run_output)

-            usage = run_output.usage
-            if run_output.task_success:
-                logger.success(
-                    f" [{run + 1}/{runs}] {eval_name} / {case.name} : {run_output.steps} steps | {run_output.time:.1f}s | {usage.get('total_tokens', 0)} tokens | {usage.get('cost', 0.0)}$"
-                )
-            else:
-                logger.error(
-                    f" [{run + 1}/{runs}] {eval_name} / {case.name} : {run_output.steps} steps | {run_output.time:.1f}s | {usage.get('total_tokens', 0)} tokens | {usage.get('cost', 0.0)}$"
-                )
+            _show_run(run_output, run + 1, runs, eval_name, case.name)

-            if do_run:
+            if evaluation.needs_flush():
                 # save at each run so we can restore later
-                eval_result.save_to(output)
+                evaluation.save_to(output)

-    logger.debug(f"evaluation results: {eval_result}")
+    logger.debug(f"evaluation results: {evaluation}")

-    # save if we did any runs
-    if new_runs:
-        eval_result.save_to(output)
+    # save if needed
+    if evaluation.needs_flush():
+        evaluation.save_to(output)
         logger.info(f"📊 evaluation results saved to {output}")

-    _show_results(eval_result)
+    _show_results(evaluation)
+
+def _show_run(output: Output, run: int, runs: int, eval_name: str, case_name: str) -> None:
+    usage = output.usage
+    if output.task_success:
+        logger.success(
+            f" [{run}/{runs}] {eval_name} / {case_name} : {output.steps} steps | {output.time:.1f}s | {usage.get('total_tokens', 0)} tokens | {usage.get('cost', 0.0)}$"
+        )
+    else:
+        logger.error(
+            f" [{run}/{runs}] {eval_name} / {case_name} : {output.steps} steps | {output.time:.1f}s | {usage.get('total_tokens', 0)} tokens | {usage.get('cost', 0.0)}$"
+        )


 def _show_results(eval: Evaluation) -> None:
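Two notes on the extracted helper. First, the call site passes `run + 1`, so `_show_run` prints `{run}` as-is; repeating the `+ 1` inside the f-string (as the deleted inline version did with the zero-based loop variable) would have shifted the display off by one. Second, the resume loop exercises several `Evaluation` helpers that live in `nerve.runtime.eval` and are not shown in this diff: `num_runs`, `is_run_done`, `remove_run`, `get_run`, `add_run`, and `needs_flush`. A minimal sketch of the bookkeeping those call sites imply follows; the dirty flag replacing the old hand-rolled `new_runs` boolean is an inference, as is the `steps > 0` completion test (carried over from the deleted `runs[run].steps == 0` check).

```python
# Hypothetical sketch of the Evaluation bookkeeping, inferred from call sites.
import pathlib

from pydantic import BaseModel

from nerve.runtime.runner import Output  # as imported at the top of this file


class Evaluation(BaseModel):
    runs: dict[str, list[Output]] = {}  # per-case run outputs, as _show_results iterates
    _dirty: bool = False  # set on mutation, cleared on save

    def num_runs(self, case_name: str) -> int:
        return len(self.runs.get(case_name, []))

    def is_run_done(self, case_name: str, run: int) -> bool:
        # a run counts as done once it is recorded with at least one step
        case_runs = self.runs.get(case_name, [])
        return run < len(case_runs) and case_runs[run].steps > 0

    def get_run(self, case_name: str, run: int) -> Output:
        return self.runs[case_name][run]

    def add_run(self, case_name: str, output: Output) -> None:
        self.runs.setdefault(case_name, []).append(output)
        self._dirty = True

    def remove_run(self, case_name: str, run: int) -> None:
        del self.runs[case_name][run]
        self._dirty = True

    def needs_flush(self) -> bool:
        return self._dirty

    def save_to(self, path: pathlib.Path) -> None:
        path.write_text(self.model_dump_json())
        self._dirty = False

    # load_from(), build() and the stats property are omitted here.
```

Under this reading the two flush points in `eval` stay idempotent: the per-run `save_to` clears the flag, so the final `needs_flush()` check only fires if something changed after the last per-run save.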
@@ -233,8 +150,8 @@ def _show_results(eval: Evaluation) -> None:
     total_tests = eval.stats.passed + eval.stats.failed
     score = eval.stats.passed / total_tests * 100

-    for _case_name, case in eval.cases.items():
-        for run in case.runs:
+    for _case_name, case_runs in eval.runs.items():
+        for run in case_runs:
             total_cost += run.usage.get("cost", 0.0)
             # total_tokens += run.usage.get("total_tokens", 0)
             total_steps += run.steps
@@ -249,7 +166,7 @@ def _show_results(eval: Evaluation) -> None:
     logger.info(f"Score: {score:.2f}%")


-async def _run_case(args: Arguments, case: CaseIterator.Case) -> Output:
+async def _run_case(args: Arguments, case: Case) -> Output:
     return await Runner(
         args,
         case.input_state,