@@ -44,6 +44,12 @@ interface TestResult {
4444 } [ ] ;
4545}
4646
/**
 * Shape of the evaluation config file as far as this command reads it.
 * Unlisted keys are permitted and typed as `unknown`.
 */
interface EvalConfig {
  /** Workflow entries; each may optionally carry a `name`. */
  workflows: { name?: string }[];
  /** Optional pass threshold supplied by the config file. */
  passThreshold?: number;
  /** Any additional config keys are passed through untyped. */
  [key: string]: unknown;
}
52+
4753const program = new Command ( ) ;
4854
4955program
@@ -63,6 +69,10 @@ program
6369 . option ( "-j, --json" , "Output results as JSON" )
6470 . option ( "-l, --llm" , "Enable LLM judge" )
6571 . option ( "-o, --output <path>" , "Save results to file" )
72+ . option (
73+ "-p, --pass-threshold <number>" ,
74+ "Minimum average score (0-1) required to pass. Can also be set via EVAL_PASS_THRESHOLD env var." ,
75+ )
6676 . option ( "-t, --timeout <ms>" , "Override timeout in milliseconds" )
6777 . action ( async ( options ) => {
6878 try {
@@ -134,7 +144,7 @@ program
134144
135145 // Load config to get workflow count for display
136146 const configContent = await fs . readFile ( configPath , "utf-8" ) ;
137- const config = JSON . parse ( configContent ) ;
147+ const config : EvalConfig = JSON . parse ( configContent ) ;
138148
139149 console . log ( chalk . blue ( `Running evaluation tests from: ${ configPath } ` ) ) ;
140150 console . log ( chalk . gray ( `Workflows to test: ${ config . workflows . length } ` ) ) ;
@@ -179,19 +189,42 @@ program
179189 const allEvaluations = reports . flatMap ( ( r ) => r . evaluations ) ;
180190 const duration = Date . now ( ) - startTime ;
181191
// Decide pass/fail from the mean overall score instead of requiring every
// report to pass. With no evaluations the mean is defined as 0, so an empty
// run passes only when the threshold is 0.
const avgScore =
  allEvaluations.length === 0
    ? 0
    : allEvaluations.reduce((sum, e) => sum + e.overallScore, 0) /
      allEvaluations.length;

// Candidate threshold sources, each normalised to a trimmed string
// ("" means "not set"). Precedence: CLI flag, then environment, then config.
const thresholdFromCli = String(options.passThreshold ?? "").trim();
const thresholdFromEnv = String(
  process.env.EVAL_PASS_THRESHOLD || process.env.PASS_THRESHOLD || "",
).trim();
const thresholdFromConfig =
  typeof config.passThreshold === "number"
    ? String(config.passThreshold)
    : "";

// Used when no source supplies a valid threshold.
const defaultPassThreshold = 0.6;

// Pick the first source holding a valid number in [0, 1]. An invalid value
// is reported on stderr and skipped, so a typo in a higher-precedence source
// neither masks a valid lower-precedence one nor silently becomes the default.
const threshold = (() => {
  const candidates: Array<[string, string]> = [
    ["--pass-threshold", thresholdFromCli],
    ["environment", thresholdFromEnv],
    ["config file", thresholdFromConfig],
  ];
  for (const [source, raw] of candidates) {
    if (raw === "") continue; // source not set
    // Number() rejects trailing garbage ("0.7x") that parseFloat would accept.
    const parsed = Number(raw);
    // The range check follows the 0-1 range documented in the option help.
    if (Number.isFinite(parsed) && parsed >= 0 && parsed <= 1) return parsed;
    // console.warn writes to stderr, keeping stdout clean for --json output.
    console.warn(
      chalk.yellow(
        `Ignoring invalid pass threshold "${raw}" from ${source}: expected a number between 0 and 1`,
      ),
    );
  }
  return defaultPassThreshold;
})();

const passed = avgScore >= threshold;

182217 const finalReport : EvaluationReport = {
183218 config : { parallel : true , source : configPath } ,
184219 evaluations : allEvaluations ,
185- passed : reports . every ( ( r ) => r . passed ) ,
220+ passed,
186221 timestamp : new Date ( ) ,
187222 } ;
188223
189224 const finalResult : TestResult = {
190225 config : configPath ,
191- passed : finalReport . passed ,
192- score :
193- allEvaluations . reduce ( ( sum , e ) => sum + e . overallScore , 0 ) /
194- Math . max ( 1 , allEvaluations . length ) ,
226+ passed,
227+ score : avgScore ,
195228 duration,
196229 workflows : allEvaluations . map ( ( e ) => ( {
197230 name : e . workflowName ,
@@ -217,6 +250,11 @@ program
217250 `\nTest execution completed in ${ ( finalResult . duration / 1000 ) . toFixed ( 2 ) } s` ,
218251 ) ,
219252 ) ;
// Report the threshold in force next to the achieved average score.
{
  const thresholdSummary = `Threshold for pass: ${threshold.toFixed(2)} | Average score: ${finalResult.score.toFixed(3)}`;
  console.log(chalk.gray(thresholdSummary));
}
220258 console . log (
221259 chalk [ finalResult . passed ? "green" : "red" ] (
222260 `Overall result: ${ finalResult . passed ? "PASSED" : "FAILED" } (${ ( finalResult . score * 100 ) . toFixed ( 1 ) } %)` ,
0 commit comments