@@ -19,6 +19,7 @@ import {Environment} from '../configuration/environment.js';
 import {rateGeneratedCode} from '../ratings/rate-code.js';
 import {redX} from '../reporting/format.js';
 import {
+  AssessmentConfig,
   AssessmentResult,
   AttemptDetails,
   CompletionStats,
@@ -49,7 +50,7 @@ import {getRunGroupId} from './grouping.js';
 import {executeCommand} from '../utils/exec.js';
 import {EvalID, Gateway} from './gateway.js';
 import {LocalEnvironment} from '../configuration/environment-local.js';
-import {getRunnerByName, RunnerName} from '../codegen/runner-creation.js';
+import {getRunnerByName} from '../codegen/runner-creation.js';
 import {summarizeReportWithAI} from '../reporting/report-ai-summary.js';
 
 /**
@@ -64,29 +65,7 @@ import {summarizeReportWithAI} from '../reporting/report-ai-summary.js';
  * @returns A Promise that resolves to an array of AssessmentResult objects,
  * each containing the prompt, generated code, and final validation status.
  */
-export async function generateCodeAndAssess(options: {
-  model: string;
-  runner: RunnerName;
-  environmentConfigPath: string;
-  localMode: boolean;
-  limit: number;
-  concurrency: number | 'auto';
-  reportName: string;
-  skipScreenshots: boolean;
-  startMcp?: boolean;
-  ragEndpoint?: string;
-  outputDirectory?: string;
-  promptFilter?: string;
-  labels: string[];
-  skipAiSummary?: boolean;
-  skipAxeTesting: boolean;
-  enableUserJourneyTesting?: boolean;
-  enableAutoCsp?: boolean;
-  logging?: 'text-only' | 'dynamic';
-  autoraterModel?: string;
-  a11yRepairAttempts?: number;
-  skipLighthouse?: boolean;
-}): Promise<RunInfo> {
+export async function generateCodeAndAssess(options: AssessmentConfig): Promise<RunInfo> {
   const env = await getEnvironmentByPath(options.environmentConfigPath, options.runner);
   const ratingLlm = await getRunnerByName('genkit');
 
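For reference while reading the hunks below, here is a sketch of what the new AssessmentConfig type presumably contains, reconstructed from the inline options type deleted above. The actual interface is pulled in through the import group at the top of the file (its source module is not visible in this diff) and may declare more fields; note that it, rather than this file, would now be the one referencing RunnerName, which is why the RunnerName import was dropped.

// Hypothetical reconstruction — field names and types are copied verbatim
// from the inline options type removed in the hunk above.
interface AssessmentConfig {
  model: string;
  runner: RunnerName;
  environmentConfigPath: string;
  localMode: boolean;
  limit: number;
  concurrency: number | 'auto';
  reportName: string;
  skipScreenshots: boolean;
  startMcp?: boolean;
  ragEndpoint?: string;
  outputDirectory?: string;
  promptFilter?: string;
  labels: string[];
  skipAiSummary?: boolean;
  skipAxeTesting: boolean;
  enableUserJourneyTesting?: boolean;
  enableAutoCsp?: boolean;
  logging?: 'text-only' | 'dynamic';
  autoraterModel?: string;
  a11yRepairAttempts?: number;
  skipLighthouse?: boolean;
}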
@@ -162,25 +141,15 @@ export async function generateCodeAndAssess(options: {
       `Evaluation of ${rootPromptDef.name}`,
       async abortSignal =>
         startEvaluationTask(
+          options,
           evalID,
           env,
           env.gateway,
           ratingLlm,
-          options.model,
           rootPromptDef,
-          options.localMode,
-          options.skipScreenshots,
-          options.outputDirectory,
-          options.ragEndpoint,
           abortSignal,
-          options.skipAxeTesting,
-          !!options.enableUserJourneyTesting,
-          !!options.enableAutoCsp,
           workerConcurrencyQueue,
           progress,
-          options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
-          options.a11yRepairAttempts ?? 0,
-          !!options.skipLighthouse,
         ),
       // 10min max per app evaluation. We just want to make sure it never gets stuck.
       10,
@@ -291,40 +260,30 @@ export async function generateCodeAndAssess(options: {
  * @returns A Promise that resolves to an AssessmentResult object containing all details of the task's execution.
  */
 async function startEvaluationTask(
+  config: AssessmentConfig,
   evalID: EvalID,
   env: Environment,
   gateway: Gateway<Environment>,
   ratingLlm: GenkitRunner,
-  model: string,
   rootPromptDef: PromptDefinition | MultiStepPromptDefinition,
-  localMode: boolean,
-  skipScreenshots: boolean,
-  outputDirectory: string | undefined,
-  ragEndpoint: string | undefined,
   abortSignal: AbortSignal,
-  skipAxeTesting: boolean,
-  enableUserJourneyTesting: boolean,
-  enableAutoCsp: boolean,
   workerConcurrencyQueue: PQueue,
   progress: ProgressLogger,
-  autoraterModel: string,
-  a11yRepairAttempts: number,
-  skipLighthouse: boolean,
 ): Promise<AssessmentResult[]> {
   // Set up the project structure once for the root project.
   const {directory, cleanup} = await setupProjectStructure(
     env,
     rootPromptDef,
     progress,
-    outputDirectory,
+    config.outputDirectory,
   );
 
   const results: AssessmentResult[] = [];
   const defsToExecute = rootPromptDef.kind === 'single' ? [rootPromptDef] : rootPromptDef.steps;
 
   for (const promptDef of defsToExecute) {
     const [fullPromptText, systemInstructions] = await Promise.all([
-      env.getPrompt(promptDef.systemPromptType, promptDef.prompt, ragEndpoint),
+      env.getPrompt(promptDef.systemPromptType, promptDef.prompt, config.ragEndpoint),
       env.getPrompt(promptDef.systemPromptType, ''),
     ]);
 
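Assembled from the context and added lines in the hunk above, the resulting startEvaluationTask signature reads as follows (body elided):

async function startEvaluationTask(
  config: AssessmentConfig,
  evalID: EvalID,
  env: Environment,
  gateway: Gateway<Environment>,
  ratingLlm: GenkitRunner,
  rootPromptDef: PromptDefinition | MultiStepPromptDefinition,
  abortSignal: AbortSignal,
  workerConcurrencyQueue: PQueue,
  progress: ProgressLogger,
): Promise<AssessmentResult[]>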
@@ -334,9 +293,8 @@ async function startEvaluationTask(
 
     // Generate the initial set of files through the LLM.
     const initialResponse = await generateInitialFiles(
+      config,
       evalID,
-      gateway,
-      model,
       env,
       promptDef,
       {
@@ -349,7 +307,6 @@ async function startEvaluationTask(
         possiblePackageManagers: getPossiblePackageManagers().slice(),
       },
       contextFiles,
-      localMode,
       abortSignal,
       progress,
     );
@@ -406,21 +363,22 @@ async function startEvaluationTask(
 
     // TODO: Only execute the serve command on the "final working attempt".
     // TODO: Incorporate usage.
-    const userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined = enableUserJourneyTesting
-      ? {
-          userJourneys: userJourneys.result,
-          appPrompt: defsToExecute[0].prompt,
-        }
-      : undefined;
+    const userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined =
+      config.enableUserJourneyTesting
+        ? {
+            userJourneys: userJourneys.result,
+            appPrompt: defsToExecute[0].prompt,
+          }
+        : undefined;
 
     const attemptDetails: AttemptDetails[] = []; // Store details for assessment.json
 
     // Try to build the files in the root prompt directory.
     // This will also attempt to fix issues with the generated code.
     const attempt = await attemptBuild(
+      config,
       evalID,
       gateway,
-      model,
       env,
       rootPromptDef,
       directory,
@@ -430,12 +388,7 @@ async function startEvaluationTask(
       abortSignal,
       workerConcurrencyQueue,
       progress,
-      skipScreenshots,
-      skipAxeTesting,
-      enableAutoCsp,
-      skipLighthouse,
       userJourneyAgentTaskInput,
-      a11yRepairAttempts,
     );
 
     if (!attempt) {
@@ -455,7 +408,7 @@ async function startEvaluationTask(
       attempt.axeRepairAttempts,
       abortSignal,
       progress,
-      autoraterModel,
+      config.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
     );
 
     results.push({
@@ -493,18 +446,16 @@ async function startEvaluationTask(
  * @param abortSignal Signal to fire when this process should be aborted.
  */
 async function generateInitialFiles(
+  options: AssessmentConfig,
   evalID: EvalID,
-  gateway: Gateway<Environment>,
-  model: string,
   env: Environment,
   promptDef: RootPromptDefinition,
   codegenContext: LlmGenerateFilesContext,
   contextFiles: LlmContextFile[],
-  localMode: boolean,
   abortSignal: AbortSignal,
   progress: ProgressLogger,
 ): Promise<LlmGenerateFilesResponse> {
-  if (localMode) {
+  if (options.localMode) {
     const localFilesDirectory = join(LLM_OUTPUT_DIR, env.id, promptDef.name);
     const filePaths = globSync('**/*', {cwd: localFilesDirectory});
 
@@ -531,10 +482,10 @@ async function generateInitialFiles(
 
   progress.log(promptDef, 'codegen', 'Generating code with AI');
 
-  const response = await gateway.generateInitialFiles(
+  const response = await env.gateway.generateInitialFiles(
     evalID,
     codegenContext,
-    model,
+    options.model,
     contextFiles,
     abortSignal,
   );
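For context, a minimal sketch of what a call site looks like after this change. The config literal below is hypothetical: only the field names come from the removed inline options type, and every concrete value is invented for illustration.

// Hypothetical usage — all values are illustrative only.
const runInfo = await generateCodeAndAssess({
  model: 'some-model-id',                 // any model ID the gateway accepts
  runner: 'genkit',                       // a RunnerName, as seen elsewhere in the diff
  environmentConfigPath: './env.json',    // hypothetical path
  localMode: false,
  limit: 10,
  concurrency: 'auto',
  reportName: 'example-report',
  skipScreenshots: false,
  labels: [],
  skipAxeTesting: false,
});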