You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: evals/evaluator.ts
+94-4Lines changed: 94 additions & 4 deletions
Original file line number
Diff line number
Diff line change
@@ -53,9 +53,7 @@ export class Evaluator {
53
53
question,
54
54
answer,
55
55
screenshot =true,
56
-
systemPrompt =`You are an expert evaluator that confidently returns YES or NO given a question and the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
57
-
Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
58
-
Today's date is ${newDate().toLocaleDateString()}`,
56
+
systemPrompt,
59
57
screenshotDelayMs =250,
60
58
}=options;
61
59
if(!question){
@@ -65,6 +63,20 @@ export class Evaluator {
65
63
thrownewError("Either answer (text) or screenshot must be provided");
66
64
}
67
65
66
+
// Handle multiple screenshots case
67
+
if(Array.isArray(screenshot)){
68
+
returnthis._evaluateWithMultipleScreenshots({
69
+
question,
70
+
screenshots: screenshot,
71
+
systemPrompt,
72
+
});
73
+
}
74
+
75
+
// Single screenshot case (existing logic)
76
+
constdefaultSystemPrompt=`You are an expert evaluator that confidently returns YES or NO given a question and the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
77
+
Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
78
+
Today's date is ${newDate().toLocaleDateString()}`;
systemPrompt =`You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
268
+
Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.
269
+
Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).
270
+
Be critical about the question but consider the ENTIRE sequence when making your determination.
271
+
Today's date is ${newDate().toLocaleDateString()}`,
272
+
}=options;
273
+
274
+
if(!question){
275
+
thrownewError("Question cannot be an empty string");
276
+
}
277
+
278
+
if(!screenshots||screenshots.length===0){
279
+
thrownewError("At least one screenshot must be provided");
text: `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${awaitstagehand.page.title()}`,
captureOnNavigation: true,// Also capture on page navigation
44
+
});
45
+
46
+
screenshotCollector.start();
47
+
38
48
awaitagent.execute({
39
49
instruction: params.ques,
40
50
maxSteps: 50,
41
51
});
42
52
53
+
// Stop collecting and get all screenshots
54
+
constscreenshots=screenshotCollector.stop();
55
+
56
+
logger.log({
57
+
category: "evaluation",
58
+
message: `Collected ${screenshots.length} screenshots for evaluation`,
59
+
level: 1,
60
+
});
61
+
43
62
constevaluator=newEvaluator(stagehand);
44
63
constevalResult=awaitevaluator.ask({
45
-
question: `Did the agent successfully complete this task: "${params.ques}"? Look at the current state of the page to verify if the task was completed successfully.`,
46
-
screenshot: true,
64
+
question: `Did the agent successfully complete this task: "${params.ques}"? Look at all the screenshots showing the progression of the task to verify if it was completed successfully.`,
0 commit comments