Skip to content

Commit dbf2dec

Browse files
authored
refactor(browserbase): update evaluation status and reason handling (#1923)
1 parent d0b353c commit dbf2dec

File tree

1 file changed: +10 -75 lines changed

1 file changed: +10 -75 lines changed

apps/api/src/browserbase/browserbase.service.ts

Lines changed: 10 additions & 75 deletions
Original file line number | Diff line number | Diff line change
@@ -548,15 +548,15 @@ export class BrowserbaseService {
548548
completedAt: new Date(),
549549
durationMs: run.startedAt ? Date.now() - run.startedAt.getTime() : 0,
550550
screenshotUrl: screenshotKey,
551-
evaluationStatus: 'pass',
552-
evaluationReason: result.evaluationReason ?? 'Requirement verified',
551+
evaluationStatus: result.evaluationStatus ?? null,
552+
evaluationReason: result.evaluationReason ?? 'Screenshot captured',
553553
},
554554
});
555555

556556
return {
557557
success: true,
558558
screenshotUrl: presignedUrl,
559-
evaluationStatus: 'pass',
559+
evaluationStatus: result.evaluationStatus,
560560
evaluationReason: result.evaluationReason,
561561
};
562562
} catch (err) {
@@ -687,16 +687,16 @@ export class BrowserbaseService {
687687
completedAt: new Date(),
688688
durationMs: Date.now() - run.startedAt!.getTime(),
689689
screenshotUrl: screenshotKey,
690-
evaluationStatus: 'pass',
691-
evaluationReason: result.evaluationReason ?? 'Requirement verified',
690+
evaluationStatus: result.evaluationStatus ?? null,
691+
evaluationReason: result.evaluationReason ?? 'Screenshot captured',
692692
},
693693
});
694694

695695
return {
696696
runId: run.id,
697697
success: true,
698698
screenshotUrl: presignedUrl,
699-
evaluationStatus: 'pass',
699+
evaluationStatus: result.evaluationStatus,
700700
evaluationReason: result.evaluationReason,
701701
};
702702
} finally {
@@ -793,73 +793,7 @@ export class BrowserbaseService {
793793
// Wait for final page to settle
794794
await delay(2000);
795795

796-
// Evaluate if the automation fulfills the task requirements BEFORE taking screenshot
797-
if (taskContext) {
798-
// Re-acquire page in case the agent closed/replaced it during execution
799-
page = await this.ensureActivePage(stagehand);
800-
801-
const evaluationSchema = z.object({
802-
passes: z
803-
.boolean()
804-
.describe(
805-
'Whether the current page state shows that the requirement is fulfilled',
806-
),
807-
reason: z
808-
.string()
809-
.describe(
810-
'A brief explanation of why it passes or fails the requirement',
811-
),
812-
});
813-
814-
const evaluationPrompt = `You are evaluating whether a compliance requirement is being met.
815-
816-
Task/Requirement: "${taskContext.title}"
817-
${taskContext.description ? `Description: "${taskContext.description}"` : ''}
818-
819-
Navigation completed: "${instruction}"
820-
821-
Look at the current page and determine if the visible configuration, settings, or state demonstrates that this requirement is fulfilled.
822-
823-
For example:
824-
- If the task is about "branch protection", check if branch protection rules are visible and enabled
825-
- If the task is about "MFA/2FA", check if multi-factor authentication is shown as enabled
826-
- If the task is about "access controls", check if appropriate access restrictions are configured
827-
828-
Be strict: if the setting is disabled, not configured, or shows a warning/error state, it should FAIL.
829-
Only pass if there is clear evidence the requirement is properly configured and active.`;
830-
831-
try {
832-
const evaluation = (await stagehand.extract(
833-
evaluationPrompt,
834-
evaluationSchema as any,
835-
)) as { passes: boolean; reason: string };
836-
837-
this.logger.log(
838-
`Automation evaluation: ${evaluation.passes ? 'PASS' : 'FAIL'} - ${evaluation.reason}`,
839-
);
840-
841-
// If evaluation fails, abort without taking screenshot
842-
if (!evaluation.passes) {
843-
return {
844-
success: false,
845-
evaluationStatus: 'fail',
846-
evaluationReason: evaluation.reason,
847-
error: `Requirement not met: ${evaluation.reason}`,
848-
};
849-
}
850-
} catch (evalErr) {
851-
this.logger.warn(
852-
`Failed to evaluate automation: ${evalErr instanceof Error ? evalErr.message : String(evalErr)}`,
853-
);
854-
// If evaluation itself errors, fail the automation
855-
return {
856-
success: false,
857-
error: `Evaluation error: ${evalErr instanceof Error ? evalErr.message : 'Unknown error'}`,
858-
};
859-
}
860-
}
861-
862-
// Only take screenshot if evaluation passed (or no task context)
796+
// Always take a screenshot at the end (no pass/fail criteria gate)
863797
page = await this.ensureActivePage(stagehand);
864798
const screenshot = await page.screenshot({
865799
type: 'jpeg',
@@ -870,8 +804,9 @@ Only pass if there is clear evidence the requirement is properly configured and
870804
return {
871805
success: true,
872806
screenshot: screenshot.toString('base64'),
873-
evaluationStatus: 'pass',
874-
evaluationReason: 'Requirement verified successfully',
807+
evaluationReason: taskContext
808+
? `Navigation completed for "${taskContext.title}". Screenshot captured.`
809+
: 'Navigation completed. Screenshot captured.',
875810
};
876811
} catch (err) {
877812
this.logger.error('Failed to execute automation', err);

0 commit comments

Comments (0)