browserbase
diff --git a/evals/tasks/agent/google_flights.ts b/evals/tasks/agent/google_flights.ts
Lines changed: 49 additions & 40 deletions
diff --git a/evals/tasks/agent/iframe_form.ts b/evals/tasks/agent/iframe_form.ts
Lines changed: 66 additions & 58 deletions
diff --git a/evals/tasks/agent/iframe_form_multiple.ts b/evals/tasks/agent/iframe_form_multiple.ts
Lines changed: 59 additions & 50 deletions
diff --git a/evals/tasks/agent/kayak.ts b/evals/tasks/agent/kayak.ts
Lines changed: 1 addition & 1 deletion
@@ -8,56 +8,65 @@ export const google_flights: EvalFunction = async ({
   logger,
   modelName,
 }) => {
-  await stagehand.page.goto("https://google.com/travel/flights");
+  try {
+    await stagehand.page.goto("https://google.com/travel/flights");
 
-  const agent = stagehand.agent({
-    model: modelName,
-    provider: modelName.startsWith("claude") ? "anthropic" : "openai",
-    instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. Today is ${new Date().toISOString().slice(0, 10)}. The current page is ${await stagehand.page.title()}`,
-  });
+    const agent = stagehand.agent({
+      model: modelName,
+      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
+      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. Today is ${new Date().toISOString().slice(0, 10)}. The current page is ${await stagehand.page.title()}`,
+    });
 
-  const agentResult = await agent.execute({
-    instruction:
-      "Search for flights from San Francisco to New York for next weekend",
-    maxSteps: 15,
-  });
-  logger.log(agentResult);
+    const agentResult = await agent.execute({
+      instruction:
+        "Search for flights from San Francisco to New York for next weekend",
+      maxSteps: 15,
+    });
+    logger.log(agentResult);
 
-  const evaluator = new Evaluator(stagehand);
-  const result = await evaluator.evaluate({
-    question:
-      "Does the page show flights (options, available flights, not a search form) from San Francisco to New York?",
-    strictResponse: true,
-  });
+    const evaluator = new Evaluator(stagehand);
+    const result = await evaluator.evaluate({
+      question:
+        "Does the page show flights (options, available flights, not a search form) from San Francisco to New York?",
+      strictResponse: true,
+    });
 
-  if (result.evaluation !== "YES" && result.evaluation !== "NO") {
-    await stagehand.close();
-    return {
-      _success: false,
-      observations: "Evaluator provided an invalid response",
-      debugUrl,
-      sessionUrl,
-      logs: logger.getLogs(),
-    };
-  }
+    if (result.evaluation !== "YES" && result.evaluation !== "NO") {
+      return {
+        _success: false,
+        observations: "Evaluator provided an invalid response",
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
 
-  if (result.evaluation === "YES") {
-    await stagehand.close();
-    return {
-      _success: true,
-      observations: result.reasoning,
-      debugUrl,
-      sessionUrl,
-      logs: logger.getLogs(),
-    };
-  } else {
-    await stagehand.close();
+    if (result.evaluation === "YES") {
+      return {
+        _success: true,
+        observations: result.reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    } else {
+      return {
+        _success: false,
+        observations: result.reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+  } catch (error) {
     return {
       _success: false,
-      observations: result.reasoning,
+      error: error,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
     };
+  } finally {
+    await stagehand.close();
   }
 };
@@ -8,77 +8,85 @@ export const iframe_form: EvalFunction = async ({
   logger,
   modelName,
 }) => {
-  await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");
+  try {
+    await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");
 
-  const agent = stagehand.agent({
-    provider: "anthropic",
-    model: modelName,
-  });
+    const agent = stagehand.agent({
+      provider: "anthropic",
+      model: modelName,
+    });
 
-  const agentResult = await agent.execute({
-    instruction: "Fill in the form name with 'John Smith'",
-    maxSteps: 3,
-  });
-  logger.log(agentResult);
+    const agentResult = await agent.execute({
+      instruction: "Fill in the form name with 'John Smith'",
+      maxSteps: 3,
+    });
+    logger.log(agentResult);
 
-  await stagehand.page.mouse.wheel(0, -1000);
-  const evaluator = new Evaluator(stagehand);
-  const result = await evaluator.evaluate({
-    question: "Is the form name input filled with 'John Smith'?",
-    strictResponse: true,
-  });
+    await stagehand.page.mouse.wheel(0, -1000);
+    const evaluator = new Evaluator(stagehand);
+    const result = await evaluator.evaluate({
+      question: "Is the form name input filled with 'John Smith'?",
+      strictResponse: true,
+    });
 
-  if (result.evaluation !== "YES" && result.evaluation !== "NO") {
-    await stagehand.close();
-    return {
-      _success: false,
-      observations: "Evaluator provided an invalid response",
-      debugUrl,
-      sessionUrl,
-      logs: logger.getLogs(),
-    };
-  }
+    if (result.evaluation !== "YES" && result.evaluation !== "NO") {
+      return {
+        _success: false,
+        observations: "Evaluator provided an invalid response",
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
 
-  const agentResult2 = await agent.execute({
-    instruction: "Fill in the form email with '[email protected]'",
-    maxSteps: 3,
-  });
-  logger.log(agentResult2);
+    const agentResult2 = await agent.execute({
+      instruction: "Fill in the form email with '[email protected]'",
+      maxSteps: 3,
+    });
+    logger.log(agentResult2);
 
-  await stagehand.page.mouse.wheel(0, -1000);
-  const result2 = await evaluator.evaluate({
-    question: "Is the form email input filled with '[email protected]'?",
-    strictResponse: true,
-  });
+    await stagehand.page.mouse.wheel(0, -1000);
+    const result2 = await evaluator.evaluate({
+      question: "Is the form email input filled with '[email protected]'?",
+      strictResponse: true,
+    });
 
-  if (result2.evaluation !== "YES" && result2.evaluation !== "NO") {
-    await stagehand.close();
-    return {
-      _success: false,
-      observations: "Evaluator provided an invalid response",
-      debugUrl,
-      sessionUrl,
-      logs: logger.getLogs(),
-    };
-  }
+    if (result2.evaluation !== "YES" && result2.evaluation !== "NO") {
+      return {
+        _success: false,
+        observations: "Evaluator provided an invalid response",
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
 
-  if (result.evaluation === "YES" && result2.evaluation === "YES") {
-    await stagehand.close();
-    return {
-      _success: true,
-      observations: "All fields were filled correctly",
-      debugUrl,
-      sessionUrl,
-      logs: logger.getLogs(),
-    };
-  } else {
-    await stagehand.close();
+    if (result.evaluation === "YES" && result2.evaluation === "YES") {
+      return {
+        _success: true,
+        observations: "All fields were filled correctly",
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    } else {
+      return {
+        _success: false,
+        observations: "One or more fields were not filled correctly",
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+  } catch (error) {
     return {
       _success: false,
-      observations: "One or more fields were not filled correctly",
+      error: error,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
     };
+  } finally {
+    await stagehand.close();
   }
 };
@@ -8,60 +8,69 @@ export const iframe_form_multiple: EvalFunction = async ({
   logger,
   modelName,
 }) => {
-  await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");
+  try {
+    await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");
 
-  const agent = stagehand.agent({
-    provider: modelName.startsWith("claude") ? "anthropic" : "openai",
-    model: modelName,
-  });
+    const agent = stagehand.agent({
+      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
+      model: modelName,
+    });
 
-  const agentResult = await agent.execute({
-    instruction:
-      "Fill in the form name with 'John Smith', the email with '[email protected]', and select the 'Are you the domain owner?' option as 'No'",
-    maxSteps: 10,
-  });
-  logger.log(agentResult);
+    const agentResult = await agent.execute({
+      instruction:
+        "Fill in the form name with 'John Smith', the email with '[email protected]', and select the 'Are you the domain owner?' option as 'No'",
+      maxSteps: 10,
+    });
+    logger.log(agentResult);
 
-  await stagehand.page.mouse.wheel(0, -1000);
-  const evaluator = new Evaluator(stagehand);
-  const results = await evaluator.batchEvaluate({
-    questions: [
-      "Is the form name input filled with 'John Smith'?",
-      "Is the form email input filled with '[email protected]'?",
-      "Is the 'Are you the domain owner?' option selected as 'No'?",
-    ],
-    strictResponse: true,
-  });
+    await stagehand.page.mouse.wheel(0, -1000);
+    const evaluator = new Evaluator(stagehand);
+    const results = await evaluator.batchEvaluate({
+      questions: [
+        "Is the form name input filled with 'John Smith'?",
+        "Is the form email input filled with '[email protected]'?",
+        "Is the 'Are you the domain owner?' option selected as 'No'?",
+      ],
+      strictResponse: true,
+    });
 
-  for (const r of results) {
-    if (r.evaluation !== "YES" && r.evaluation !== "NO") {
-      await stagehand.close();
-      return {
-        _success: false,
-        observations: "Evaluator provided an invalid response",
-        debugUrl,
-        sessionUrl,
-        logs: logger.getLogs(),
-      };
+    for (const r of results) {
+      if (r.evaluation !== "YES" && r.evaluation !== "NO") {
+        return {
+          _success: false,
+          observations: "Evaluator provided an invalid response",
+          debugUrl,
+          sessionUrl,
+          logs: logger.getLogs(),
+        };
+      }
+      if (r.evaluation === "NO") {
+        return {
+          _success: false,
+          observations: r.reasoning,
+          debugUrl,
+          sessionUrl,
+          logs: logger.getLogs(),
+        };
+      }
     }
-    if (r.evaluation === "NO") {
-      await stagehand.close();
-      return {
-        _success: false,
-        observations: r.reasoning,
-        debugUrl,
-        sessionUrl,
-        logs: logger.getLogs(),
-      };
-    }
-  }
 
-  await stagehand.close();
-  return {
-    _success: true,
-    observations: "All fields were filled correctly",
-    debugUrl,
-    sessionUrl,
-    logs: logger.getLogs(),
-  };
+    return {
+      _success: true,
+      observations: "All fields were filled correctly",
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    return {
+      _success: false,
+      error: error,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
 };
@@ -65,6 +65,6 @@ export const kayak: EvalFunction = async ({
       logs: logger.getLogs(),
     };
   } finally {
-    stagehand.close();
+    await stagehand.close();
   }
 };
Original file line number	Diff line number	Diff line change
`@@ -65,6 +65,6 @@ export const kayak: EvalFunction = async ({`
`65`	`65`	`logs: logger.getLogs(),`
`66`	`66`	`};`
`67`	`67`	`} finally {`
`68`		`- stagehand.close();`
	`68`	`+ await stagehand.close();`
`69`	`69`	`}`
`70`	`70`	`};`