diff --git a/.changeset/rich-colts-march.md b/.changeset/rich-colts-march.md
new file mode 100644
index 00000000..dc1ddc6d
--- /dev/null
+++ b/.changeset/rich-colts-march.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Add more evals for stagehand agent
diff --git a/evals/evals.config.json b/evals/evals.config.json
index 9d3c7ccb..e5641a01 100644
--- a/evals/evals.config.json
+++ b/evals/evals.config.json
@@ -323,6 +323,34 @@
       "name": "agent/google_flights",
       "categories": ["agent"]
     },
+    {
+      "name": "agent/github_react_version",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/steam_games",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/ubereats",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/kith",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/apple_tv",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/apple_trade_in",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/arxiv_gpt_report",
+      "categories": ["agent"]
+    },
     {
       "name": "agent/sf_library_card",
       "categories": ["agent"]
@@ -331,6 +359,14 @@
       "name": "agent/sf_library_card_multiple",
       "categories": ["agent"]
     },
+    {
+      "name": "agent/hugging_face",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/google_maps_3",
+      "categories": ["agent"]
+    },
     {
       "name": "login",
       "categories": ["act", "regression"]
@@ -423,5 +459,26 @@
       "name": "namespace_xpath",
       "categories": ["act"]
     }
+    ,
+    {
+      "name": "agent/nba_trades",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/hotel_booking",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/github",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/all_recipes",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/google_shopping",
+      "categories": ["agent"]
+    }
   ]
 }
diff --git a/evals/initStagehand.ts b/evals/initStagehand.ts
index 16ff63da..cf8de26c 100644
--- a/evals/initStagehand.ts
+++ b/evals/initStagehand.ts
@@ -18,6 +18,7 @@ import {
 } from "@browserbasehq/stagehand";
 import { EvalLogger } from "./logger";
 import type { StagehandInitResult } from "@/types/evals";
+import type { AgentConfig } from "@/dist";
 import { AvailableModel } from "@browserbasehq/stagehand";
 
 /**
@@ -104,6 +105,19 @@ export const initStagehand = async ({
   // Set navigation timeout to 60 seconds for evaluations
   stagehand.context.setDefaultNavigationTimeout(60_000);
 
+  const isCUAModel = (model: string): boolean =>
+    model.includes("computer-use-preview") || model.startsWith("claude");
+
+  let agentConfig: AgentConfig | undefined;
+  if (isCUAModel(modelName)) {
+    agentConfig = {
+      model: modelName,
+      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
+    } as AgentConfig;
+  }
+
+  const agent = stagehand.agent(agentConfig);
+
   return {
     stagehand,
     stagehandConfig: config,
@@ -111,5 +125,6 @@ export const initStagehand = async ({
     debugUrl,
     sessionUrl,
     modelName,
+    agent,
   };
 };
diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts
index 238f849c..d67fbebc 100644
--- a/evals/taskConfig.ts
+++ b/evals/taskConfig.ts
@@ -106,7 +106,7 @@ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
 
 const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS
   ? process.env.EVAL_AGENT_MODELS.split(",")
-  : ["computer-use-preview-2025-03-11", "claude-3-7-sonnet-latest"];
+  : ["computer-use-preview-2025-03-11", "claude-sonnet-4-20250514"];
 
 /**
  * getModelList:
diff --git a/evals/tasks/agent/all_recipes.ts b/evals/tasks/agent/all_recipes.ts
new file mode 100644
index 00000000..c416b4c5
--- /dev/null
+++ b/evals/tasks/agent/all_recipes.ts
@@ -0,0 +1,61 @@
+import { Evaluator } from "@/evals/evaluator";
+import { EvalFunction } from "@/types/evals";
+
+export const all_recipes: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    await stagehand.page.goto("https://www.allrecipes.com/");
+    const evaluator = new Evaluator(stagehand);
+    const agentResult = await agent.execute({
+      instruction:
+        "Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
+      maxSteps: 20,
+    });
+
+    const { evaluation, reasoning } = await evaluator.evaluate({
+      question: "Did the agent find a recipe for Beef Wellington",
+    });
+
+    logger.log(agentResult);
+
+    const success =
+      agentResult.success &&
+      evaluation === "YES" &&
+      stagehand.page.url() ===
+        "https://www.allrecipes.com/recipe/16899/beef-wellington/";
+
+    if (!success) {
+      return {
+        _success: false,
+        message: reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    return {
+      _success: false,
+      error,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    } as unknown as ReturnType<EvalFunction> extends Promise<infer R>
+      ? R
+      : never;
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/apple_trade_in.ts b/evals/tasks/agent/apple_trade_in.ts
new file mode 100644
index 00000000..0cd25154
--- /dev/null
+++ b/evals/tasks/agent/apple_trade_in.ts
@@ -0,0 +1,60 @@
+// This eval is expected to fail due to issues scrolling within the trade-in dialog.
+import { EvalFunction } from "@/types/evals";
+import { z } from "zod";
+
+export const apple_trade_in: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    await stagehand.page.goto("https://www.apple.com/shop/trade-in");
+    const agentResult = await agent.execute({
+      instruction:
+        "Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
+      maxSteps: 30,
+    });
+
+    const { tradeInValue } = await stagehand.page.extract({
+      modelName: "google/gemini-2.5-flash",
+      instruction:
+        "Extract the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website. it will be inside this text : Get x trade-in credit toward a new iPhone', provide just the number",
+      schema: z.object({
+        tradeInValue: z.number(),
+      }),
+    });
+
+    const success =
+      agentResult.success &&
+      tradeInValue === 360 &&
+      stagehand.page.url().includes("https://www.apple.com/shop/trade-in");
+
+    if (!success) {
+      return {
+        _success: false,
+        message: agentResult.message,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    return {
+      _success: false,
+      message: error.message,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/apple_tv.ts b/evals/tasks/agent/apple_tv.ts
new file mode 100644
index 00000000..7cab4c3a
--- /dev/null
+++ b/evals/tasks/agent/apple_tv.ts
@@ -0,0 +1,61 @@
+import { EvalFunction } from "@/types/evals";
+import { z } from "zod";
+
+export const apple_tv: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    await stagehand.page.goto("https://www.apple.com/");
+
+    const agentResult = await agent.execute({
+      instruction:
+        "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.",
+      maxSteps: 30,
+    });
+
+    const { height, width } = await stagehand.page.extract({
+      modelName: "google/gemini-2.5-flash",
+      instruction: "Extract the size and weight of the Apple TV 4K",
+      schema: z.object({
+        height: z.number().describe("The height of the Apple TV 4K in inches"),
+        width: z.number().describe("The width of the Apple TV 4K in inches"),
+      }),
+    });
+
+    const success =
+      agentResult.success &&
+      height === 1.2 &&
+      width === 3.66 &&
+      stagehand.page.url().includes("https://www.apple.com/apple-tv-4k/specs/");
+
+    if (!success) {
+      return {
+        _success: false,
+        message: agentResult.message,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    return {
+      _success: false,
+      message: error.message,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/arxiv_gpt_report.ts b/evals/tasks/agent/arxiv_gpt_report.ts
new file mode 100644
index 00000000..578bbb53
--- /dev/null
+++ b/evals/tasks/agent/arxiv_gpt_report.ts
@@ -0,0 +1,60 @@
+// The agent often fails on this eval.
+import { EvalFunction } from "@/types/evals";
+import { z } from "zod";
+export const arxiv_gpt_report: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    await stagehand.page.goto("https://arxiv.org/");
+
+    const agentResult = await agent.execute({
+      instruction:
+        "Find the paper 'GPT-4 Technical Report', when was v3 submitted?",
+      maxSteps: 20,
+    });
+
+    // Expected v3 submission timestamp: Mon, 27 Mar 2023 17:46:54 UTC
+    const { date } = await stagehand.page.extract({
+      modelName: "google/gemini-2.5-flash",
+      instruction:
+        "Extract the date of the v3 submission history, it should be in the format 'MM-DD-YYYY'",
+      schema: z.object({
+        date: z.string().describe("The date of the v3 submission history"),
+      }),
+    });
+
+    console.log(`date: ${date}`);
+
+    const success = agentResult.success && date === "03-27-2023";
+
+    if (!success) {
+      return {
+        _success: false,
+        message: agentResult.message,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    return {
+      _success: false,
+      message: error.message,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/github.ts b/evals/tasks/agent/github.ts
new file mode 100644
index 00000000..795ec670
--- /dev/null
+++ b/evals/tasks/agent/github.ts
@@ -0,0 +1,56 @@
+import { EvalFunction } from "@/types/evals";
+import { Evaluator } from "@/evals/evaluator";
+export const github: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    await stagehand.page.goto("https://github.com/");
+    const evaluator = new Evaluator(stagehand);
+    const agentResult = await agent.execute({
+      instruction:
+        "Find a Ruby repository on GitHub that has been updated in the past 3 days and has at least 1000 stars.",
+      maxSteps: 14,
+    });
+    logger.log(agentResult);
+
+    const { evaluation, reasoning } = await evaluator.evaluate({
+      question:
+        "Ruby repository on GitHub that has been updated in the past 3 days and has at least 1000 stars.",
+    });
+
+    const success = agentResult.success && evaluation === "YES";
+
+    if (!success) {
+      return {
+        _success: false,
+        message: reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    return {
+      _success: false,
+      error,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    } as unknown as ReturnType<EvalFunction> extends Promise<infer R>
+      ? R
+      : never;
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/github_react_version.ts b/evals/tasks/agent/github_react_version.ts
new file mode 100644
index 00000000..cbd29001
--- /dev/null
+++ b/evals/tasks/agent/github_react_version.ts
@@ -0,0 +1,52 @@
+import { EvalFunction } from "@/types/evals";
+import { Evaluator } from "@/evals/evaluator";
+export const github_react_version: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    const evaluator = new Evaluator(stagehand);
+    await stagehand.page.goto("https://github.com/");
+    await agent.execute({
+      instruction:
+        "Check the latest release version of React and the date it was published. ",
+      maxSteps: 20,
+    });
+    const { evaluation, reasoning } = await evaluator.evaluate({
+      question:
+        "Does the page show the latest version of react and the date it was published",
+    });
+    console.log(`evaluation: ${evaluation}`);
+    console.log(`reasoning: ${reasoning}`);
+    // rely only on the evaluator's verdict for now, as using extract on the version is prone to breaking in future
+    const success = evaluation === "YES";
+    if (!success) {
+      return {
+        _success: false,
+        message: reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    return {
+      _success: false,
+      message: error.message,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/google_flights.ts b/evals/tasks/agent/google_flights.ts
index a5785d73..b507ef1d 100644
--- a/evals/tasks/agent/google_flights.ts
+++ b/evals/tasks/agent/google_flights.ts
@@ -6,17 +6,11 @@ export const google_flights: EvalFunction = async ({
   sessionUrl,
   stagehand,
   logger,
-  modelName,
+  agent,
 }) => {
   try {
     await stagehand.page.goto("https://google.com/travel/flights");
 
-    const agent = stagehand.agent({
-      model: modelName,
-      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
-      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. Today is ${new Date().toISOString().slice(0, 10)}. The current page is ${await stagehand.page.title()}`,
-    });
-
     const agentResult = await agent.execute({
       instruction:
         "Search for flights from San Francisco to New York for next weekend",
diff --git a/evals/tasks/agent/google_maps.ts b/evals/tasks/agent/google_maps.ts
index 13d7389e..9ac140e3 100644
--- a/evals/tasks/agent/google_maps.ts
+++ b/evals/tasks/agent/google_maps.ts
@@ -6,17 +6,11 @@ export const google_maps: EvalFunction = async ({
   sessionUrl,
   stagehand,
   logger,
-  modelName,
+  agent,
 }) => {
   try {
     await stagehand.page.goto("https://maps.google.com");
 
-    const agent = stagehand.agent({
-      model: modelName,
-      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
-      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${await stagehand.page.title()}`,
-    });
-
     const agentResult = await agent.execute({
       instruction:
         "How long does it take to get from San Francisco to New York driving?",
diff --git a/evals/tasks/agent/google_maps_2.ts b/evals/tasks/agent/google_maps_2.ts
index e80478fa..05ce5dbd 100644
--- a/evals/tasks/agent/google_maps_2.ts
+++ b/evals/tasks/agent/google_maps_2.ts
@@ -7,17 +7,11 @@ export const google_maps_2: EvalFunction = async ({
   sessionUrl,
   stagehand,
   logger,
-  modelName,
+  agent,
 }) => {
   try {
     await stagehand.page.goto("https://maps.google.com");
 
-    const agent = stagehand.agent({
-      model: modelName,
-      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
-      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${await stagehand.page.title()}`,
-    });
-
     const agentResult = await agent.execute({
       instruction:
         "Search for the fastest walking route from La Puerta de Alcalá to La Puerta del Sol",
diff --git a/evals/tasks/agent/google_maps_3.ts b/evals/tasks/agent/google_maps_3.ts
new file mode 100644
index 00000000..fe47b4ec
--- /dev/null
+++ b/evals/tasks/agent/google_maps_3.ts
@@ -0,0 +1,52 @@
+import { EvalFunction } from "@/types/evals";
+import { Evaluator } from "@/evals/evaluator";
+export const google_maps_3: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    await stagehand.page.goto("https://maps.google.com/");
+    const evaluator = new Evaluator(stagehand);
+    const agentResult = await agent.execute({
+      instruction:
+        "Search for locksmiths open now but not open 24 hours in Texas City.",
+      maxSteps: 30,
+    });
+
+    const { evaluation, reasoning } = await evaluator.evaluate({
+      question:
+        "Does the page show a locksmiths open now but not open 24 hours in Texas City?",
+    });
+
+    const success = agentResult.success && evaluation === "YES";
+
+    if (!success) {
+      return {
+        _success: false,
+        message: reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    return {
+      _success: false,
+      message: error.message,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/google_shopping.ts b/evals/tasks/agent/google_shopping.ts
new file mode 100644
index 00000000..b557f4cf
--- /dev/null
+++ b/evals/tasks/agent/google_shopping.ts
@@ -0,0 +1,58 @@
+import { Evaluator } from "@/evals/evaluator";
+import { EvalFunction } from "@/types/evals";
+
+export const google_shopping: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    await stagehand.page.goto("https://www.google.com/shopping");
+
+    const agentResult = await agent.execute({
+      instruction:
+        "Find a drip coffee maker that is on sale and within $25-60 and has a black finish",
+      maxSteps: 20,
+    });
+    logger.log(agentResult);
+
+    const evaluator = new Evaluator(stagehand);
+    const { evaluation, reasoning } = await evaluator.evaluate({
+      question:
+        "Does the page show a drip coffee maker that is on sale and within $25-60 and has a black finish?",
+    });
+
+    const success = agentResult.success && evaluation === "YES";
+
+    if (!success) {
+      return {
+        _success: false,
+        message: reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    return {
+      _success: false,
+      error,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    } as unknown as ReturnType<EvalFunction> extends Promise<infer R>
+      ? R
+      : never;
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/hotel_booking.ts b/evals/tasks/agent/hotel_booking.ts
new file mode 100644
index 00000000..292b691e
--- /dev/null
+++ b/evals/tasks/agent/hotel_booking.ts
@@ -0,0 +1,58 @@
+// This eval is expected to fail.
+import { EvalFunction } from "@/types/evals";
+import { Evaluator } from "@/evals/evaluator";
+export const hotel_booking: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    await stagehand.page.goto("https://www.booking.com/");
+
+    const agentResult = await agent.execute({
+      instruction:
+        "Find a hotel in Sydney with a rating of 8 or higher, providing free Wi-Fi and parking, available for a four-night stay starting on December 10, 2025.",
+      maxSteps: 20,
+    });
+    logger.log(agentResult);
+
+    const evaluator = new Evaluator(stagehand);
+    const { evaluation, reasoning } = await evaluator.evaluate({
+      question:
+        "Does the page show a hotel in Sydney with a rating of 8 or higher, providing free Wi-Fi and parking, available for a four-night stay starting on December 10, 2025?",
+    });
+
+    const success = agentResult.success && evaluation === "YES";
+
+    if (!success) {
+      return {
+        _success: false,
+        message: reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    return {
+      _success: false,
+      error,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    } as unknown as ReturnType<EvalFunction> extends Promise<infer R>
+      ? R
+      : never;
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/hugging_face.ts b/evals/tasks/agent/hugging_face.ts
new file mode 100644
index 00000000..d7e1e26a
--- /dev/null
+++ b/evals/tasks/agent/hugging_face.ts
@@ -0,0 +1,54 @@
+import { EvalFunction } from "@/types/evals";
+import { z } from "zod";
+
+export const hugging_face: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    await stagehand.page.goto("https://huggingface.co/");
+    const agentResult = await agent.execute({
+      instruction:
+        "Search for a model on Hugging Face with an Apache-2.0 license that has received the highest number of likes.",
+      maxSteps: 15,
+    });
+
+    const { modelName } = await stagehand.page.extract({
+      modelName: "google/gemini-2.5-flash",
+      instruction: "Extract the name of the model",
+      schema: z.object({
+        modelName: z.string(),
+      }),
+    });
+    console.log(`modelName: ${modelName}`);
+    const success = agentResult.success && modelName === "Kokoro-82M";
+    if (!success) {
+      return {
+        _success: false,
+        message: agentResult.message,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    return {
+      _success: false,
+      message: error.message,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/iframe_form.ts b/evals/tasks/agent/iframe_form.ts
index a24057f5..056668be 100644
--- a/evals/tasks/agent/iframe_form.ts
+++ b/evals/tasks/agent/iframe_form.ts
@@ -6,16 +6,11 @@ export const iframe_form: EvalFunction = async ({
   sessionUrl,
   stagehand,
   logger,
-  modelName,
+  agent,
 }) => {
   try {
     await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");
 
-    const agent = stagehand.agent({
-      provider: "anthropic",
-      model: modelName,
-    });
-
     const agentResult = await agent.execute({
       instruction: "Fill in the form name with 'John Smith'",
       maxSteps: 3,
diff --git a/evals/tasks/agent/iframe_form_multiple.ts b/evals/tasks/agent/iframe_form_multiple.ts
index 0b2f4854..cad7c9a3 100644
--- a/evals/tasks/agent/iframe_form_multiple.ts
+++ b/evals/tasks/agent/iframe_form_multiple.ts
@@ -6,16 +6,11 @@ export const iframe_form_multiple: EvalFunction = async ({
   sessionUrl,
   stagehand,
   logger,
-  modelName,
+  agent,
 }) => {
   try {
     await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");
 
-    const agent = stagehand.agent({
-      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
-      model: modelName,
-    });
-
     const agentResult = await agent.execute({
       instruction:
         "Fill in the form name with 'John Smith', the email with 'john.smith@example.com', and select the 'Are you the domain owner?' option as 'No'",
diff --git a/evals/tasks/agent/kayak.ts b/evals/tasks/agent/kayak.ts
index e176a241..464ae3e1 100644
--- a/evals/tasks/agent/kayak.ts
+++ b/evals/tasks/agent/kayak.ts
@@ -6,18 +6,12 @@ export const kayak: EvalFunction = async ({
   sessionUrl,
   stagehand,
   logger,
+  agent,
 }) => {
   try {
     const evaluator = new Evaluator(stagehand);
     await stagehand.page.goto("https://www.kayak.com");
-    const agent = stagehand.agent({
-      provider: "openai",
-      model: "computer-use-preview",
-      instructions: `You are a helpful assistant that can help me find flights. DON'T ASK FOLLOW UP QUESTIONS UNTIL YOU HAVE FULFILLED THE USER'S REQUEST. Today is ${new Date().toLocaleDateString()}.`,
-      options: {
-        apiKey: process.env.OPENAI_API_KEY,
-      },
-    });
+
     await agent.execute({
       instruction: "Find flights from San Francisco to Tokyo next week",
       maxSteps: 15,
diff --git a/evals/tasks/agent/kith.ts b/evals/tasks/agent/kith.ts
new file mode 100644
index 00000000..d796374c
--- /dev/null
+++ b/evals/tasks/agent/kith.ts
@@ -0,0 +1,78 @@
+import { EvalFunction } from "@/types/evals";
+import { Evaluator } from "@/evals/evaluator";
+
+export const kith: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    const evaluator = new Evaluator(stagehand);
+    await stagehand.page.goto(
+      "https://kith.com/collections/nike-air-force-1/products/nkcw2288-111?variant=19439468707968",
+    );
+
+    await agent.execute({
+      instruction:
+        "add the shoes to cart, go to checkout, and fill the delivery information",
+      maxSteps: 25,
+    });
+
+    const { evaluation, reasoning } = await evaluator.evaluate({
+      question: "Did the agent fill the delivery information",
+    });
+
+    const success = evaluation === "YES";
+
+    if (success) {
+      await agent.execute({
+        instruction: "fill the credit card information",
+        maxSteps: 10,
+      });
+
+      const { evaluation: evaluation2, reasoning: reasoning2 } =
+        await evaluator.evaluate({
+          question: "Did the agent fill the payment information",
+        });
+
+      const success2 = evaluation2 === "YES";
+
+      if (success2) {
+        return {
+          _success: true,
+          debugUrl,
+          sessionUrl,
+          logs: logger.getLogs(),
+        };
+      } else {
+        return {
+          _success: false,
+          message: reasoning2,
+          debugUrl,
+          sessionUrl,
+          logs: logger.getLogs(),
+        };
+      }
+    } else {
+      return {
+        _success: false,
+        message: reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+  } catch (error) {
+    return {
+      _success: false,
+      message: error.message,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/nba_trades.ts b/evals/tasks/agent/nba_trades.ts
new file mode 100644
index 00000000..861f1d99
--- /dev/null
+++ b/evals/tasks/agent/nba_trades.ts
@@ -0,0 +1,59 @@
+import { EvalFunction } from "@/types/evals";
+import { Evaluator } from "@/evals/evaluator";
+export const nba_trades: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    const evaluator = new Evaluator(stagehand);
+    await stagehand.page.goto("https://www.espn.com/");
+
+    const agentResult = await agent.execute({
+      instruction:
+        "Find the latest Team transaction in the NBA within the past week.",
+      maxSteps: 20,
+    });
+    logger.log(agentResult);
+
+    const { evaluation, reasoning } = await evaluator.evaluate({
+      question: "Did the agent make it to the nba transactions page?",
+    });
+
+    const success =
+      agentResult.success &&
+      stagehand.page.url() === "https://www.espn.com/nba/transactions" &&
+      evaluation === "YES";
+
+    if (!success) {
+      return {
+        _success: false,
+        message: reasoning,
+        debugUrl,
+        sessionUrl,
+        logs: logger.getLogs(),
+      };
+    }
+
+    return {
+      _success: true,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } catch (error) {
+    return {
+      _success: false,
+      error,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    } as unknown as ReturnType<EvalFunction> extends Promise<infer R>
+      ? R
+      : never;
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/sf_library_card.ts b/evals/tasks/agent/sf_library_card.ts
index a31fb1cd..e29c9d14 100644
--- a/evals/tasks/agent/sf_library_card.ts
+++ b/evals/tasks/agent/sf_library_card.ts
@@ -6,17 +6,11 @@ export const sf_library_card: EvalFunction = async ({
   sessionUrl,
   stagehand,
   logger,
-  modelName,
+  agent,
 }) => {
   try {
     await stagehand.page.goto("https://sflib1.sfpl.org/selfreg");
 
-    const agent = stagehand.agent({
-      model: modelName,
-      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
-      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${await stagehand.page.title()}`,
-    });
-
     const agentResult = await agent.execute({
       instruction:
         "Fill in the 'Residential Address' field with '166 Geary St'",
diff --git a/evals/tasks/agent/sf_library_card_multiple.ts b/evals/tasks/agent/sf_library_card_multiple.ts
index 367ff763..8ea952fe 100644
--- a/evals/tasks/agent/sf_library_card_multiple.ts
+++ b/evals/tasks/agent/sf_library_card_multiple.ts
@@ -6,17 +6,11 @@ export const sf_library_card_multiple: EvalFunction = async ({
   sessionUrl,
   stagehand,
   logger,
-  modelName,
+  agent,
 }) => {
   try {
     await stagehand.page.goto("https://sflib1.sfpl.org/selfreg");
 
-    const agent = stagehand.agent({
-      model: modelName,
-      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
-      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${await stagehand.page.title()}`,
-    });
-
     const agentResult = await agent.execute({
       instruction:
         "Fill in ALL the required fields with mock data. DO NOT submit the form",
diff --git a/evals/tasks/agent/sign_in.ts b/evals/tasks/agent/sign_in.ts
index d03ae211..487cc760 100644
--- a/evals/tasks/agent/sign_in.ts
+++ b/evals/tasks/agent/sign_in.ts
@@ -5,17 +5,11 @@ export const sign_in: EvalFunction = async ({
   sessionUrl,
   stagehand,
   logger,
-  modelName,
+  agent,
 }) => {
   try {
     await stagehand.page.goto("https://v0-modern-login-flow.vercel.app/");
 
-    const agent = stagehand.agent({
-      model: modelName,
-      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
-      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${await stagehand.page.title()}`,
-    });
-
     const agentResult = await agent.execute({
       instruction:
         "Sign in with the email address 'test@browserbaser.com' and the password 'stagehand=goated' ",
diff --git a/evals/tasks/agent/steam_games.ts b/evals/tasks/agent/steam_games.ts
new file mode 100644
index 00000000..a7ad41da
--- /dev/null
+++ b/evals/tasks/agent/steam_games.ts
@@ -0,0 +1,50 @@
+import { EvalFunction } from "@/types/evals";
+
+/**
+ * Agent eval: from the Steam store front page, ask the agent to show the
+ * most played games and report the number of players currently in game.
+ *
+ * Passes when the agent reports success and the final URL still contains
+ * "https://store.steampowered.com/". Errors thrown before cleanup are
+ * returned as a failed result instead of propagating to the caller.
+ * `stagehand` is always closed at the end.
+ */
+export const steam_games: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  // Fields shared by every result; logs are read when the result is built.
+  const report = () => ({ debugUrl, sessionUrl, logs: logger.getLogs() });
+
+  try {
+    await stagehand.page.goto("https://store.steampowered.com/");
+
+    const agentResult = await agent.execute({
+      instruction:
+        "Show most played games in Steam. And tell me the number of players in In game at this time",
+      maxSteps: 30,
+    });
+
+    // Only the final URL is checked (no extract): the top games and player
+    // counts vary, so there is no stable value to assert on.
+    const success =
+      agentResult.success &&
+      stagehand.page.url().includes("https://store.steampowered.com/");
+
+    if (!success) {
+      return { _success: false, message: agentResult.message, ...report() };
+    }
+    return { _success: true, ...report() };
+  } catch (error) {
+    // A thrown value is not guaranteed to be an Error, so narrow before
+    // reading `.message` and fall back to its string form otherwise.
+    const message = error instanceof Error ? error.message : String(error);
+    return { _success: false, message, ...report() };
+  } finally {
+    // Release the browser session whether the eval passed, failed, or threw.
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/trivago.ts b/evals/tasks/agent/trivago.ts
index 3b71c80f..847bb77d 100644
--- a/evals/tasks/agent/trivago.ts
+++ b/evals/tasks/agent/trivago.ts
@@ -5,17 +5,11 @@ export const trivago: EvalFunction = async ({
   sessionUrl,
   stagehand,
   logger,
-  modelName,
+  agent,
 }) => {
   try {
     await stagehand.page.goto("https://www.trivago.com/");
 
-    const agent = stagehand.agent({
-      model: modelName,
-      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
-      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. Today is ${new Date().toISOString().slice(0, 10)}.The current page is ${await stagehand.page.title()}`,
-    });
-
     const agentResult = await agent.execute({
       instruction:
         "Find the cheapest room in the hotel H10 Tribeca in Madrid next weekend. Stop at the trivago page showing the results",
diff --git a/evals/tasks/agent/ubereats.ts b/evals/tasks/agent/ubereats.ts
new file mode 100644
index 00000000..ee5d6720
--- /dev/null
+++ b/evals/tasks/agent/ubereats.ts
@@ -0,0 +1,54 @@
+import { EvalFunction } from "@/types/evals";
+import { Evaluator } from "@/evals/evaluator";
+
+/**
+ * Agent eval: ask the agent to order a pizza on Uber Eats for delivery to
+ * 639 Geary St, stopping once the login page appears after viewing the cart.
+ *
+ * The agent's own result is ignored. The eval passes only when the Evaluator
+ * answers "YES" to whether the login page was reached and the current URL
+ * contains "https://auth.uber.com/". Errors thrown before cleanup are
+ * returned as a failed result. `stagehand` is always closed at the end.
+ */
+export const ubereats: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  // Fields shared by every result; logs are read when the result is built.
+  const report = () => ({ debugUrl, sessionUrl, logs: logger.getLogs() });
+
+  try {
+    const evaluator = new Evaluator(stagehand);
+    await stagehand.page.goto("https://www.ubereats.com/");
+
+    await agent.execute({
+      instruction:
+        "Order a pizza from ubereats to 639 geary st in sf, call the task complete once the login page is shown after adding pizza and viewing the cart",
+      maxSteps: 30,
+    });
+
+    const { evaluation, reasoning } = await evaluator.evaluate({
+      question: "Did the agent make it to the login page?",
+    });
+
+    // Both signals must agree: the Evaluator's verdict and the auth URL.
+    const success =
+      evaluation === "YES" &&
+      stagehand.page.url().includes("https://auth.uber.com/");
+    if (!success) {
+      return { _success: false, message: reasoning, ...report() };
+    }
+    return { _success: true, ...report() };
+  } catch (error) {
+    // A thrown value is not guaranteed to be an Error, so narrow before
+    // reading `.message` and fall back to its string form otherwise.
+    const message = error instanceof Error ? error.message : String(error);
+    return { _success: false, message, ...report() };
+  } finally {
+    // Release the browser session whether the eval passed, failed, or threw.
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/youtube.ts b/evals/tasks/agent/youtube.ts
index f3b755bc..02d10020 100644
--- a/evals/tasks/agent/youtube.ts
+++ b/evals/tasks/agent/youtube.ts
@@ -5,17 +5,11 @@ export const youtube: EvalFunction = async ({
   sessionUrl,
   stagehand,
   logger,
-  modelName,
+  agent,
 }) => {
   try {
     await stagehand.page.goto("https://youtube.com");
 
-    const agent = stagehand.agent({
-      model: modelName,
-      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
-      instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${await stagehand.page.title()}`,
-    });
-
     const agentResult = await agent.execute({
       instruction:
         "Search for Keinemusik's set under some very famous pointy landmarks",
diff --git a/types/evals.ts b/types/evals.ts
index d97e2796..51742896 100644
--- a/types/evals.ts
+++ b/types/evals.ts
@@ -13,6 +13,7 @@ export type StagehandInitResult = {
   sessionUrl: string;
   stagehandConfig: ConstructorParams;
   modelName: AvailableModel;
+  agent: ReturnType<Stagehand["agent"]>;
 };
 
 export type EvalFunction = (taskInput: StagehandInitResult) => Promise<{