Consolidate LLM provider resolution for evals (#893)

miguelg719 · web-flow · commit f75f5fb848f3 · 2025-07-17T15:18:04.000-07:00
# why

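# what changed

- Eval model names are now provider-prefixed (`openai/gpt-4.1`, `google/gemini-2.0-flash`, `anthropic/claude-3-5-sonnet-latest`) in both the CI `EVAL_MODELS` variable and the default list in `evals/taskConfig.ts`.
- `evals/index.eval.ts` replaces the chain of per-provider name checks with a single branch: a model name containing `/` is split into provider and model and resolved through `getAISDKLanguageModel`; any other name falls back to `CustomOpenAIClient`.
- `getAISDKLanguageModel` is now exported from `lib/llm/LLMProvider.ts`, and the direct `@ai-sdk/*` provider imports are removed from `evals/index.eval.ts`.
- The "Install Playwright browsers" steps are removed from the CI workflow jobs.
- The Browserbase context tests now pass `env: "BROWSERBASE"` explicitly when constructing `Stagehand`.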

# test plan
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -11,7 +11,7 @@ on:
       - "docs/**"
 
 env:
-  EVAL_MODELS: "gpt-4.1,gemini-2.0-flash,claude-3-5-sonnet-latest"
+  EVAL_MODELS: "openai/gpt-4.1,google/gemini-2.0-flash,anthropic/claude-3-5-sonnet-latest"
   EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract"
   EVAL_MAX_CONCURRENCY: 25
   EVAL_TRIAL_COUNT: 5
@@ -179,9 +179,6 @@ jobs:
           npm i -g pnpm
           pnpm install --no-frozen-lockfile
 
-      - name: Install Playwright browsers
-        run: pnpm exec playwright install --with-deps
-
       - name: Build Stagehand
         run: pnpm run build
 
@@ -222,9 +219,6 @@ jobs:
       - name: Build Stagehand
         run: pnpm run build
 
-      - name: Install Playwright browsers
-        run: pnpm exec playwright install --with-deps
-
       - name: Run Regression Evals
         run: pnpm run evals category regression trials=2 concurrency=20 env=BROWSERBASE
 
@@ -288,10 +282,6 @@ jobs:
         if: needs.determine-evals.outputs.run-combination == 'true'
         run: pnpm run build
 
-      - name: Install Playwright browsers
-        if: needs.determine-evals.outputs.run-combination == 'true'
-        run: pnpm exec playwright install --with-deps
-
       - name: Run Combination Evals
         if: needs.determine-evals.outputs.run-combination == 'true'
         run: pnpm run evals category combination
@@ -354,10 +344,6 @@ jobs:
         if: needs.determine-evals.outputs.run-act == 'true'
         run: pnpm run build
 
-      - name: Install Playwright browsers
-        if: needs.determine-evals.outputs.run-act == 'true'
-        run: pnpm exec playwright install --with-deps
-
       - name: Run Act Evals
         if: needs.determine-evals.outputs.run-act == 'true'
         run: pnpm run evals category act
@@ -423,10 +409,6 @@ jobs:
         if: needs.determine-evals.outputs.run-extract == 'true'
         run: pnpm run build
 
-      - name: Install Playwright browsers
-        if: needs.determine-evals.outputs.run-extract == 'true'
-        run: pnpm exec playwright install --with-deps
-
       # 1. Run extract category with domExtract
       - name: Run Extract Evals (domExtract)
         if: needs.determine-evals.outputs.run-extract == 'true'
@@ -491,10 +473,6 @@ jobs:
           npm i -g pnpm
           pnpm install --no-frozen-lockfile
 
-      - name: Install Playwright browsers
-        if: needs.determine-evals.outputs.run-observe == 'true'
-        run: pnpm exec playwright install --with-deps
-
       - name: Build Stagehand
         if: needs.determine-evals.outputs.run-observe == 'true'
         run: pnpm run build
@@ -560,10 +538,6 @@ jobs:
           npm i -g pnpm
           pnpm install --no-frozen-lockfile
 
-      - name: Install Playwright browsers
-        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
-        run: pnpm exec playwright install --with-deps
-
       - name: Build Stagehand
         if: needs.determine-evals.outputs.run-targeted-extract == 'true'
         run: pnpm run build
diff --git a/evals/deterministic/tests/browserbase/contexts.test.ts b/evals/deterministic/tests/browserbase/contexts.test.ts
@@ -76,6 +76,7 @@ test.describe("Contexts", () => {
       // We will be adding cookies to the context in this session, so we need mark persist=true
       stagehand = new Stagehand({
         ...StagehandConfig,
+        env: "BROWSERBASE",
         useAPI: false,
         browserbaseSessionCreateParams: {
           projectId: BROWSERBASE_PROJECT_ID,
@@ -116,6 +117,7 @@ test.describe("Contexts", () => {
       // We don't need to persist cookies in this session, so we can mark persist=false
       const newStagehand = new Stagehand({
         ...StagehandConfig,
+        env: "BROWSERBASE",
         useAPI: false,
         browserbaseSessionCreateParams: {
           projectId: BROWSERBASE_PROJECT_ID,
diff --git a/evals/index.eval.ts b/evals/index.eval.ts
@@ -33,13 +33,9 @@ import { StagehandEvalError } from "@/types/stagehandErrors";
 import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
 import OpenAI from "openai";
 import { initStagehand } from "./initStagehand";
-import { google } from "@ai-sdk/google";
-import { anthropic } from "@ai-sdk/anthropic";
-import { groq } from "@ai-sdk/groq";
-import { cerebras } from "@ai-sdk/cerebras";
-import { openai } from "@ai-sdk/openai";
 import { AISdkClient } from "@/examples/external_clients/aisdk";
-import { xai } from "@ai-sdk/xai";
+import { getAISDKLanguageModel } from "@/lib/llm/LLMProvider";
+
 dotenv.config();
 
 /**
@@ -321,41 +317,16 @@ const generateFilteredTestcases = (): Testcase[] => {
 
           // Execute the task
           let llmClient: LLMClient;
-          if (
-            input.modelName.startsWith("gpt") ||
-            input.modelName.startsWith("o")
-          ) {
-            llmClient = new AISdkClient({
-              model: wrapAISDKModel(openai(input.modelName)),
-            });
-          } else if (input.modelName.startsWith("gemini")) {
-            llmClient = new AISdkClient({
-              model: wrapAISDKModel(google(input.modelName)),
-            });
-          } else if (input.modelName.startsWith("claude")) {
-            llmClient = new AISdkClient({
-              model: wrapAISDKModel(anthropic(input.modelName)),
-            });
-          } else if (
-            input.modelName.includes("groq") ||
-            input.modelName.includes("kimi")
-          ) {
-            llmClient = new AISdkClient({
-              model: wrapAISDKModel(groq(input.modelName)),
-            });
-          } else if (input.modelName.includes("cerebras")) {
+          if (input.modelName.includes("/")) {
             llmClient = new AISdkClient({
               model: wrapAISDKModel(
-                cerebras(
-                  input.modelName.substring(input.modelName.indexOf("/") + 1),
+                getAISDKLanguageModel(
+                  input.modelName.split("/")[0],
+                  input.modelName.split("/")[1],
                 ),
               ),
             });
-          } else if (input.modelName.startsWith("grok")) {
-            llmClient = new AISdkClient({
-              model: wrapAISDKModel(xai(input.modelName)),
-            });
-          } else if (input.modelName.includes("/")) {
+          } else {
             llmClient = new CustomOpenAIClient({
               modelName: input.modelName as AvailableModel,
               client: wrapOpenAI(
diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts
@@ -98,7 +98,11 @@ if (filterByEvalName && !tasksByName[filterByEvalName]) {
  */
 const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
   ? process.env.EVAL_MODELS.split(",")
-  : ["gemini-2.0-flash", "gpt-4.1-mini", "claude-3-5-sonnet-latest"];
+  : [
+      "google/gemini-2.0-flash",
+      "openai/gpt-4.1-mini",
+      "anthropic/claude-3-5-sonnet-latest",
+    ];
 
 const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS
   ? process.env.EVAL_AGENT_MODELS.split(",")
diff --git a/lib/llm/LLMProvider.ts b/lib/llm/LLMProvider.ts
@@ -94,7 +94,7 @@ const modelToProviderMap: { [key in AvailableModel]: ModelProvider } = {
   "gemini-2.5-pro-preview-03-25": "google",
 };
 
-function getAISDKLanguageModel(
+export function getAISDKLanguageModel(
   subProvider: string,
   subModelName: string,
   apiKey?: string,