Skip to content

Commit f75f5fb

Browse files
authored
consolidate llm provider for evals (#893)
# why
# what changed
# test plan
1 parent bcb8887 commit f75f5fb

File tree

5 files changed

+16
-65
lines changed

5 files changed

+16
-65
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ on:
1111
- "docs/**"
1212

1313
env:
14-
EVAL_MODELS: "gpt-4.1,gemini-2.0-flash,claude-3-5-sonnet-latest"
14+
EVAL_MODELS: "openai/gpt-4.1,google/gemini-2.0-flash,anthropic/claude-3-5-sonnet-latest"
1515
EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract"
1616
EVAL_MAX_CONCURRENCY: 25
1717
EVAL_TRIAL_COUNT: 5
@@ -179,9 +179,6 @@ jobs:
179179
npm i -g pnpm
180180
pnpm install --no-frozen-lockfile
181181
182-
- name: Install Playwright browsers
183-
run: pnpm exec playwright install --with-deps
184-
185182
- name: Build Stagehand
186183
run: pnpm run build
187184

@@ -222,9 +219,6 @@ jobs:
222219
- name: Build Stagehand
223220
run: pnpm run build
224221

225-
- name: Install Playwright browsers
226-
run: pnpm exec playwright install --with-deps
227-
228222
- name: Run Regression Evals
229223
run: pnpm run evals category regression trials=2 concurrency=20 env=BROWSERBASE
230224

@@ -288,10 +282,6 @@ jobs:
288282
if: needs.determine-evals.outputs.run-combination == 'true'
289283
run: pnpm run build
290284

291-
- name: Install Playwright browsers
292-
if: needs.determine-evals.outputs.run-combination == 'true'
293-
run: pnpm exec playwright install --with-deps
294-
295285
- name: Run Combination Evals
296286
if: needs.determine-evals.outputs.run-combination == 'true'
297287
run: pnpm run evals category combination
@@ -354,10 +344,6 @@ jobs:
354344
if: needs.determine-evals.outputs.run-act == 'true'
355345
run: pnpm run build
356346

357-
- name: Install Playwright browsers
358-
if: needs.determine-evals.outputs.run-act == 'true'
359-
run: pnpm exec playwright install --with-deps
360-
361347
- name: Run Act Evals
362348
if: needs.determine-evals.outputs.run-act == 'true'
363349
run: pnpm run evals category act
@@ -423,10 +409,6 @@ jobs:
423409
if: needs.determine-evals.outputs.run-extract == 'true'
424410
run: pnpm run build
425411

426-
- name: Install Playwright browsers
427-
if: needs.determine-evals.outputs.run-extract == 'true'
428-
run: pnpm exec playwright install --with-deps
429-
430412
# 1. Run extract category with domExtract
431413
- name: Run Extract Evals (domExtract)
432414
if: needs.determine-evals.outputs.run-extract == 'true'
@@ -491,10 +473,6 @@ jobs:
491473
npm i -g pnpm
492474
pnpm install --no-frozen-lockfile
493475
494-
- name: Install Playwright browsers
495-
if: needs.determine-evals.outputs.run-observe == 'true'
496-
run: pnpm exec playwright install --with-deps
497-
498476
- name: Build Stagehand
499477
if: needs.determine-evals.outputs.run-observe == 'true'
500478
run: pnpm run build
@@ -560,10 +538,6 @@ jobs:
560538
npm i -g pnpm
561539
pnpm install --no-frozen-lockfile
562540
563-
- name: Install Playwright browsers
564-
if: needs.determine-evals.outputs.run-targeted-extract == 'true'
565-
run: pnpm exec playwright install --with-deps
566-
567541
- name: Build Stagehand
568542
if: needs.determine-evals.outputs.run-targeted-extract == 'true'
569543
run: pnpm run build

evals/deterministic/tests/browserbase/contexts.test.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ test.describe("Contexts", () => {
7676
// We will be adding cookies to the context in this session, so we need to mark persist=true
7777
stagehand = new Stagehand({
7878
...StagehandConfig,
79+
env: "BROWSERBASE",
7980
useAPI: false,
8081
browserbaseSessionCreateParams: {
8182
projectId: BROWSERBASE_PROJECT_ID,
@@ -116,6 +117,7 @@ test.describe("Contexts", () => {
116117
// We don't need to persist cookies in this session, so we can mark persist=false
117118
const newStagehand = new Stagehand({
118119
...StagehandConfig,
120+
env: "BROWSERBASE",
119121
useAPI: false,
120122
browserbaseSessionCreateParams: {
121123
projectId: BROWSERBASE_PROJECT_ID,

evals/index.eval.ts

Lines changed: 7 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,9 @@ import { StagehandEvalError } from "@/types/stagehandErrors";
3333
import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
3434
import OpenAI from "openai";
3535
import { initStagehand } from "./initStagehand";
36-
import { google } from "@ai-sdk/google";
37-
import { anthropic } from "@ai-sdk/anthropic";
38-
import { groq } from "@ai-sdk/groq";
39-
import { cerebras } from "@ai-sdk/cerebras";
40-
import { openai } from "@ai-sdk/openai";
4136
import { AISdkClient } from "@/examples/external_clients/aisdk";
42-
import { xai } from "@ai-sdk/xai";
37+
import { getAISDKLanguageModel } from "@/lib/llm/LLMProvider";
38+
4339
dotenv.config();
4440

4541
/**
@@ -321,41 +317,16 @@ const generateFilteredTestcases = (): Testcase[] => {
321317

322318
// Execute the task
323319
let llmClient: LLMClient;
324-
if (
325-
input.modelName.startsWith("gpt") ||
326-
input.modelName.startsWith("o")
327-
) {
328-
llmClient = new AISdkClient({
329-
model: wrapAISDKModel(openai(input.modelName)),
330-
});
331-
} else if (input.modelName.startsWith("gemini")) {
332-
llmClient = new AISdkClient({
333-
model: wrapAISDKModel(google(input.modelName)),
334-
});
335-
} else if (input.modelName.startsWith("claude")) {
336-
llmClient = new AISdkClient({
337-
model: wrapAISDKModel(anthropic(input.modelName)),
338-
});
339-
} else if (
340-
input.modelName.includes("groq") ||
341-
input.modelName.includes("kimi")
342-
) {
343-
llmClient = new AISdkClient({
344-
model: wrapAISDKModel(groq(input.modelName)),
345-
});
346-
} else if (input.modelName.includes("cerebras")) {
320+
if (input.modelName.includes("/")) {
347321
llmClient = new AISdkClient({
348322
model: wrapAISDKModel(
349-
cerebras(
350-
input.modelName.substring(input.modelName.indexOf("/") + 1),
323+
getAISDKLanguageModel(
324+
input.modelName.split("/")[0],
325+
input.modelName.split("/")[1],
351326
),
352327
),
353328
});
354-
} else if (input.modelName.startsWith("grok")) {
355-
llmClient = new AISdkClient({
356-
model: wrapAISDKModel(xai(input.modelName)),
357-
});
358-
} else if (input.modelName.includes("/")) {
329+
} else {
359330
llmClient = new CustomOpenAIClient({
360331
modelName: input.modelName as AvailableModel,
361332
client: wrapOpenAI(

evals/taskConfig.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,11 @@ if (filterByEvalName && !tasksByName[filterByEvalName]) {
9898
*/
9999
const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
100100
? process.env.EVAL_MODELS.split(",")
101-
: ["gemini-2.0-flash", "gpt-4.1-mini", "claude-3-5-sonnet-latest"];
101+
: [
102+
"google/gemini-2.0-flash",
103+
"openai/gpt-4.1-mini",
104+
"anthropic/claude-3-5-sonnet-latest",
105+
];
102106

103107
const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS
104108
? process.env.EVAL_AGENT_MODELS.split(",")

lib/llm/LLMProvider.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ const modelToProviderMap: { [key in AvailableModel]: ModelProvider } = {
9494
"gemini-2.5-pro-preview-03-25": "google",
9595
};
9696

97-
function getAISDKLanguageModel(
97+
export function getAISDKLanguageModel(
9898
subProvider: string,
9999
subModelName: string,
100100
apiKey?: string,

0 commit comments

Comments
 (0)