diff --git a/.changeset/chatty-pans-create.md b/.changeset/chatty-pans-create.md new file mode 100644 index 000000000..45c08a23b --- /dev/null +++ b/.changeset/chatty-pans-create.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand-evals": patch +--- + +Parameterize execution model in agent evals diff --git a/.env.example b/.env.example index b31704e32..792df1fc5 100644 --- a/.env.example +++ b/.env.example @@ -11,3 +11,4 @@ EXPERIMENTAL_EVAL_MODELS="gpt-4o,claude-3-5-sonnet-latest,o1-mini,o1-preview" EVAL_CATEGORIES="observe,act,combination,extract,experimental" AGENT_EVAL_MAX_STEPS=50 STAGEHAND_API_URL="http://localhost:80" +AGENT_EVAL_EXECUTION_MODEL="google/gemini-2.5-flash" \ No newline at end of file diff --git a/evals/initStagehand.ts b/evals/initStagehand.ts index 519636fdc..f7b21d685 100644 --- a/evals/initStagehand.ts +++ b/evals/initStagehand.ts @@ -115,8 +115,10 @@ export const initStagehand = async ({ provider: modelName.startsWith("claude") ? "anthropic" : "openai", } as AgentConfig; } else { + const executionModelFromEnv = process.env.AGENT_EVAL_EXECUTION_MODEL; agentConfig = { model: modelName, + executionModel: executionModelFromEnv ?? "google/gemini-2.5-flash", } as AgentConfig; }