* feat: add anthropic models to evals
* chore: CI fixes
* chore: remove unused import and delete one model to avoid rate limits from anthropic
* chore: use 3.5-sonnet
* add gemini evals
* chore: syncpack things
* chore: update types
* feat: working gemini evals
* chore: remove anthropic eval for now
* chore: fix formatting
* feat: update openai models to use ai wholesaling
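The commits above add Anthropic and Gemini models to the eval matrix. As a rough sketch of what a Gemini entry could look like with the AI SDK's Google provider (the `getGoogleAiModel` helper and the model id are illustrative assumptions, not code from this PR):

```ts
import { google } from '@ai-sdk/google'
import type { LanguageModel } from 'ai'

// Hypothetical helper mirroring the getOpenAiModel/getAnthropicModel shape used further down.
// The provider reads GOOGLE_GENERATIVE_AI_API_KEY from the environment.
function getGoogleAiModel(modelName: string): { modelName: string; model: LanguageModel } {
  return { modelName, model: google(modelName) }
}

// Example entry that could be appended to the describe.each model matrix.
export const geminiModel = getGoogleAiModel('gemini-1.5-flash')
```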
"You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
@@ -53,15 +53,10 @@ export async function runTask(
     maxSteps: 10,
   })
 
-  // we need to consume the full stream, so this is empty
-  // eslint-disable-next-line no-empty
-  for await (const _ of res.fullStream) {
-  }
-
   // convert into an LLM readable result so our factuality checker can validate tool calls
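For context, the deleted lines are the usual pattern for draining a `streamText` `fullStream` so that the multi-step tool-calling loop runs to completion before the result is read. A minimal sketch of that pattern, assuming a recent Vercel AI SDK where `streamText` returns its result synchronously; the tool, model choice, and function name are placeholders rather than this repo's code:

```ts
import { openai } from '@ai-sdk/openai'
import { streamText, tool } from 'ai'
import { z } from 'zod'

// Placeholder tool so the sketch is self-contained.
const weather = tool({
  description: 'Get the weather for a city',
  parameters: z.object({ city: z.string() }),
  execute: async ({ city }) => ({ city, tempC: 21 }),
})

export async function runTaskSketch(userQuery: string) {
  const res = streamText({
    model: openai('gpt-4o-mini'),
    system:
      "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
    prompt: userQuery,
    tools: { weather },
    maxSteps: 10,
  })

  // Draining fullStream forces every step, including tool calls, to execute;
  // the loop body is intentionally empty because only the final state matters.
  // eslint-disable-next-line no-empty
  for await (const _ of res.fullStream) {
  }

  // The aggregated outputs are exposed as promises on the result object.
  return { text: await res.text, toolCalls: await res.toolCalls }
}
```

Switching to `generateText` avoids the drain entirely, since it only resolves after all steps have finished, which may be why the loop could be removed here.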
@@ -40,11 +87,10 @@ function getWorkersAiModel(modelName: AiTextGenerationModels) {
 export const eachModel = describe.each([
   getOpenAiModel('gpt-4o'),
   getOpenAiModel('gpt-4o-mini'),
-
+  // getAnthropicModel('claude-3-5-sonnet-20241022'), TODO: The evals pass with anthropic, but our rate limit is so low with AI wholesaling that we can't use it in CI because it's impossible to get a complete run with the current limits
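The `eachModel` export drives the same eval suite across every configured model via vitest's `describe.each`. A minimal sketch of how the helpers above might be shaped, assuming AI SDK provider packages; the `EachModelCase` type and helper bodies are illustrative, not this repo's implementation, and the diff's `getWorkersAiModel` helper is omitted:

```ts
import { describe } from 'vitest'
import { openai } from '@ai-sdk/openai'
import { anthropic } from '@ai-sdk/anthropic'
import type { LanguageModel } from 'ai'

interface EachModelCase {
  modelName: string
  model: LanguageModel
}

// Hypothetical helpers in the shape the diff suggests.
function getOpenAiModel(modelName: string): EachModelCase {
  return { modelName, model: openai(modelName) }
}

function getAnthropicModel(modelName: string): EachModelCase {
  return { modelName, model: anthropic(modelName) }
}

// Each array entry becomes its own describe block, so every model
// runs through an identical set of eval tests.
export const eachModel = describe.each([
  getOpenAiModel('gpt-4o'),
  getOpenAiModel('gpt-4o-mini'),
  // getAnthropicModel('claude-3-5-sonnet-20241022'), // re-enable once Anthropic rate limits allow a full CI run
])

// Usage: eachModel('$modelName', ({ model }) => { /* run the eval tasks against `model` */ })
```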