llm: fix OpenAI o1 model support and improve test framework

haraldschilly · claude · haraldschilly · commit 02d7498a04c6 · 2025-07-18T19:45:26.000+02:00
- Fix o1 models stream_options error by only including stream_options when streaming is enabled - Fix o1 models system role error by omitting system messages entirely (o1 models don't support system roles) - Update tests to use USE_NEWER_LC_IMPL flag to switch between legacy and unified LangChain implementations - Export USE_NEWER_LC_IMPL flag for test usage - All 22 LLM tests now pass including both o1 and o1-mini models 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/src/packages/server/llm/evaluate-lc.ts b/src/packages/server/llm/evaluate-lc.ts
@@ -110,8 +110,7 @@ export const PROVIDER_CONFIGS = {
     },
     supportsStreaming: (model) =>
       !normalizeOpenAIModel(model).startsWith("o1-"),
-    getSystemRole: (model) =>
-      normalizeOpenAIModel(model).startsWith("o1-") ? "developer" : "system",
+    getSystemRole: (_model) => "system",
     shouldContinueOnNonString: true,
     getTokenCountFallback: async (input, output, historyTokens) => ({
       prompt_tokens: numTokens(input) + historyTokens,
@@ -316,11 +315,18 @@ export async function evaluateWithLangChain(
   const historyMessagesKey = "history";
 
   // Create prompt template
-  const prompt = ChatPromptTemplate.fromMessages([
-    [systemRole, system ?? ""],
-    new MessagesPlaceholder(historyMessagesKey),
-    ["human", "{input}"],
-  ]);
+  // For o1 models, omit the system message entirely since they don't support system roles
+  const isO1Model = model.includes("o1");
+  const prompt = isO1Model
+    ? ChatPromptTemplate.fromMessages([
+        new MessagesPlaceholder(historyMessagesKey),
+        ["human", system ? `${system}\n\n{input}` : "{input}"],
+      ])
+    : ChatPromptTemplate.fromMessages([
+        [systemRole, system ?? ""],
+        new MessagesPlaceholder(historyMessagesKey),
+        ["human", "{input}"],
+      ]);
 
   const chain = prompt.pipe(client);
 
@@ -333,9 +339,8 @@ export async function evaluateWithLangChain(
     inputMessagesKey: "input",
     historyMessagesKey,
     getMessageHistory: async () => {
-      const { messageHistory, tokens } = await transformHistoryToMessages(
-        history,
-      );
+      const { messageHistory, tokens } =
+        await transformHistoryToMessages(history);
       historyTokens = tokens;
       return messageHistory;
     },
diff --git a/src/packages/server/llm/index.ts b/src/packages/server/llm/index.ts
@@ -66,7 +66,7 @@ const DEBUG_THROW_LLM_ERROR = process.env.DEBUG_THROW_LLM_ERROR === "true";
 const log = getLogger("llm");
 
 // Feature flag to use the new unified LangChain implementation
-const USE_NEWER_LC_IMPL =
+export const USE_NEWER_LC_IMPL =
   (process.env.COCALC_LLM_USE_NEWER_LC_IMPL ?? "true") === "true";
 
 async function getDefaultModel(): Promise<LanguageModel> {
diff --git a/src/packages/server/llm/openai-lc.ts b/src/packages/server/llm/openai-lc.ts
@@ -55,8 +55,8 @@ export async function evaluateOpenAILC(
 
   // As of Jan 2025: reasoning models (o1) do not support streaming
   // https://platform.openai.com/docs/guides/reasoning/
-  const isO1 = model != "o1-mini" && model != "o1";
-  const streaming = stream != null && isO1;
+  const isO1 = model.includes("o1");
+  const streaming = stream != null && !isO1;
 
   // This is also quite big -- only uncomment when developing and needing this.
   //   log.debug("evaluateOpenAILC", {
@@ -75,10 +75,10 @@ export async function evaluateOpenAILC(
     ...params,
     maxTokens,
     streaming,
-  }).bind(isO1 ? {} : { stream_options: { include_usage: true } });
+  }).withConfig(streaming ? { stream_options: { include_usage: true } } : {});
 
   const prompt = ChatPromptTemplate.fromMessages([
-    [isO1 ? "developer" : "system", system ?? ""],
+    ["system", system ?? ""],
     new MessagesPlaceholder("history"),
     ["human", "{input}"],
   ]);
diff --git a/src/packages/server/llm/test/models.test.ts b/src/packages/server/llm/test/models.test.ts
@@ -11,14 +11,15 @@ import {
   isMistralModel,
   isOpenAIModel,
 } from "@cocalc/util/db-schema/llm-utils";
-// import { evaluateMistral } from "../mistral";
+import { evaluateGoogleGenAI } from "..";
 import { evaluateAnthropic } from "../anthropic";
+import { getClient } from "../client";
+import { evaluateWithLangChain } from "../evaluate-lc";
 import { GoogleGenAIClient } from "../google-genai-client";
+import { USE_NEWER_LC_IMPL } from "../index";
 import { evaluateMistral } from "../mistral";
 import { evaluateOpenAILC } from "../openai-lc";
 import { enableModels, setupAPIKeys, test_llm } from "./shared";
-import { evaluateGoogleGenAI } from "..";
-import { getClient } from "../client";
 
 const LLM_TIMEOUT = 10_000;
 
@@ -54,10 +55,15 @@ async function llmOpenAI(model: LanguageModelCore) {
     throw new Error(`model: ${model} is not an OpenAI model`);
   }
 
-  const answer = await evaluateOpenAILC({
-    model,
-    ...QUERY,
-  });
+  const answer = USE_NEWER_LC_IMPL
+    ? await evaluateWithLangChain({
+        model,
+        ...QUERY,
+      })
+    : await evaluateOpenAILC({
+        model,
+        ...QUERY,
+      });
 
   checkAnswer(answer);
 }
@@ -66,12 +72,21 @@ async function llmGoogle(model: LanguageModelCore) {
   if (!isGoogleModel(model)) {
     throw new Error(`model: ${model} is not a Google model`);
   }
-  const client = (await getClient(model)) as GoogleGenAIClient;
-  const answer = await evaluateGoogleGenAI({
-    model,
-    client,
-    ...QUERY,
-  });
+
+  const answer = USE_NEWER_LC_IMPL
+    ? await evaluateWithLangChain({
+        model,
+        ...QUERY,
+      })
+    : await (async () => {
+        const client = (await getClient(model)) as GoogleGenAIClient;
+        return await evaluateGoogleGenAI({
+          model,
+          client,
+          ...QUERY,
+        });
+      })();
+
   checkAnswer(answer);
 }
 
@@ -80,95 +95,96 @@ test_llm("openai")("OpenAI", () => {
   test(
     "gpt3.5 works",
     async () => {
-      llmOpenAI("gpt-3.5-turbo");
+      await llmOpenAI("gpt-3.5-turbo");
     },
     LLM_TIMEOUT,
   );
   test(
     "gpt 4 works",
     async () => {
-      llmOpenAI("gpt-4");
+      await llmOpenAI("gpt-4");
     },
     LLM_TIMEOUT,
   );
   test(
     "gpt 4 turbo works",
     async () => {
-      llmOpenAI("gpt-4-turbo-8k");
+      await llmOpenAI("gpt-4-turbo-8k");
     },
     LLM_TIMEOUT,
   );
   test(
     "gpt 4 omni works",
     async () => {
-      llmOpenAI("gpt-4o-8k");
+      await llmOpenAI("gpt-4o-8k");
     },
     LLM_TIMEOUT,
   );
   test(
     "gpt 4o mini works",
     async () => {
-      llmOpenAI("gpt-4o-mini-8k");
+      await llmOpenAI("gpt-4o-mini-8k");
     },
     LLM_TIMEOUT,
   );
   test(
     "gpt 4.1 works",
     async () => {
-      llmOpenAI("gpt-4.1");
+      await llmOpenAI("gpt-4.1");
     },
     LLM_TIMEOUT,
   );
   test(
-    "gpt 4.1 mini works",
+    "openai 4.1 mini works",
     async () => {
       llmOpenAI("gpt-4.1-mini");
     },
     LLM_TIMEOUT,
   );
 
-  // test("gpt o1", async () => {
-  //   llmOpenAI("o1-8k");
-  // });
-  // test("gpt o1 mini works", async () => {
-  //   llmOpenAI("o1-mini-8k");
-  // });
+  test("openai o1", async () => {
+    await llmOpenAI("o1-8k");
+  });
+
+  test("gpt o1 mini works", async () => {
+    await llmOpenAI("o1-mini-8k");
+  });
 });
 
 // ATTN: does not work everywhere around, geolocation matters
 test_llm("google")("Google GenAI", () => {
   test(
     "gemini 1.5 pro works",
     async () => {
-      llmGoogle("gemini-1.5-pro");
+      await llmGoogle("gemini-1.5-pro");
     },
     LLM_TIMEOUT,
   );
   test(
     "gemini 2.0 flash works",
     async () => {
-      llmGoogle("gemini-2.0-flash-8k");
+      await llmGoogle("gemini-2.0-flash-8k");
     },
     LLM_TIMEOUT,
   );
   test(
     "gemini 2.0 flash lite works",
     async () => {
-      llmGoogle("gemini-2.0-flash-lite-8k");
+      await llmGoogle("gemini-2.0-flash-lite-8k");
     },
     LLM_TIMEOUT,
   );
   test(
     "gemini 2.5 flash works",
     async () => {
-      llmGoogle("gemini-2.5-flash-8k");
+      await llmGoogle("gemini-2.5-flash-8k");
     },
     LLM_TIMEOUT,
   );
   test(
     "gemini 2.5 pro works",
     async () => {
-      llmGoogle("gemini-2.5-pro-8k");
+      await llmGoogle("gemini-2.5-pro-8k");
     },
     LLM_TIMEOUT,
   );
@@ -188,7 +204,9 @@ test_llm("mistralai")("Mistral AI", () => {
   test(
     "small",
     async () => {
-      const answer = await evaluateMistral({ model: small, ...QUERY });
+      const answer = USE_NEWER_LC_IMPL
+        ? await evaluateWithLangChain({ model: small, ...QUERY })
+        : await evaluateMistral({ model: small, ...QUERY });
       checkAnswer(answer);
     },
     LLM_TIMEOUT,
@@ -197,7 +215,9 @@ test_llm("mistralai")("Mistral AI", () => {
   test(
     "medium",
     async () => {
-      const answer = await evaluateMistral({ model: medium, ...QUERY });
+      const answer = USE_NEWER_LC_IMPL
+        ? await evaluateWithLangChain({ model: medium, ...QUERY })
+        : await evaluateMistral({ model: medium, ...QUERY });
       checkAnswer(answer);
     },
     LLM_TIMEOUT,
@@ -206,7 +226,9 @@ test_llm("mistralai")("Mistral AI", () => {
   test(
     "large",
     async () => {
-      const answer = await evaluateMistral({ model: large, ...QUERY });
+      const answer = USE_NEWER_LC_IMPL
+        ? await evaluateWithLangChain({ model: large, ...QUERY })
+        : await evaluateMistral({ model: large, ...QUERY });
       checkAnswer(answer);
     },
     LLM_TIMEOUT,
@@ -227,7 +249,9 @@ test_llm("anthropic")("Anthropic", () => {
   test(
     "haiku",
     async () => {
-      const answer = await evaluateAnthropic({ model: haiku, ...QUERY });
+      const answer = USE_NEWER_LC_IMPL
+        ? await evaluateWithLangChain({ model: haiku, ...QUERY })
+        : await evaluateAnthropic({ model: haiku, ...QUERY });
       checkAnswer(answer);
     },
     LLM_TIMEOUT,
@@ -236,7 +260,9 @@ test_llm("anthropic")("Anthropic", () => {
   test(
     "sonnet",
     async () => {
-      const answer = await evaluateAnthropic({ model: sonnet, ...QUERY });
+      const answer = USE_NEWER_LC_IMPL
+        ? await evaluateWithLangChain({ model: sonnet, ...QUERY })
+        : await evaluateAnthropic({ model: sonnet, ...QUERY });
       checkAnswer(answer);
     },
     LLM_TIMEOUT,
@@ -245,7 +271,9 @@ test_llm("anthropic")("Anthropic", () => {
   test(
     "opus",
     async () => {
-      const answer = await evaluateAnthropic({ model: opus, ...QUERY });
+      const answer = USE_NEWER_LC_IMPL
+        ? await evaluateWithLangChain({ model: opus, ...QUERY })
+        : await evaluateAnthropic({ model: opus, ...QUERY });
       checkAnswer(answer);
     },
     LLM_TIMEOUT,