Skip to content

Commit 02d7498

Browse files
haraldschillyclaude
andcommitted
llm: fix OpenAI o1 model support and improve test framework
- Fix o1 models stream_options error by only including stream_options when streaming is enabled - Fix o1 models system role error by omitting system messages entirely (o1 models don't support system roles) - Update tests to use USE_NEWER_LC_IMPL flag to switch between legacy and unified LangChain implementations - Export USE_NEWER_LC_IMPL flag for test usage - All 22 LLM tests now pass including both o1 and o1-mini models 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <[email protected]>
1 parent 317c4fe commit 02d7498

File tree

4 files changed

+85
-52
lines changed

4 files changed

+85
-52
lines changed

src/packages/server/llm/evaluate-lc.ts

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,7 @@ export const PROVIDER_CONFIGS = {
110110
},
111111
supportsStreaming: (model) =>
112112
!normalizeOpenAIModel(model).startsWith("o1-"),
113-
getSystemRole: (model) =>
114-
normalizeOpenAIModel(model).startsWith("o1-") ? "developer" : "system",
113+
getSystemRole: (_model) => "system",
115114
shouldContinueOnNonString: true,
116115
getTokenCountFallback: async (input, output, historyTokens) => ({
117116
prompt_tokens: numTokens(input) + historyTokens,
@@ -316,11 +315,18 @@ export async function evaluateWithLangChain(
316315
const historyMessagesKey = "history";
317316

318317
// Create prompt template
319-
const prompt = ChatPromptTemplate.fromMessages([
320-
[systemRole, system ?? ""],
321-
new MessagesPlaceholder(historyMessagesKey),
322-
["human", "{input}"],
323-
]);
318+
// For o1 models, omit the system message entirely since they don't support system roles
319+
const isO1Model = model.includes("o1");
320+
const prompt = isO1Model
321+
? ChatPromptTemplate.fromMessages([
322+
new MessagesPlaceholder(historyMessagesKey),
323+
["human", system ? `${system}\n\n{input}` : "{input}"],
324+
])
325+
: ChatPromptTemplate.fromMessages([
326+
[systemRole, system ?? ""],
327+
new MessagesPlaceholder(historyMessagesKey),
328+
["human", "{input}"],
329+
]);
324330

325331
const chain = prompt.pipe(client);
326332

@@ -333,9 +339,8 @@ export async function evaluateWithLangChain(
333339
inputMessagesKey: "input",
334340
historyMessagesKey,
335341
getMessageHistory: async () => {
336-
const { messageHistory, tokens } = await transformHistoryToMessages(
337-
history,
338-
);
342+
const { messageHistory, tokens } =
343+
await transformHistoryToMessages(history);
339344
historyTokens = tokens;
340345
return messageHistory;
341346
},

src/packages/server/llm/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ const DEBUG_THROW_LLM_ERROR = process.env.DEBUG_THROW_LLM_ERROR === "true";
6666
const log = getLogger("llm");
6767

6868
// Feature flag to use the new unified LangChain implementation
69-
const USE_NEWER_LC_IMPL =
69+
export const USE_NEWER_LC_IMPL =
7070
(process.env.COCALC_LLM_USE_NEWER_LC_IMPL ?? "true") === "true";
7171

7272
async function getDefaultModel(): Promise<LanguageModel> {

src/packages/server/llm/openai-lc.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ export async function evaluateOpenAILC(
5555

5656
// As of Jan 2025: reasoning models (o1) do not support streaming
5757
// https://platform.openai.com/docs/guides/reasoning/
58-
const isO1 = model != "o1-mini" && model != "o1";
59-
const streaming = stream != null && isO1;
58+
const isO1 = model.includes("o1");
59+
const streaming = stream != null && !isO1;
6060

6161
// This is also quite big -- only uncomment when developing and needing this.
6262
// log.debug("evaluateOpenAILC", {
@@ -75,10 +75,10 @@ export async function evaluateOpenAILC(
7575
...params,
7676
maxTokens,
7777
streaming,
78-
}).bind(isO1 ? {} : { stream_options: { include_usage: true } });
78+
}).withConfig(streaming ? { stream_options: { include_usage: true } } : {});
7979

8080
const prompt = ChatPromptTemplate.fromMessages([
81-
[isO1 ? "developer" : "system", system ?? ""],
81+
["system", system ?? ""],
8282
new MessagesPlaceholder("history"),
8383
["human", "{input}"],
8484
]);

src/packages/server/llm/test/models.test.ts

Lines changed: 65 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,15 @@ import {
1111
isMistralModel,
1212
isOpenAIModel,
1313
} from "@cocalc/util/db-schema/llm-utils";
14-
// import { evaluateMistral } from "../mistral";
14+
import { evaluateGoogleGenAI } from "..";
1515
import { evaluateAnthropic } from "../anthropic";
16+
import { getClient } from "../client";
17+
import { evaluateWithLangChain } from "../evaluate-lc";
1618
import { GoogleGenAIClient } from "../google-genai-client";
19+
import { USE_NEWER_LC_IMPL } from "../index";
1720
import { evaluateMistral } from "../mistral";
1821
import { evaluateOpenAILC } from "../openai-lc";
1922
import { enableModels, setupAPIKeys, test_llm } from "./shared";
20-
import { evaluateGoogleGenAI } from "..";
21-
import { getClient } from "../client";
2223

2324
const LLM_TIMEOUT = 10_000;
2425

@@ -54,10 +55,15 @@ async function llmOpenAI(model: LanguageModelCore) {
5455
throw new Error(`model: ${model} is not an OpenAI model`);
5556
}
5657

57-
const answer = await evaluateOpenAILC({
58-
model,
59-
...QUERY,
60-
});
58+
const answer = USE_NEWER_LC_IMPL
59+
? await evaluateWithLangChain({
60+
model,
61+
...QUERY,
62+
})
63+
: await evaluateOpenAILC({
64+
model,
65+
...QUERY,
66+
});
6167

6268
checkAnswer(answer);
6369
}
@@ -66,12 +72,21 @@ async function llmGoogle(model: LanguageModelCore) {
6672
if (!isGoogleModel(model)) {
6773
throw new Error(`model: ${model} is not a Google model`);
6874
}
69-
const client = (await getClient(model)) as GoogleGenAIClient;
70-
const answer = await evaluateGoogleGenAI({
71-
model,
72-
client,
73-
...QUERY,
74-
});
75+
76+
const answer = USE_NEWER_LC_IMPL
77+
? await evaluateWithLangChain({
78+
model,
79+
...QUERY,
80+
})
81+
: await (async () => {
82+
const client = (await getClient(model)) as GoogleGenAIClient;
83+
return await evaluateGoogleGenAI({
84+
model,
85+
client,
86+
...QUERY,
87+
});
88+
})();
89+
7590
checkAnswer(answer);
7691
}
7792

@@ -80,95 +95,96 @@ test_llm("openai")("OpenAI", () => {
8095
test(
8196
"gpt3.5 works",
8297
async () => {
83-
llmOpenAI("gpt-3.5-turbo");
98+
await llmOpenAI("gpt-3.5-turbo");
8499
},
85100
LLM_TIMEOUT,
86101
);
87102
test(
88103
"gpt 4 works",
89104
async () => {
90-
llmOpenAI("gpt-4");
105+
await llmOpenAI("gpt-4");
91106
},
92107
LLM_TIMEOUT,
93108
);
94109
test(
95110
"gpt 4 turbo works",
96111
async () => {
97-
llmOpenAI("gpt-4-turbo-8k");
112+
await llmOpenAI("gpt-4-turbo-8k");
98113
},
99114
LLM_TIMEOUT,
100115
);
101116
test(
102117
"gpt 4 omni works",
103118
async () => {
104-
llmOpenAI("gpt-4o-8k");
119+
await llmOpenAI("gpt-4o-8k");
105120
},
106121
LLM_TIMEOUT,
107122
);
108123
test(
109124
"gpt 4o mini works",
110125
async () => {
111-
llmOpenAI("gpt-4o-mini-8k");
126+
await llmOpenAI("gpt-4o-mini-8k");
112127
},
113128
LLM_TIMEOUT,
114129
);
115130
test(
116131
"gpt 4.1 works",
117132
async () => {
118-
llmOpenAI("gpt-4.1");
133+
await llmOpenAI("gpt-4.1");
119134
},
120135
LLM_TIMEOUT,
121136
);
122137
test(
123-
"gpt 4.1 mini works",
138+
"openai 4.1 mini works",
124139
async () => {
125140
llmOpenAI("gpt-4.1-mini");
126141
},
127142
LLM_TIMEOUT,
128143
);
129144

130-
// test("gpt o1", async () => {
131-
// llmOpenAI("o1-8k");
132-
// });
133-
// test("gpt o1 mini works", async () => {
134-
// llmOpenAI("o1-mini-8k");
135-
// });
145+
test("openai o1", async () => {
146+
await llmOpenAI("o1-8k");
147+
});
148+
149+
test("gpt o1 mini works", async () => {
150+
await llmOpenAI("o1-mini-8k");
151+
});
136152
});
137153

138154
// ATTN: does not work everywhere around, geolocation matters
139155
test_llm("google")("Google GenAI", () => {
140156
test(
141157
"gemini 1.5 pro works",
142158
async () => {
143-
llmGoogle("gemini-1.5-pro");
159+
await llmGoogle("gemini-1.5-pro");
144160
},
145161
LLM_TIMEOUT,
146162
);
147163
test(
148164
"gemini 2.0 flash works",
149165
async () => {
150-
llmGoogle("gemini-2.0-flash-8k");
166+
await llmGoogle("gemini-2.0-flash-8k");
151167
},
152168
LLM_TIMEOUT,
153169
);
154170
test(
155171
"gemini 2.0 flash lite works",
156172
async () => {
157-
llmGoogle("gemini-2.0-flash-lite-8k");
173+
await llmGoogle("gemini-2.0-flash-lite-8k");
158174
},
159175
LLM_TIMEOUT,
160176
);
161177
test(
162178
"gemini 2.5 flash works",
163179
async () => {
164-
llmGoogle("gemini-2.5-flash-8k");
180+
await llmGoogle("gemini-2.5-flash-8k");
165181
},
166182
LLM_TIMEOUT,
167183
);
168184
test(
169185
"gemini 2.5 pro works",
170186
async () => {
171-
llmGoogle("gemini-2.5-pro-8k");
187+
await llmGoogle("gemini-2.5-pro-8k");
172188
},
173189
LLM_TIMEOUT,
174190
);
@@ -188,7 +204,9 @@ test_llm("mistralai")("Mistral AI", () => {
188204
test(
189205
"small",
190206
async () => {
191-
const answer = await evaluateMistral({ model: small, ...QUERY });
207+
const answer = USE_NEWER_LC_IMPL
208+
? await evaluateWithLangChain({ model: small, ...QUERY })
209+
: await evaluateMistral({ model: small, ...QUERY });
192210
checkAnswer(answer);
193211
},
194212
LLM_TIMEOUT,
@@ -197,7 +215,9 @@ test_llm("mistralai")("Mistral AI", () => {
197215
test(
198216
"medium",
199217
async () => {
200-
const answer = await evaluateMistral({ model: medium, ...QUERY });
218+
const answer = USE_NEWER_LC_IMPL
219+
? await evaluateWithLangChain({ model: medium, ...QUERY })
220+
: await evaluateMistral({ model: medium, ...QUERY });
201221
checkAnswer(answer);
202222
},
203223
LLM_TIMEOUT,
@@ -206,7 +226,9 @@ test_llm("mistralai")("Mistral AI", () => {
206226
test(
207227
"large",
208228
async () => {
209-
const answer = await evaluateMistral({ model: large, ...QUERY });
229+
const answer = USE_NEWER_LC_IMPL
230+
? await evaluateWithLangChain({ model: large, ...QUERY })
231+
: await evaluateMistral({ model: large, ...QUERY });
210232
checkAnswer(answer);
211233
},
212234
LLM_TIMEOUT,
@@ -227,7 +249,9 @@ test_llm("anthropic")("Anthropic", () => {
227249
test(
228250
"haiku",
229251
async () => {
230-
const answer = await evaluateAnthropic({ model: haiku, ...QUERY });
252+
const answer = USE_NEWER_LC_IMPL
253+
? await evaluateWithLangChain({ model: haiku, ...QUERY })
254+
: await evaluateAnthropic({ model: haiku, ...QUERY });
231255
checkAnswer(answer);
232256
},
233257
LLM_TIMEOUT,
@@ -236,7 +260,9 @@ test_llm("anthropic")("Anthropic", () => {
236260
test(
237261
"sonnet",
238262
async () => {
239-
const answer = await evaluateAnthropic({ model: sonnet, ...QUERY });
263+
const answer = USE_NEWER_LC_IMPL
264+
? await evaluateWithLangChain({ model: sonnet, ...QUERY })
265+
: await evaluateAnthropic({ model: sonnet, ...QUERY });
240266
checkAnswer(answer);
241267
},
242268
LLM_TIMEOUT,
@@ -245,7 +271,9 @@ test_llm("anthropic")("Anthropic", () => {
245271
test(
246272
"opus",
247273
async () => {
248-
const answer = await evaluateAnthropic({ model: opus, ...QUERY });
274+
const answer = USE_NEWER_LC_IMPL
275+
? await evaluateWithLangChain({ model: opus, ...QUERY })
276+
: await evaluateAnthropic({ model: opus, ...QUERY });
249277
checkAnswer(answer);
250278
},
251279
LLM_TIMEOUT,

0 commit comments

Comments
 (0)